@machinespirits/eval 0.1.2 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +161 -0
- package/config/eval-settings.yaml +18 -0
- package/config/evaluation-rubric-learner.yaml +277 -0
- package/config/evaluation-rubric.yaml +613 -0
- package/config/interaction-eval-scenarios.yaml +93 -50
- package/config/learner-agents.yaml +124 -193
- package/config/machinespirits-eval.code-workspace +11 -0
- package/config/providers.yaml +60 -0
- package/config/suggestion-scenarios.yaml +1399 -0
- package/config/tutor-agents.yaml +716 -0
- package/docs/EVALUATION-VARIABLES.md +589 -0
- package/docs/REPLICATION-PLAN.md +577 -0
- package/index.js +15 -6
- package/package.json +16 -22
- package/routes/evalRoutes.js +88 -36
- package/scripts/analyze-judge-reliability.js +401 -0
- package/scripts/analyze-run.js +97 -0
- package/scripts/analyze-run.mjs +282 -0
- package/scripts/analyze-validation-failures.js +141 -0
- package/scripts/check-run.mjs +17 -0
- package/scripts/code-impasse-strategies.js +1132 -0
- package/scripts/compare-runs.js +44 -0
- package/scripts/compare-suggestions.js +80 -0
- package/scripts/compare-transformation.js +116 -0
- package/scripts/dig-into-run.js +158 -0
- package/scripts/eval-cli.js +2626 -0
- package/scripts/generate-paper-figures.py +452 -0
- package/scripts/qualitative-analysis-ai.js +1313 -0
- package/scripts/qualitative-analysis.js +688 -0
- package/scripts/seed-db.js +87 -0
- package/scripts/show-failed-suggestions.js +64 -0
- package/scripts/validate-content.js +192 -0
- package/server.js +3 -2
- package/services/__tests__/evalConfigLoader.test.js +338 -0
- package/services/anovaStats.js +499 -0
- package/services/contentResolver.js +407 -0
- package/services/dialogueTraceAnalyzer.js +454 -0
- package/services/evalConfigLoader.js +625 -0
- package/services/evaluationRunner.js +2171 -270
- package/services/evaluationStore.js +564 -29
- package/services/learnerConfigLoader.js +75 -5
- package/services/learnerRubricEvaluator.js +284 -0
- package/services/learnerTutorInteractionEngine.js +375 -0
- package/services/processUtils.js +18 -0
- package/services/progressLogger.js +98 -0
- package/services/promptRecommendationService.js +31 -26
- package/services/promptRewriter.js +427 -0
- package/services/rubricEvaluator.js +543 -70
- package/services/streamingReporter.js +104 -0
- package/services/turnComparisonAnalyzer.js +494 -0
- package/components/MobileEvalDashboard.tsx +0 -267
- package/components/comparison/DeltaAnalysisTable.tsx +0 -137
- package/components/comparison/ProfileComparisonCard.tsx +0 -176
- package/components/comparison/RecognitionABMode.tsx +0 -385
- package/components/comparison/RecognitionMetricsPanel.tsx +0 -135
- package/components/comparison/WinnerIndicator.tsx +0 -64
- package/components/comparison/index.ts +0 -5
- package/components/mobile/BottomSheet.tsx +0 -233
- package/components/mobile/DimensionBreakdown.tsx +0 -210
- package/components/mobile/DocsView.tsx +0 -363
- package/components/mobile/LogsView.tsx +0 -481
- package/components/mobile/PsychodynamicQuadrant.tsx +0 -261
- package/components/mobile/QuickTestView.tsx +0 -1098
- package/components/mobile/RecognitionTypeChart.tsx +0 -124
- package/components/mobile/RecognitionView.tsx +0 -809
- package/components/mobile/RunDetailView.tsx +0 -261
- package/components/mobile/RunHistoryView.tsx +0 -367
- package/components/mobile/ScoreRadial.tsx +0 -211
- package/components/mobile/StreamingLogPanel.tsx +0 -230
- package/components/mobile/SynthesisStrategyChart.tsx +0 -140
- package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +0 -52
- package/docs/research/ABLATION-MODEL-SELECTION.md +0 -53
- package/docs/research/ADVANCED-EVAL-ANALYSIS.md +0 -60
- package/docs/research/ANOVA-RESULTS-2026-01-14.md +0 -257
- package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +0 -586
- package/docs/research/COST-ANALYSIS.md +0 -56
- package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +0 -340
- package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +0 -291
- package/docs/research/EVAL-SYSTEM-ANALYSIS.md +0 -306
- package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +0 -301
- package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +0 -1988
- package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +0 -282
- package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +0 -147
- package/docs/research/PAPER-EXTENSION-DYADIC.md +0 -204
- package/docs/research/PAPER-UNIFIED.md +0 -659
- package/docs/research/PAPER-UNIFIED.pdf +0 -0
- package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +0 -356
- package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +0 -419
- package/docs/research/apa.csl +0 -2133
- package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +0 -1637
- package/docs/research/archive/paper-multiagent-tutor.tex +0 -978
- package/docs/research/paper-draft/full-paper.md +0 -136
- package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
- package/docs/research/paper-draft/references.bib +0 -515
- package/docs/research/transcript-baseline.md +0 -139
- package/docs/research/transcript-recognition-multiagent.md +0 -187
- package/hooks/useEvalData.ts +0 -625
- package/server-init.js +0 -45
- package/services/benchmarkService.js +0 -1892
- package/types.ts +0 -165
- package/utils/haptics.ts +0 -45
|
@@ -7,10 +7,19 @@
|
|
|
7
7
|
* Uses shared configLoaderBase.js for common loading patterns.
|
|
8
8
|
*/
|
|
9
9
|
|
|
10
|
+
import fs from 'fs';
|
|
11
|
+
import path from 'path';
|
|
12
|
+
import { fileURLToPath } from 'url';
|
|
13
|
+
import yaml from 'yaml';
|
|
10
14
|
import { configLoaderBase, modelResolver } from '@machinespirits/tutor-core';
|
|
11
15
|
const { loadProviders, createConfigLoader, createPromptLoader } = configLoaderBase;
|
|
12
16
|
const { createBoundResolver } = modelResolver;
|
|
13
17
|
|
|
18
|
+
// Absolute path of this package's own config/ directory, resolved relative to
// this module's location (used for the learner-agents.yaml override).
const __filename_local = fileURLToPath(import.meta.url);
const __dirname_local = path.dirname(__filename_local);
const LOCAL_CONFIG_DIR = path.resolve(__dirname_local, '..', 'config');
|
|
22
|
+
|
|
14
23
|
// ============================================================================
|
|
15
24
|
// Default Configurations
|
|
16
25
|
// ============================================================================
|
|
@@ -95,6 +104,8 @@ function getDefaultPrompt(filename) {
|
|
|
95
104
|
|
|
96
105
|
const defaults = {
|
|
97
106
|
'unified': `You are simulating a learner's internal experience. Respond authentically to the tutor's message, showing genuine reactions including confusion, insight, frustration, or understanding.`,
|
|
107
|
+
'ego': `You represent the EGO dimension of the learner. Draft an authentic learner response based on the conversation so far — express what the learner would naturally say, including confusion, partial understanding, questions, and emotional reactions.`,
|
|
108
|
+
'superego': `You represent the SUPEREGO dimension of the learner. Critique the ego's draft response: Is it realistic for this learner's level? Does it engage meaningfully with the tutor's message? Should the learner push back, ask for clarification, or show more/less understanding?`,
|
|
98
109
|
'desire': `You represent the DESIRE dimension of a learner. Express immediate wants, frustrations, and emotional reactions.`,
|
|
99
110
|
'intellect': `You represent the INTELLECT dimension of a learner. Process information rationally, identify what makes sense and what doesn't.`,
|
|
100
111
|
'aspiration': `You represent the ASPIRATION dimension of a learner. Express goals, standards, and desire for mastery.`,
|
|
@@ -114,12 +125,55 @@ function getDefaultPrompt(filename) {
|
|
|
114
125
|
// Create Base Loaders
|
|
115
126
|
// ============================================================================
|
|
116
127
|
|
|
117
|
-
|
|
128
|
+
// Cache for the locally-loaded learner-agents.yaml, keyed on the file's mtime.
let localConfigCache = null;
let localConfigMtime = null;

/**
 * Load learner-agents.yaml from this package's local config/ directory and
 * merge the shared provider definitions into its `providers` map.
 *
 * The parsed result is cached and reused while the file's mtime is unchanged.
 * Only learner-agents.yaml's mtime is checked, so changes to the shared
 * providers are picked up only when the YAML file changes or `forceReload`
 * is set.
 *
 * @param {boolean} [forceReload=false] - Bypass the cache and re-read from disk
 * @returns {Object|null} Merged config, or null when the local file is missing
 *   or cannot be loaded (callers then fall back to tutor-core's loader)
 */
function loadLocalConfig(forceReload = false) {
  const localPath = path.join(LOCAL_CONFIG_DIR, 'learner-agents.yaml');
  try {
    const { mtimeMs } = fs.statSync(localPath);
    if (!forceReload && localConfigCache && localConfigMtime === mtimeMs) {
      return localConfigCache;
    }

    const parsed = yaml.parse(fs.readFileSync(localPath, 'utf-8'));
    if (!parsed || typeof parsed !== 'object') {
      // Empty or scalar YAML cannot act as a config object.
      return null;
    }

    // Merge shared providers (providers.yaml); shared entries win on key clashes.
    const sharedProviders = loadProviders(forceReload);
    if (sharedProviders) {
      parsed.providers = { ...parsed.providers, ...sharedProviders };
    }

    // Commit the cache only once everything succeeded, so a failure above can
    // never leave a half-built config cached under a matching mtime.
    localConfigCache = parsed;
    localConfigMtime = mtimeMs;
    return localConfigCache;
  } catch (err) {
    // A missing local file is the expected "no override" case; anything else
    // (e.g. a YAML syntax error) is worth surfacing before falling back.
    if (err?.code !== 'ENOENT') {
      console.warn(
        '[learnerConfigLoader] Failed to load local learner-agents.yaml, falling back:',
        err?.message,
      );
    }
    return null;
  }
}
|
|
155
|
+
|
|
156
|
+
const coreConfigLoader = createConfigLoader('learner-agents.yaml', getDefaultConfig);
|
|
118
157
|
const promptLoader = createPromptLoader(getDefaultPrompt);
|
|
119
158
|
|
|
120
|
-
//
|
|
121
|
-
export
|
|
122
|
-
|
|
159
|
+
/**
 * Load the learner configuration, preferring this package's local
 * learner-agents.yaml and falling back to tutor-core's loader.
 *
 * @param {boolean} [forceReload=false] - Passed through to both loaders
 * @returns {Object} Whichever config was found first
 */
export function loadConfig(forceReload = false) {
  const localConfig = loadLocalConfig(forceReload);
  if (localConfig) {
    return localConfig;
  }
  return coreConfigLoader.loadConfig(forceReload);
}
|
|
163
|
+
|
|
164
|
+
/**
 * Resolve a provider entry from the currently loaded config (local config
 * first), attaching its API key and a configured flag.
 *
 * When the provider is absent from the loaded config, the lookup is delegated
 * to tutor-core's config loader.
 *
 * @param {string} providerName - Key into the config's `providers` map
 * @returns {Object} The provider entry plus `apiKey` (read from the env var
 *   named by `api_key_env`, or '') and `isConfigured`
 */
export function getProviderConfig(providerName) {
  const config = loadConfig();
  // Null-safe on `config` too, matching getLearnerModelOverrides.
  const provider = config?.providers?.[providerName];
  if (!provider) {
    // Fall back to tutor-core's resolver
    return coreConfigLoader.getProviderConfig(providerName);
  }

  const apiKey = provider.api_key_env ? (process.env[provider.api_key_env] || '') : '';
  // The 'local' provider needs no API key; it counts as configured once it has a base_url.
  const isLocal = providerName === 'local';
  const isConfigured = isLocal ? Boolean(provider.base_url) : Boolean(apiKey);
  return { ...provider, apiKey, isConfigured };
}
|
|
123
177
|
|
|
124
178
|
// Re-export loadProviders from base
|
|
125
179
|
export { loadProviders };
|
|
@@ -157,7 +211,7 @@ export function getActiveProfile(profileName = null) {
|
|
|
157
211
|
|
|
158
212
|
/**
|
|
159
213
|
* Get architecture configuration
|
|
160
|
-
* @param {string} architectureName - Architecture name (unified,
|
|
214
|
+
* @param {string} architectureName - Architecture name (unified, ego_superego)
|
|
161
215
|
* @returns {Object} Architecture configuration with agents
|
|
162
216
|
*/
|
|
163
217
|
export function getArchitecture(architectureName) {
|
|
@@ -364,6 +418,21 @@ export function getEvaluationConfig() {
|
|
|
364
418
|
*/
|
|
365
419
|
export const resolveModel = createBoundResolver(getProviderConfig);
|
|
366
420
|
|
|
421
|
+
/**
 * Read the top-level model override keys from the loaded learner config.
 * These are lower priority than CLI flags.
 *
 * @returns {Object} { modelOverride, egoModelOverride, superegoModelOverride };
 *   each is null when the corresponding key is unset or falsy
 */
export function getLearnerModelOverrides() {
  const config = loadConfig();
  const readOverride = (yamlKey) => config?.[yamlKey] || null;

  return {
    modelOverride: readOverride('model_override'),
    egoModelOverride: readOverride('ego_model_override'),
    superegoModelOverride: readOverride('superego_model_override'),
  };
}
|
|
435
|
+
|
|
367
436
|
export default {
|
|
368
437
|
loadConfig,
|
|
369
438
|
loadProviders,
|
|
@@ -382,4 +451,5 @@ export default {
|
|
|
382
451
|
listArchitectures,
|
|
383
452
|
getLoggingConfig,
|
|
384
453
|
getEvaluationConfig,
|
|
454
|
+
getLearnerModelOverrides,
|
|
385
455
|
};
|
|
@@ -0,0 +1,284 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Learner Rubric Evaluator Service
|
|
3
|
+
*
|
|
4
|
+
* Builds evaluation prompts for scoring learner turns in multi-turn dialogues
|
|
5
|
+
* using the learner-side rubric (config/evaluation-rubric-learner.yaml).
|
|
6
|
+
*
|
|
7
|
+
* Key design decisions:
|
|
8
|
+
* - Truncates transcript at the learner's turn to prevent retrospective bias
|
|
9
|
+
* - Includes internal deliberation traces for multi-agent learners
|
|
10
|
+
* - Omits deliberation_depth dimension for single-agent (unified) learners
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import fs from 'fs';
|
|
14
|
+
import path from 'path';
|
|
15
|
+
import { fileURLToPath } from 'url';
|
|
16
|
+
import yaml from 'yaml';
|
|
17
|
+
|
|
18
|
+
// Directory containing this module, and the package directories derived from it.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
const PACKAGE_ROOT = path.resolve(__dirname, '..');
const EVAL_CONFIG_DIR = path.join(PACKAGE_ROOT, 'config');
// NOTE(review): PROMPTS_DIR is not referenced in the visible part of this file.
const PROMPTS_DIR = path.join(PACKAGE_ROOT, 'prompts');
|
|
21
|
+
|
|
22
|
+
// Parsed rubric and the mtime of the file it was parsed from.
let rubricCache = null;
let rubricMtime = null;

/**
 * Load the learner rubric YAML (config/evaluation-rubric-learner.yaml) with
 * mtime-based caching.
 *
 * @param {Object} [options]
 * @param {boolean} [options.forceReload] - Re-read the file even if its mtime is unchanged
 * @returns {Object|null} Parsed rubric, or null when the file cannot be stat'ed
 * @throws Propagates read or YAML parse errors for a file that exists
 */
export function loadLearnerRubric({ forceReload } = {}) {
  const rubricPath = path.join(EVAL_CONFIG_DIR, 'evaluation-rubric-learner.yaml');

  let mtimeMs;
  try {
    mtimeMs = fs.statSync(rubricPath).mtimeMs;
  } catch (err) {
    console.warn('[learnerRubricEvaluator] Learner rubric file not found:', err.message);
    return null;
  }

  if (!forceReload && rubricCache && rubricMtime === mtimeMs) {
    return rubricCache;
  }

  const raw = fs.readFileSync(rubricPath, 'utf-8');
  const parsed = yaml.parse(raw);

  // Record the mtime only after a successful parse. Recording it earlier would
  // let a failed read/parse make the next call return the stale cache as if it
  // matched the file on disk.
  rubricCache = parsed;
  rubricMtime = mtimeMs;
  return rubricCache;
}
|
|
46
|
+
|
|
47
|
+
/**
 * Return a shallow copy of the learner rubric's dimensions. For single-agent
 * learners the deliberation_depth dimension is left out.
 *
 * @param {Object} options
 * @param {boolean} options.isMultiAgent - Whether the learner uses ego/superego architecture
 * @returns {Object} Map of dimension key → dimension config ({} when no rubric is available)
 */
export function getLearnerDimensions({ isMultiAgent = false } = {}) {
  const allDimensions = loadLearnerRubric()?.dimensions;
  if (!allDimensions) return {};

  if (isMultiAgent) {
    return { ...allDimensions };
  }

  // Rest-destructuring copies every dimension except deliberation_depth.
  const { deliberation_depth: _omitted, ...singleAgentDimensions } = allDimensions;
  return singleAgentDimensions;
}
|
|
67
|
+
|
|
68
|
+
/**
 * Calculate the overall learner score from per-dimension scores.
 *
 * Each usable dimension score (a finite number in 1-5) is weighted by the
 * rubric dimension's weight. The weighted average is then rescaled from the
 * 1-5 range to 0-100. Dimensions with a missing or invalid score, or a
 * non-numeric weight, are skipped and the remaining weights are renormalized.
 *
 * @param {Object} scores - Map of dimension → { score, reasoning } (or a bare number)
 * @param {boolean} isMultiAgent - Whether deliberation_depth is included
 * @returns {number} Overall score on 0-100 scale (0 when nothing could be scored)
 */
export function calculateLearnerOverallScore(scores, isMultiAgent = false) {
  if (!scores || typeof scores !== 'object') return 0;

  const dims = getLearnerDimensions({ isMultiAgent });

  let weightedSum = 0;
  let totalWeight = 0;

  for (const [key, dim] of Object.entries(dims)) {
    const scoreEntry = scores[key];
    if (!scoreEntry) continue;

    const score = typeof scoreEntry === 'object' ? scoreEntry.score : scoreEntry;
    // Number.isFinite also rejects NaN, which passes a typeof/range check and
    // would otherwise turn the whole result into NaN.
    if (!Number.isFinite(score) || score < 1 || score > 5) continue;

    const weight = dim?.weight;
    if (!Number.isFinite(weight)) continue;

    weightedSum += score * weight;
    totalWeight += weight;
  }

  if (totalWeight === 0) return 0;

  const weightedAvg = weightedSum / totalWeight;
  // Map 1 → 0 and 5 → 100.
  return ((weightedAvg - 1) / 4) * 100;
}
|
|
97
|
+
|
|
98
|
+
/**
 * Build the dimension criteria section for the judge prompt.
 *
 * Each dimension is rendered as a heading line (name, weight as a whole
 * percent, key), its description, and one line per scoring criterion.
 * Dimensions are separated by a blank line. A dimension without a numeric
 * weight shows "n/a" and a missing description renders as an empty line, so
 * the judge never sees "NaN%" or "undefined".
 *
 * @param {Object} dimensions - Rubric dimensions to include
 * @returns {string} Formatted criteria text
 */
function buildDimensionCriteria(dimensions) {
  return Object.entries(dimensions || {}).map(([key, dim]) => {
    const criteriaText = Object.entries(dim.criteria || {})
      .map(([score, desc]) => ` ${score}: ${desc}`)
      .join('\n');

    const weightPercent = dim.weight * 100;
    const weightLabel = Number.isFinite(weightPercent)
      ? `${weightPercent.toFixed(0)}%`
      : 'n/a';
    const description = dim.description ?? '';

    return `**${dim.name}** (weight: ${weightLabel}, key: ${key})
${description}
Criteria:
${criteriaText}`;
  }).join('\n\n');
}
|
|
115
|
+
|
|
116
|
+
/**
 * Build a truncated transcript up to and including the learner turn being evaluated.
 * Does NOT include subsequent tutor responses to prevent retrospective bias.
 *
 * Each included turn contributes a "[Turn N, ROLE]" label line, its external
 * message (or "(no message)"), and a blank line. Any turn whose phase is not
 * 'learner' is labelled TUTOR. An index past the end of `turns` is clamped to
 * the last turn instead of dereferencing a missing entry.
 *
 * @param {Array} turns - All turns from the interaction
 * @param {number} targetTurnIndex - Index (in the turns array) of the learner turn to evaluate
 * @returns {string} Formatted transcript ('' when there is nothing to show)
 */
function buildTruncatedTranscript(turns, targetTurnIndex) {
  if (!Array.isArray(turns)) return '';

  const lastIndex = Math.min(targetTurnIndex, turns.length - 1);
  const lines = [];

  for (let i = 0; i <= lastIndex; i++) {
    const turn = turns[i];
    if (!turn) continue; // tolerate holes / null entries

    const role = turn.phase === 'learner' ? 'LEARNER' : 'TUTOR';
    lines.push(`[Turn ${turn.turnNumber}, ${role}]`);
    lines.push(turn.externalMessage || '(no message)');
    lines.push('');
  }

  return lines.join('\n');
}
|
|
139
|
+
|
|
140
|
+
// Display labels for known deliberation roles. A Map is used so that role
// strings matching inherited Object properties (e.g. 'constructor') are not
// mistaken for known roles. Built once rather than on every step.
const DELIBERATION_ROLE_LABELS = new Map([
  ['ego_initial', 'Ego (initial reaction)'],
  ['superego', 'Superego (critique)'],
  ['ego_revision', 'Ego (revision — final authority)'],
  ['synthesis', 'Synthesis (unified process)'],
  ['ego', 'Ego'],
]);

/**
 * Format internal deliberation trace for display in the judge prompt.
 *
 * Each step becomes a bold role label followed by the step's content; steps
 * are separated by a blank line. Unknown roles are shown under their raw name.
 *
 * @param {Array} deliberation - Array of { role, content } objects
 * @returns {string} Formatted deliberation trace ('' when there are no steps)
 */
function formatDeliberation(deliberation) {
  if (!deliberation || deliberation.length === 0) return '';

  return deliberation.map((step) => {
    const roleLabel = DELIBERATION_ROLE_LABELS.get(step.role) || step.role;
    return `**${roleLabel}**:\n${step.content}`;
  }).join('\n\n');
}
|
|
161
|
+
|
|
162
|
+
/**
 * Build a complete learner evaluation prompt for a single learner turn.
 *
 * The prompt contains the rubric criteria, learner context, the dialogue
 * truncated at the target turn, the target turn's external message, and, for
 * multi-agent learners only, its internal deliberation trace. It ends with
 * the JSON response format the judge must follow.
 *
 * @param {Object} params
 * @param {Array} params.turns - All turns from the interaction
 * @param {number} params.targetTurnIndex - Index of the learner turn to evaluate
 * @param {string} params.personaId - Learner persona ID
 * @param {string} params.personaDescription - Description of the learner persona
 * @param {string} params.learnerArchitecture - 'multi_agent' and 'psychodynamic'
 *   are treated as multi-agent; any other value (default 'unified') as single-agent
 * @param {string} params.scenarioName - Name of the scenario
 * @param {string} params.topic - Topic being discussed
 * @returns {string} Complete judge prompt
 * @throws {TypeError} If `turns` has no entry at `targetTurnIndex`
 */
export function buildLearnerEvaluationPrompt(params) {
  const {
    turns,
    targetTurnIndex,
    personaId = 'unknown',
    personaDescription = 'No persona description available',
    learnerArchitecture = 'unified',
    scenarioName = 'unknown',
    topic = 'unknown',
  } = params;

  // Fail with a descriptive message instead of an opaque property-access error
  // further down when the target turn does not exist.
  const targetTurn = Array.isArray(turns) ? turns[targetTurnIndex] : undefined;
  if (!targetTurn) {
    throw new TypeError(
      `buildLearnerEvaluationPrompt: no turn found at targetTurnIndex ${targetTurnIndex}`,
    );
  }

  const isMultiAgent = learnerArchitecture === 'multi_agent' || learnerArchitecture === 'psychodynamic';
  const dimensions = getLearnerDimensions({ isMultiAgent });
  const dimensionCriteria = buildDimensionCriteria(dimensions);

  const truncatedTranscript = buildTruncatedTranscript(turns, targetTurnIndex);

  // Internal deliberation section (multi-agent only)
  let internalDeliberationSection = '';
  if (isMultiAgent && targetTurn.internalDeliberation?.length > 0) {
    internalDeliberationSection = `
**Internal deliberation** (the learner's ego/superego process — not visible to the tutor):

${formatDeliberation(targetTurn.internalDeliberation)}
`;
  }

  // Tell the judge whether deliberation_depth applies to this learner
  let deliberationDepthNote = '';
  if (isMultiAgent) {
    deliberationDepthNote = 'This is a multi-agent learner. Score ALL dimensions including deliberation_depth (evaluate the quality of the internal ego/superego process shown above).';
  } else {
    deliberationDepthNote = 'This is a single-agent (unified) learner. OMIT the deliberation_depth dimension — do not include it in your scores.';
  }

  // One example entry per applicable dimension for the JSON response template
  const dimKeys = Object.keys(dimensions);
  const exampleScores = dimKeys.map(key => {
    return ` "${key}": {"score": 3, "reasoning": "Brief reason"}`;
  }).join(',\n');

  return `You are an expert evaluator of synthetic learner agents in AI tutoring dialogues. Your task is to evaluate the quality of a LEARNER's response turn — how well the learner agent engages as a student, independent of the tutor's quality.

You are NOT evaluating the tutor. You are evaluating whether the learner agent produces responses that reflect genuine learning engagement: authentic reactions, substantive questions, conceptual thinking, and evidence of intellectual development.

## IMPORTANT: BIAS PREVENTION

You are shown the dialogue history UP TO AND INCLUDING the learner turn being evaluated. You do NOT see subsequent tutor responses. Evaluate the learner turn on its own merits.

## EVALUATION RUBRIC

Score each dimension from 1-5:
- 1: Completely fails this criterion
- 2: Weak, significant issues
- 3: Adequate, meets basic expectations
- 4: Good, exceeds expectations
- 5: Excellent, exemplary

${dimensionCriteria}

## LEARNER CONTEXT

**Assigned Persona**: ${personaId}
**Persona Description**: ${personaDescription}
**Learner Architecture**: ${learnerArchitecture}
**Scenario**: ${scenarioName}
**Topic**: ${topic}

## DIALOGUE HISTORY (up to and including the turn being evaluated)

${truncatedTranscript}

## LEARNER TURN TO EVALUATE

**External message** (what the tutor sees):
${targetTurn.externalMessage || '(no message)'}
${internalDeliberationSection}
## YOUR TASK

${deliberationDepthNote}

Evaluate the learner's turn and provide:
1. A score (1-5) for each applicable dimension with brief reasoning
2. An overall score (weighted average, 0-100 scale)

CRITICAL JSON RULES:
- Never use unescaped double quotes inside JSON string values. Use single quotes or rephrase.
- Keep "reasoning" values under 25 words.
- BAD: "reasoning": "Says \\"great point\\" which sounds scripted"
- GOOD: "reasoning": "Says 'great point' which sounds scripted"

Respond with ONLY a JSON object in this exact format (no other text before or after):
\`\`\`json
{
"scores": {
${exampleScores}
},
"overall_score": 55,
"summary": "Brief overall assessment of learner turn quality"
}
\`\`\``;
}
|
|
278
|
+
|
|
279
|
+
export default {
|
|
280
|
+
loadLearnerRubric,
|
|
281
|
+
getLearnerDimensions,
|
|
282
|
+
calculateLearnerOverallScore,
|
|
283
|
+
buildLearnerEvaluationPrompt,
|
|
284
|
+
};
|