@machinespirits/eval 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +91 -9
- package/config/eval-settings.yaml +3 -3
- package/config/paper-manifest.json +486 -0
- package/config/providers.yaml +9 -6
- package/config/tutor-agents.yaml +2261 -0
- package/content/README.md +23 -0
- package/content/courses/479/course.md +53 -0
- package/content/courses/479/lecture-1.md +361 -0
- package/content/courses/479/lecture-2.md +360 -0
- package/content/courses/479/lecture-3.md +655 -0
- package/content/courses/479/lecture-4.md +530 -0
- package/content/courses/479/lecture-5.md +326 -0
- package/content/courses/479/lecture-6.md +346 -0
- package/content/courses/479/lecture-7.md +326 -0
- package/content/courses/479/lecture-8.md +273 -0
- package/content/courses/479/roadmap-slides.md +656 -0
- package/content/manifest.yaml +8 -0
- package/docs/research/build.sh +44 -20
- package/docs/research/figures/figure10.png +0 -0
- package/docs/research/figures/figure11.png +0 -0
- package/docs/research/figures/figure3.png +0 -0
- package/docs/research/figures/figure4.png +0 -0
- package/docs/research/figures/figure5.png +0 -0
- package/docs/research/figures/figure6.png +0 -0
- package/docs/research/figures/figure7.png +0 -0
- package/docs/research/figures/figure8.png +0 -0
- package/docs/research/figures/figure9.png +0 -0
- package/docs/research/header.tex +23 -2
- package/docs/research/paper-full.md +941 -285
- package/docs/research/paper-short.md +216 -585
- package/docs/research/references.bib +132 -0
- package/docs/research/slides-header.tex +188 -0
- package/docs/research/slides-pptx.md +363 -0
- package/docs/research/slides.md +531 -0
- package/docs/research/style-reference-pptx.py +199 -0
- package/package.json +6 -5
- package/scripts/analyze-eval-results.js +69 -17
- package/scripts/analyze-mechanism-traces.js +763 -0
- package/scripts/analyze-modulation-learning.js +498 -0
- package/scripts/analyze-prosthesis.js +144 -0
- package/scripts/analyze-run.js +264 -79
- package/scripts/assess-transcripts.js +853 -0
- package/scripts/browse-transcripts.js +854 -0
- package/scripts/check-parse-failures.js +73 -0
- package/scripts/code-dialectical-modulation.js +1320 -0
- package/scripts/download-data.sh +55 -0
- package/scripts/eval-cli.js +106 -18
- package/scripts/generate-paper-figures.js +663 -0
- package/scripts/generate-paper-figures.py +577 -76
- package/scripts/generate-paper-tables.js +299 -0
- package/scripts/qualitative-analysis-ai.js +3 -3
- package/scripts/render-sequence-diagram.js +694 -0
- package/scripts/test-latency.js +210 -0
- package/scripts/test-rate-limit.js +95 -0
- package/scripts/test-token-budget.js +332 -0
- package/scripts/validate-paper-manifest.js +670 -0
- package/services/__tests__/evalConfigLoader.test.js +2 -2
- package/services/__tests__/learnerRubricEvaluator.test.js +361 -0
- package/services/__tests__/learnerTutorInteractionEngine.test.js +326 -0
- package/services/evaluationRunner.js +975 -98
- package/services/evaluationStore.js +12 -4
- package/services/learnerTutorInteractionEngine.js +27 -2
- package/services/mockProvider.js +133 -0
- package/services/promptRewriter.js +1471 -5
- package/services/rubricEvaluator.js +55 -2
- package/services/transcriptFormatter.js +675 -0
- package/docs/EVALUATION-VARIABLES.md +0 -589
- package/docs/REPLICATION-PLAN.md +0 -577
- package/scripts/analyze-run.mjs +0 -282
- package/scripts/compare-runs.js +0 -44
- package/scripts/compare-suggestions.js +0 -80
- package/scripts/dig-into-run.js +0 -158
- package/scripts/show-failed-suggestions.js +0 -64
- /package/scripts/{check-run.mjs → check-run.js} +0 -0
|
@@ -490,12 +490,19 @@ export function listRuns(options = {}) {
|
|
|
490
490
|
ORDER BY scenario_name
|
|
491
491
|
`);
|
|
492
492
|
|
|
493
|
-
// Count completed results per run
|
|
493
|
+
// Count completed results per run (primary judge only to avoid inflated counts from rejudging)
|
|
494
494
|
const resultCountStmt = db.prepare(`
|
|
495
495
|
SELECT COUNT(*) as completed,
|
|
496
496
|
SUM(CASE WHEN success = 1 THEN 1 ELSE 0 END) as successful,
|
|
497
|
-
AVG(overall_score) as avg_score
|
|
498
|
-
|
|
497
|
+
AVG(overall_score) as avg_score,
|
|
498
|
+
COUNT(DISTINCT judge_model) as judge_count
|
|
499
|
+
FROM evaluation_results
|
|
500
|
+
WHERE run_id = ?
|
|
501
|
+
AND (judge_model IS NULL OR judge_model = (
|
|
502
|
+
SELECT judge_model FROM evaluation_results
|
|
503
|
+
WHERE run_id = ? AND judge_model IS NOT NULL
|
|
504
|
+
ORDER BY created_at ASC LIMIT 1
|
|
505
|
+
))
|
|
499
506
|
`);
|
|
500
507
|
|
|
501
508
|
// Get distinct ego + superego models for each run
|
|
@@ -513,7 +520,7 @@ export function listRuns(options = {}) {
|
|
|
513
520
|
return rows.map(row => {
|
|
514
521
|
const scenarioRows = scenarioStmt.all(row.id);
|
|
515
522
|
const scenarioNames = scenarioRows.map(s => s.scenario_name).filter(Boolean);
|
|
516
|
-
const counts = resultCountStmt.get(row.id);
|
|
523
|
+
const counts = resultCountStmt.get(row.id, row.id);
|
|
517
524
|
|
|
518
525
|
const extractAlias = (raw) => {
|
|
519
526
|
if (!raw) return null;
|
|
@@ -554,6 +561,7 @@ export function listRuns(options = {}) {
|
|
|
554
561
|
completedResults,
|
|
555
562
|
successfulResults: counts?.successful || 0,
|
|
556
563
|
avgScore: counts?.avg_score || null,
|
|
564
|
+
judgeCount: counts?.judge_count || 1,
|
|
557
565
|
progressPct,
|
|
558
566
|
durationMs,
|
|
559
567
|
status: row.status,
|
|
@@ -538,6 +538,9 @@ ${tutorMemory || 'New learner - no prior history.'}
|
|
|
538
538
|
|
|
539
539
|
Topic: ${topic}
|
|
540
540
|
|
|
541
|
+
Recent conversation:
|
|
542
|
+
${conversationContext}
|
|
543
|
+
|
|
541
544
|
The learner said:
|
|
542
545
|
"${learnerMessage}"
|
|
543
546
|
|
|
@@ -1108,6 +1111,7 @@ export async function generateLearnerResponse(options) {
|
|
|
1108
1111
|
learnerProfile = 'unified',
|
|
1109
1112
|
personaId = 'eager_novice',
|
|
1110
1113
|
modelOverride,
|
|
1114
|
+
profileContext,
|
|
1111
1115
|
} = options;
|
|
1112
1116
|
|
|
1113
1117
|
// Resolve model override once (if provided) so all learner agents use the same model
|
|
@@ -1145,7 +1149,11 @@ export async function generateLearnerResponse(options) {
|
|
|
1145
1149
|
if (hasMultiAgent) {
|
|
1146
1150
|
// === STEP 1: Ego initial reaction ===
|
|
1147
1151
|
const egoConfig = applyOverride(learnerConfig.getAgentConfig('ego', profile.name));
|
|
1148
|
-
|
|
1152
|
+
let egoContext = `Topic: ${topic}\n\nRecent conversation:\n${conversationContext}\n\nThe tutor just said:\n"${tutorMessage}"`;
|
|
1153
|
+
if (profileContext) {
|
|
1154
|
+
egoContext += `\n\n${profileContext}`;
|
|
1155
|
+
}
|
|
1156
|
+
egoContext += `\n\nGenerate your initial internal reaction as the learner's ego.`;
|
|
1149
1157
|
const egoSystemPrompt = buildLearnerPrompt(egoConfig, persona, egoContext);
|
|
1150
1158
|
|
|
1151
1159
|
const egoInitialResponse = await callLearnerAI(egoConfig, egoSystemPrompt, "React to the tutor's message.", 'learner_ego_initial');
|
|
@@ -1156,7 +1164,11 @@ export async function generateLearnerResponse(options) {
|
|
|
1156
1164
|
|
|
1157
1165
|
// === STEP 2: Superego critique ===
|
|
1158
1166
|
const superegoConfig = applyOverride(learnerConfig.getAgentConfig('superego', profile.name));
|
|
1159
|
-
|
|
1167
|
+
let superegoContext = `Topic: ${topic}\n\nRecent conversation:\n${conversationContext}\n\nThe tutor just said:\n"${tutorMessage}"\n\nThe EGO's initial reaction was:\n"${egoInitialResponse.content}"`;
|
|
1168
|
+
if (profileContext) {
|
|
1169
|
+
superegoContext += `\n\n${profileContext}`;
|
|
1170
|
+
}
|
|
1171
|
+
superegoContext += `\n\nReview the EGO's response. Is it accurate? What's being missed? What should be reconsidered?`;
|
|
1160
1172
|
const superegoSystemPrompt = buildLearnerPrompt(superegoConfig, persona, superegoContext);
|
|
1161
1173
|
|
|
1162
1174
|
const superegoResponse = await callLearnerAI(superegoConfig, superegoSystemPrompt, "Critique the EGO's reaction.", 'learner_superego');
|
|
@@ -1196,6 +1208,9 @@ export async function generateLearnerResponse(options) {
|
|
|
1196
1208
|
if (!agentConfig) continue;
|
|
1197
1209
|
|
|
1198
1210
|
let roleContext = `Topic: ${topic}\n\nRecent conversation:\n${conversationContext}\n\nThe tutor just said:\n"${tutorMessage}"`;
|
|
1211
|
+
if (profileContext) {
|
|
1212
|
+
roleContext += `\n\n${profileContext}`;
|
|
1213
|
+
}
|
|
1199
1214
|
roleContext += `\n\nGenerate your internal reaction as this dimension of the learner's experience.`;
|
|
1200
1215
|
|
|
1201
1216
|
const systemPrompt = buildLearnerPrompt(agentConfig, persona, roleContext);
|
|
@@ -1225,6 +1240,16 @@ export async function generateLearnerResponse(options) {
|
|
|
1225
1240
|
// Exports
|
|
1226
1241
|
// ============================================================================
|
|
1227
1242
|
|
|
1243
|
+
// Named exports for pure helper functions (used in unit tests)
|
|
1244
|
+
export {
|
|
1245
|
+
detectEmotionalState,
|
|
1246
|
+
detectUnderstandingLevel,
|
|
1247
|
+
detectTutorStrategy,
|
|
1248
|
+
extractTutorMessage,
|
|
1249
|
+
calculateMemoryDelta,
|
|
1250
|
+
INTERACTION_OUTCOMES,
|
|
1251
|
+
};
|
|
1252
|
+
|
|
1228
1253
|
export default {
|
|
1229
1254
|
runInteraction,
|
|
1230
1255
|
generateLearnerResponse,
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Mock Provider for Dry-Run Mode
|
|
3
|
+
*
|
|
4
|
+
* Provides canned generation and judge results that bypass all LLM API calls.
|
|
5
|
+
* Recognition-enabled cells produce higher scores to mimic the ~10-point
|
|
6
|
+
* recognition effect observed in the paper's factorial results.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
/**
 * Deterministic pseudo-random value derived from a seed.
 *
 * The seed is hashed with a 31-multiplier rolling hash over its UTF-16 code
 * units (kept in 32-bit signed range via `| 0`), then mapped onto a 1000-step
 * grid. The same seed always yields the same value.
 *
 * @param {*} seed - Seed value; coerced to a string. `null`/`undefined` are
 *   treated as the empty string so a missing seed cannot throw.
 * @returns {number} A value k/500 - 1 for k in 0..999, i.e. in [-1, 0.998].
 *   The empty seed hashes to 0 and therefore returns exactly -1.
 */
function seededRandom(seed) {
  // Coerce first: a non-string seed has no usable `.length`, which would
  // otherwise skip the loop entirely and collapse every such seed to -1.
  const text = String(seed ?? '');
  let hash = 0;
  for (let i = 0; i < text.length; i++) {
    // (hash << 5) - hash === hash * 31; `| 0` wraps to a signed 32-bit int.
    hash = ((hash << 5) - hash + text.charCodeAt(i)) | 0;
  }
  // Mask off the sign bit so the modulo result is never negative.
  return ((hash & 0x7fffffff) % 1000) / 500 - 1;
}
|
|
18
|
+
|
|
19
|
+
/**
 * Generate a canned tutor suggestion result (replaces tutorApi.generateSuggestions).
 *
 * No LLM call is made: the title, message and reasoning are fixed strings
 * selected by whether the profile name contains "recog" (which also covers
 * "recognition"). Missing or nullish arguments fall back to defaults rather
 * than throwing, so a dry run cannot crash on incomplete metadata.
 *
 * @param {Object} [resolvedConfig] - Resolved config. Reads `profileName`
 *   (defaults to 'budget') and `superegoModel` (truthy => a two-step
 *   ego/superego dialogue trace is reported).
 * @param {Object} [turnMeta] - Turn metadata. Reads `scenarioName`, which is
 *   interpolated into the message (defaults to 'this concept').
 * @returns {Object} `{ success, suggestions, metadata, dialogueTrace }`,
 *   intended to mirror the shape of a real generation result.
 */
export function mockGenerateResult(resolvedConfig, turnMeta) {
  const profileName = String(resolvedConfig?.profileName || 'budget');
  // 'recognition' contains 'recog', so a single substring check suffices.
  const isRecognition = profileName.includes('recog');
  const topic = turnMeta?.scenarioName || 'this concept';
  // Evaluated once; drives both the reported round count and the trace.
  const hasSuperego = Boolean(resolvedConfig?.superegoModel);

  const title = isRecognition
    ? 'Recognizing Your Learning Journey'
    : 'Getting Started with the Material';

  const message = isRecognition
    ? `I notice you're approaching this topic with genuine curiosity, and I want to honor that. Let's explore ${topic} together by first acknowledging what you already understand. Your perspective matters here — when we recognize each other as autonomous thinkers, we create space for deeper understanding. What aspects of this topic feel most alive to you right now?`
    : `Here's an overview of ${topic}. Let me break it down into manageable steps. First, let's cover the key definitions. Then we'll work through some examples to build your understanding. Feel free to ask questions as we go along.`;

  const reasoning = isRecognition
    ? 'Applied mutual recognition framework: acknowledged learner autonomy, invited dialogue as co-inquiry, used Hegelian recognition patterns to validate existing knowledge.'
    : 'Used standard pedagogical approach: structured explanation with clear progression from definitions to examples.';

  const dialogueTrace = hasSuperego
    ? [
        { agent: 'ego', action: 'generate', suggestions: [{ title, type: 'proactive_suggestion' }] },
        { agent: 'superego', action: 'review', approved: true, feedback: 'Pedagogically sound approach.' },
      ]
    : [];

  return {
    success: true,
    suggestions: [
      {
        type: 'proactive_suggestion',
        title,
        message,
        reasoning,
        actionTarget: 'content_engagement',
        priority: 'high',
      },
    ],
    // Fixed placeholder metrics; nothing here is measured.
    metadata: {
      latencyMs: 42,
      inputTokens: 350,
      outputTokens: 180,
      apiCalls: 1,
      totalCost: 0,
      provider: 'dry-run',
      model: 'mock-v1',
      dialogueRounds: hasSuperego ? 2 : 0,
      converged: true,
    },
    dialogueTrace,
  };
}
|
|
73
|
+
|
|
74
|
+
/**
 * Generate a canned judge rubric result (replaces rubricEvaluator.evaluateSuggestion).
 *
 * Six dimensions are scored on a 1-5 scale around a base level of 4.3
 * (recognition) or 3.8 (base), each with a fixed per-dimension offset. Two
 * deterministic jitters of up to ±0.3 are added: one shared by all dimensions
 * (seeded by profile + seed) and one per dimension. Scores are clamped to
 * [1, 5] and rounded to one decimal.
 *
 * The overall score is the dimension mean rescaled to 0-100 via
 * `(avg - 1) / 4 * 100`. With zero jitter that is roughly 83 for recognition
 * cells and roughly 71 for base cells (a gap of about 12.5 points); the shared
 * jitter alone can move the overall score by up to about ±7.5 points.
 *
 * @param {Object} [config] - Reads `profileName` and `factors.prompt_type`.
 *   A cell counts as recognition when the profile name contains "recog" or
 *   `factors.prompt_type === 'recognition'`. Nullish config is treated as a
 *   base cell with an empty profile name.
 * @param {string} [seed] - Optional seed for deterministic jitter (e.g. scenarioId)
 * @returns {Object} A rubric-style result: `success`, per-dimension `scores`
 *   (`{ score, reasoning }`), `overallScore`, `baseScore`, `recognitionScore`
 *   (null for base cells), pass/fail flags, `summary`, `judgeModel`,
 *   `evaluationTimeMs`.
 */
export function mockJudgeResult(config, seed = '') {
  const profileName = String(config?.profileName || '');
  // 'recognition' contains 'recog', so one substring check covers both.
  const isRecognition = profileName.includes('recog')
    || config?.factors?.prompt_type === 'recognition';

  // Jitter shared by every dimension of this (profile, seed) pair: ±0.3 on the 1-5 scale.
  const jitter = seededRandom(profileName + seed) * 0.3;

  // Base level on the 1-5 scale before per-dimension offsets.
  const baseLevel = isRecognition ? 4.3 : 3.8;

  // Fixed offset of each rubric dimension relative to the base level.
  const dimensionOffsets = {
    relevance: 0.1,
    specificity: -0.1,
    pedagogical: 0.2,
    personalization: 0,
    actionability: -0.2,
    tone: 0.15,
  };

  const approachLabel = isRecognition ? 'Recognition-enhanced' : 'Standard';
  const scores = {};
  for (const [key, offset] of Object.entries(dimensionOffsets)) {
    // Per-dimension jitter is seeded with the dimension key so dimensions differ.
    const dimJitter = seededRandom(key + profileName + seed) * 0.3;
    const raw = baseLevel + offset + jitter + dimJitter;
    const clamped = Math.max(1, Math.min(5, raw));
    scores[key] = {
      score: Math.round(clamped * 10) / 10,
      reasoning: `[dry-run] ${key}: ${approachLabel} pedagogical approach evaluated.`,
    };
  }

  // Rescale the 1-5 mean to 0-100, rounded to one decimal.
  // NOTE(review): the original comment says this mirrors rubricEvaluator's formula — confirm.
  const dimensionScores = Object.values(scores);
  const avgScore = dimensionScores.reduce((sum, s) => sum + s.score, 0) / dimensionScores.length;
  const overallScore = Math.round(((avgScore - 1) / 4) * 100 * 10) / 10;

  return {
    success: true,
    scores,
    overallScore,
    baseScore: overallScore, // Simplified for dry-run
    recognitionScore: isRecognition ? overallScore + 2 : null,
    passesRequired: true,
    passesForbidden: true,
    requiredMissing: [],
    forbiddenFound: [],
    summary: `[dry-run] ${isRecognition ? 'Recognition-theory enhanced' : 'Standard pedagogical'} response evaluated. Overall: ${overallScore}/100.`,
    judgeModel: 'dry-run/mock-judge-v1',
    evaluationTimeMs: 5,
  };
}
|