@machinespirits/eval 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. package/README.md +91 -9
  2. package/config/eval-settings.yaml +3 -3
  3. package/config/paper-manifest.json +486 -0
  4. package/config/providers.yaml +9 -6
  5. package/config/tutor-agents.yaml +2261 -0
  6. package/content/README.md +23 -0
  7. package/content/courses/479/course.md +53 -0
  8. package/content/courses/479/lecture-1.md +361 -0
  9. package/content/courses/479/lecture-2.md +360 -0
  10. package/content/courses/479/lecture-3.md +655 -0
  11. package/content/courses/479/lecture-4.md +530 -0
  12. package/content/courses/479/lecture-5.md +326 -0
  13. package/content/courses/479/lecture-6.md +346 -0
  14. package/content/courses/479/lecture-7.md +326 -0
  15. package/content/courses/479/lecture-8.md +273 -0
  16. package/content/courses/479/roadmap-slides.md +656 -0
  17. package/content/manifest.yaml +8 -0
  18. package/docs/research/build.sh +44 -20
  19. package/docs/research/figures/figure10.png +0 -0
  20. package/docs/research/figures/figure11.png +0 -0
  21. package/docs/research/figures/figure3.png +0 -0
  22. package/docs/research/figures/figure4.png +0 -0
  23. package/docs/research/figures/figure5.png +0 -0
  24. package/docs/research/figures/figure6.png +0 -0
  25. package/docs/research/figures/figure7.png +0 -0
  26. package/docs/research/figures/figure8.png +0 -0
  27. package/docs/research/figures/figure9.png +0 -0
  28. package/docs/research/header.tex +23 -2
  29. package/docs/research/paper-full.md +941 -285
  30. package/docs/research/paper-short.md +216 -585
  31. package/docs/research/references.bib +132 -0
  32. package/docs/research/slides-header.tex +188 -0
  33. package/docs/research/slides-pptx.md +363 -0
  34. package/docs/research/slides.md +531 -0
  35. package/docs/research/style-reference-pptx.py +199 -0
  36. package/package.json +6 -5
  37. package/scripts/analyze-eval-results.js +69 -17
  38. package/scripts/analyze-mechanism-traces.js +763 -0
  39. package/scripts/analyze-modulation-learning.js +498 -0
  40. package/scripts/analyze-prosthesis.js +144 -0
  41. package/scripts/analyze-run.js +264 -79
  42. package/scripts/assess-transcripts.js +853 -0
  43. package/scripts/browse-transcripts.js +854 -0
  44. package/scripts/check-parse-failures.js +73 -0
  45. package/scripts/code-dialectical-modulation.js +1320 -0
  46. package/scripts/download-data.sh +55 -0
  47. package/scripts/eval-cli.js +106 -18
  48. package/scripts/generate-paper-figures.js +663 -0
  49. package/scripts/generate-paper-figures.py +577 -76
  50. package/scripts/generate-paper-tables.js +299 -0
  51. package/scripts/qualitative-analysis-ai.js +3 -3
  52. package/scripts/render-sequence-diagram.js +694 -0
  53. package/scripts/test-latency.js +210 -0
  54. package/scripts/test-rate-limit.js +95 -0
  55. package/scripts/test-token-budget.js +332 -0
  56. package/scripts/validate-paper-manifest.js +670 -0
  57. package/services/__tests__/evalConfigLoader.test.js +2 -2
  58. package/services/__tests__/learnerRubricEvaluator.test.js +361 -0
  59. package/services/__tests__/learnerTutorInteractionEngine.test.js +326 -0
  60. package/services/evaluationRunner.js +975 -98
  61. package/services/evaluationStore.js +12 -4
  62. package/services/learnerTutorInteractionEngine.js +27 -2
  63. package/services/mockProvider.js +133 -0
  64. package/services/promptRewriter.js +1471 -5
  65. package/services/rubricEvaluator.js +55 -2
  66. package/services/transcriptFormatter.js +675 -0
  67. package/docs/EVALUATION-VARIABLES.md +0 -589
  68. package/docs/REPLICATION-PLAN.md +0 -577
  69. package/scripts/analyze-run.mjs +0 -282
  70. package/scripts/compare-runs.js +0 -44
  71. package/scripts/compare-suggestions.js +0 -80
  72. package/scripts/dig-into-run.js +0 -158
  73. package/scripts/show-failed-suggestions.js +0 -64
  74. package/scripts/{check-run.mjs → check-run.js} +0 -0
@@ -490,12 +490,19 @@ export function listRuns(options = {}) {
490
490
  ORDER BY scenario_name
491
491
  `);
492
492
 
493
- // Count completed results per run
493
+ // Count completed results per run (primary judge only to avoid inflated counts from rejudging)
494
494
  const resultCountStmt = db.prepare(`
495
495
  SELECT COUNT(*) as completed,
496
496
  SUM(CASE WHEN success = 1 THEN 1 ELSE 0 END) as successful,
497
- AVG(overall_score) as avg_score
498
- FROM evaluation_results WHERE run_id = ?
497
+ AVG(overall_score) as avg_score,
498
+ COUNT(DISTINCT judge_model) as judge_count
499
+ FROM evaluation_results
500
+ WHERE run_id = ?
501
+ AND (judge_model IS NULL OR judge_model = (
502
+ SELECT judge_model FROM evaluation_results
503
+ WHERE run_id = ? AND judge_model IS NOT NULL
504
+ ORDER BY created_at ASC LIMIT 1
505
+ ))
499
506
  `);
500
507
 
501
508
  // Get distinct ego + superego models for each run
@@ -513,7 +520,7 @@ export function listRuns(options = {}) {
513
520
  return rows.map(row => {
514
521
  const scenarioRows = scenarioStmt.all(row.id);
515
522
  const scenarioNames = scenarioRows.map(s => s.scenario_name).filter(Boolean);
516
- const counts = resultCountStmt.get(row.id);
523
+ const counts = resultCountStmt.get(row.id, row.id);
517
524
 
518
525
  const extractAlias = (raw) => {
519
526
  if (!raw) return null;
@@ -554,6 +561,7 @@ export function listRuns(options = {}) {
554
561
  completedResults,
555
562
  successfulResults: counts?.successful || 0,
556
563
  avgScore: counts?.avg_score || null,
564
+ judgeCount: counts?.judge_count || 1,
557
565
  progressPct,
558
566
  durationMs,
559
567
  status: row.status,
@@ -538,6 +538,9 @@ ${tutorMemory || 'New learner - no prior history.'}
538
538
 
539
539
  Topic: ${topic}
540
540
 
541
+ Recent conversation:
542
+ ${conversationContext}
543
+
541
544
  The learner said:
542
545
  "${learnerMessage}"
543
546
 
@@ -1108,6 +1111,7 @@ export async function generateLearnerResponse(options) {
1108
1111
  learnerProfile = 'unified',
1109
1112
  personaId = 'eager_novice',
1110
1113
  modelOverride,
1114
+ profileContext,
1111
1115
  } = options;
1112
1116
 
1113
1117
  // Resolve model override once (if provided) so all learner agents use the same model
@@ -1145,7 +1149,11 @@ export async function generateLearnerResponse(options) {
1145
1149
  if (hasMultiAgent) {
1146
1150
  // === STEP 1: Ego initial reaction ===
1147
1151
  const egoConfig = applyOverride(learnerConfig.getAgentConfig('ego', profile.name));
1148
- const egoContext = `Topic: ${topic}\n\nRecent conversation:\n${conversationContext}\n\nThe tutor just said:\n"${tutorMessage}"\n\nGenerate your initial internal reaction as the learner's ego.`;
1152
+ let egoContext = `Topic: ${topic}\n\nRecent conversation:\n${conversationContext}\n\nThe tutor just said:\n"${tutorMessage}"`;
1153
+ if (profileContext) {
1154
+ egoContext += `\n\n${profileContext}`;
1155
+ }
1156
+ egoContext += `\n\nGenerate your initial internal reaction as the learner's ego.`;
1149
1157
  const egoSystemPrompt = buildLearnerPrompt(egoConfig, persona, egoContext);
1150
1158
 
1151
1159
  const egoInitialResponse = await callLearnerAI(egoConfig, egoSystemPrompt, "React to the tutor's message.", 'learner_ego_initial');
@@ -1156,7 +1164,11 @@ export async function generateLearnerResponse(options) {
1156
1164
 
1157
1165
  // === STEP 2: Superego critique ===
1158
1166
  const superegoConfig = applyOverride(learnerConfig.getAgentConfig('superego', profile.name));
1159
- const superegoContext = `Topic: ${topic}\n\nRecent conversation:\n${conversationContext}\n\nThe tutor just said:\n"${tutorMessage}"\n\nThe EGO's initial reaction was:\n"${egoInitialResponse.content}"\n\nReview the EGO's response. Is it accurate? What's being missed? What should be reconsidered?`;
1167
+ let superegoContext = `Topic: ${topic}\n\nRecent conversation:\n${conversationContext}\n\nThe tutor just said:\n"${tutorMessage}"\n\nThe EGO's initial reaction was:\n"${egoInitialResponse.content}"`;
1168
+ if (profileContext) {
1169
+ superegoContext += `\n\n${profileContext}`;
1170
+ }
1171
+ superegoContext += `\n\nReview the EGO's response. Is it accurate? What's being missed? What should be reconsidered?`;
1160
1172
  const superegoSystemPrompt = buildLearnerPrompt(superegoConfig, persona, superegoContext);
1161
1173
 
1162
1174
  const superegoResponse = await callLearnerAI(superegoConfig, superegoSystemPrompt, "Critique the EGO's reaction.", 'learner_superego');
@@ -1196,6 +1208,9 @@ export async function generateLearnerResponse(options) {
1196
1208
  if (!agentConfig) continue;
1197
1209
 
1198
1210
  let roleContext = `Topic: ${topic}\n\nRecent conversation:\n${conversationContext}\n\nThe tutor just said:\n"${tutorMessage}"`;
1211
+ if (profileContext) {
1212
+ roleContext += `\n\n${profileContext}`;
1213
+ }
1199
1214
  roleContext += `\n\nGenerate your internal reaction as this dimension of the learner's experience.`;
1200
1215
 
1201
1216
  const systemPrompt = buildLearnerPrompt(agentConfig, persona, roleContext);
@@ -1225,6 +1240,16 @@ export async function generateLearnerResponse(options) {
1225
1240
  // Exports
1226
1241
  // ============================================================================
1227
1242
 
1243
+ // Named exports for pure helper functions (used in unit tests)
1244
+ export {
1245
+ detectEmotionalState,
1246
+ detectUnderstandingLevel,
1247
+ detectTutorStrategy,
1248
+ extractTutorMessage,
1249
+ calculateMemoryDelta,
1250
+ INTERACTION_OUTCOMES,
1251
+ };
1252
+
1228
1253
  export default {
1229
1254
  runInteraction,
1230
1255
  generateLearnerResponse,
@@ -0,0 +1,133 @@
1
+ /**
2
+ * Mock Provider for Dry-Run Mode
3
+ *
4
+ * Provides canned generation and judge results that bypass all LLM API calls.
5
+ * Recognition-enabled cells produce higher scores to mimic the ~10-point
6
+ * recognition effect observed in the paper's factorial results.
7
+ */
8
+
9
// Simple deterministic pseudo-random number derived from a seed string.
// Hashes the seed with the classic 31x string hash, then maps the low three
// decimal digits onto [-1, 0.998] in steps of 0.002 (1000 distinct values).
// NOTE: previously documented as "[-1, 1]"; the upper bound is exclusive and
// the maximum attainable value is 0.998.
function seededRandom(seed) {
  let h = 0;
  for (let i = 0; i < seed.length; i++) {
    // h = h * 31 + charCode, wrapped to a signed 32-bit integer each step
    h = ((h << 5) - h + seed.charCodeAt(i)) | 0;
  }
  // Mask off the sign bit, reduce mod 1000, and scale into [-1, 0.998]
  return ((h & 0x7fffffff) % 1000) / 500 - 1;
}
18
+
19
/**
 * Generate a canned tutor suggestion result (replaces tutorApi.generateSuggestions).
 *
 * Profiles whose name mentions recognition receive a recognition-framed
 * suggestion; every other profile receives a standard pedagogical one.
 * No network or LLM call is ever made.
 *
 * @param {Object} resolvedConfig - The resolved config with profileName, provider, model, etc.
 * @param {Object} turnMeta - Turn metadata (scenarioName, description, etc.)
 * @returns {Object} A genResult matching the shape from real tutor-core runs
 */
export function mockGenerateResult(resolvedConfig, turnMeta) {
  const profileName = resolvedConfig.profileName || 'budget';
  const isRecognition =
    profileName.includes('recognition') || profileName.includes('recog');
  const hasSuperego = Boolean(resolvedConfig.superegoModel);
  const subject = turnMeta.scenarioName || 'this concept';

  // Select the canned copy for this cell up front so the return shape below
  // stays flat and easy to compare against real tutor-core output.
  const variant = isRecognition
    ? {
        title: 'Recognizing Your Learning Journey',
        message: `I notice you're approaching this topic with genuine curiosity, and I want to honor that. Let's explore ${subject} together by first acknowledging what you already understand. Your perspective matters here — when we recognize each other as autonomous thinkers, we create space for deeper understanding. What aspects of this topic feel most alive to you right now?`,
        reasoning: 'Applied mutual recognition framework: acknowledged learner autonomy, invited dialogue as co-inquiry, used Hegelian recognition patterns to validate existing knowledge.',
      }
    : {
        title: 'Getting Started with the Material',
        message: `Here's an overview of ${subject}. Let me break it down into manageable steps. First, let's cover the key definitions. Then we'll work through some examples to build your understanding. Feel free to ask questions as we go along.`,
        reasoning: 'Used standard pedagogical approach: structured explanation with clear progression from definitions to examples.',
      };

  // Superego-enabled cells pretend a two-round ego/superego dialogue happened.
  const dialogueTrace = hasSuperego
    ? [
        { agent: 'ego', action: 'generate', suggestions: [{ title: variant.title, type: 'proactive_suggestion' }] },
        { agent: 'superego', action: 'review', approved: true, feedback: 'Pedagogically sound approach.' },
      ]
    : [];

  return {
    success: true,
    suggestions: [
      {
        type: 'proactive_suggestion',
        title: variant.title,
        message: variant.message,
        reasoning: variant.reasoning,
        actionTarget: 'content_engagement',
        priority: 'high',
      },
    ],
    metadata: {
      latencyMs: 42,
      inputTokens: 350,
      outputTokens: 180,
      apiCalls: 1,
      totalCost: 0,
      provider: 'dry-run',
      model: 'mock-v1',
      dialogueRounds: hasSuperego ? 2 : 0,
      converged: true,
    },
    dialogueTrace,
  };
}
73
+
74
/**
 * Generate a canned judge rubric result (replaces rubricEvaluator.evaluateSuggestion).
 *
 * Recognition cells score ~87 (±3 jitter), base cells score ~77 (±3 jitter),
 * producing the ~10-point effect documented in the paper.
 *
 * @param {Object} config - Config object with profileName or factors
 * @param {string} [seed] - Optional seed for deterministic jitter (e.g. scenarioId)
 * @returns {Object} A rubricResult matching the shape from evaluateSuggestion()
 */
export function mockJudgeResult(config, seed = '') {
  const profileName = config.profileName || '';
  const isRecognition = profileName.includes('recognition') || profileName.includes('recog')
    || config.factors?.prompt_type === 'recognition';

  // Deterministic jitter based on profile + seed
  const jitter = seededRandom(profileName + seed) * 0.3; // ±0.3 on 1-5 scale

  // Base scores (1-5 scale): recognition cells ~4.3, base cells ~3.8
  const baseLevel = isRecognition ? 4.3 : 3.8;

  // Per-dimension offsets from baseLevel. (The previous table also carried a
  // `label` field that nothing ever read -- the scores object is keyed by
  // these property names directly, so the dead field has been removed.)
  const dimensionOffsets = {
    relevance: 0.1,
    specificity: -0.1,
    pedagogical: 0.2,
    personalization: 0,
    actionability: -0.2,
    tone: 0.15,
  };

  const scores = {};
  for (const [key, offset] of Object.entries(dimensionOffsets)) {
    // Independent deterministic jitter per dimension, same ±0.3 spread
    const dimJitter = seededRandom(key + profileName + seed) * 0.3;
    const raw = baseLevel + offset + jitter + dimJitter;
    // Clamp to the rubric's 1-5 scale, then round to one decimal place
    const clamped = Math.max(1, Math.min(5, raw));
    scores[key] = {
      score: Math.round(clamped * 10) / 10,
      reasoning: `[dry-run] ${key}: ${isRecognition ? 'Recognition-enhanced' : 'Standard'} pedagogical approach evaluated.`,
    };
  }

  // Calculate overall on 0-100 scale (same formula as rubricEvaluator)
  const avgScore = Object.values(scores).reduce((sum, s) => sum + s.score, 0) / Object.keys(scores).length;
  const overallScore = Math.round(((avgScore - 1) / 4) * 100 * 10) / 10;

  return {
    success: true,
    scores,
    overallScore,
    baseScore: overallScore, // Simplified for dry-run
    recognitionScore: isRecognition ? overallScore + 2 : null,
    passesRequired: true,
    passesForbidden: true,
    requiredMissing: [],
    forbiddenFound: [],
    summary: `[dry-run] ${isRecognition ? 'Recognition-theory enhanced' : 'Standard pedagogical'} response evaluated. Overall: ${overallScore}/100.`,
    judgeModel: 'dry-run/mock-judge-v1',
    evaluationTimeMs: 5,
  };
}