@machinespirits/eval 0.1.2 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +161 -0
  3. package/config/eval-settings.yaml +18 -0
  4. package/config/evaluation-rubric-learner.yaml +277 -0
  5. package/config/evaluation-rubric.yaml +613 -0
  6. package/config/interaction-eval-scenarios.yaml +93 -50
  7. package/config/learner-agents.yaml +124 -193
  8. package/config/machinespirits-eval.code-workspace +11 -0
  9. package/config/providers.yaml +60 -0
  10. package/config/suggestion-scenarios.yaml +1399 -0
  11. package/config/tutor-agents.yaml +716 -0
  12. package/docs/EVALUATION-VARIABLES.md +589 -0
  13. package/docs/REPLICATION-PLAN.md +577 -0
  14. package/index.js +15 -6
  15. package/package.json +16 -22
  16. package/routes/evalRoutes.js +88 -36
  17. package/scripts/analyze-judge-reliability.js +401 -0
  18. package/scripts/analyze-run.js +97 -0
  19. package/scripts/analyze-run.mjs +282 -0
  20. package/scripts/analyze-validation-failures.js +141 -0
  21. package/scripts/check-run.mjs +17 -0
  22. package/scripts/code-impasse-strategies.js +1132 -0
  23. package/scripts/compare-runs.js +44 -0
  24. package/scripts/compare-suggestions.js +80 -0
  25. package/scripts/compare-transformation.js +116 -0
  26. package/scripts/dig-into-run.js +158 -0
  27. package/scripts/eval-cli.js +2626 -0
  28. package/scripts/generate-paper-figures.py +452 -0
  29. package/scripts/qualitative-analysis-ai.js +1313 -0
  30. package/scripts/qualitative-analysis.js +688 -0
  31. package/scripts/seed-db.js +87 -0
  32. package/scripts/show-failed-suggestions.js +64 -0
  33. package/scripts/validate-content.js +192 -0
  34. package/server.js +3 -2
  35. package/services/__tests__/evalConfigLoader.test.js +338 -0
  36. package/services/anovaStats.js +499 -0
  37. package/services/contentResolver.js +407 -0
  38. package/services/dialogueTraceAnalyzer.js +454 -0
  39. package/services/evalConfigLoader.js +625 -0
  40. package/services/evaluationRunner.js +2171 -270
  41. package/services/evaluationStore.js +564 -29
  42. package/services/learnerConfigLoader.js +75 -5
  43. package/services/learnerRubricEvaluator.js +284 -0
  44. package/services/learnerTutorInteractionEngine.js +375 -0
  45. package/services/processUtils.js +18 -0
  46. package/services/progressLogger.js +98 -0
  47. package/services/promptRecommendationService.js +31 -26
  48. package/services/promptRewriter.js +427 -0
  49. package/services/rubricEvaluator.js +543 -70
  50. package/services/streamingReporter.js +104 -0
  51. package/services/turnComparisonAnalyzer.js +494 -0
  52. package/components/MobileEvalDashboard.tsx +0 -267
  53. package/components/comparison/DeltaAnalysisTable.tsx +0 -137
  54. package/components/comparison/ProfileComparisonCard.tsx +0 -176
  55. package/components/comparison/RecognitionABMode.tsx +0 -385
  56. package/components/comparison/RecognitionMetricsPanel.tsx +0 -135
  57. package/components/comparison/WinnerIndicator.tsx +0 -64
  58. package/components/comparison/index.ts +0 -5
  59. package/components/mobile/BottomSheet.tsx +0 -233
  60. package/components/mobile/DimensionBreakdown.tsx +0 -210
  61. package/components/mobile/DocsView.tsx +0 -363
  62. package/components/mobile/LogsView.tsx +0 -481
  63. package/components/mobile/PsychodynamicQuadrant.tsx +0 -261
  64. package/components/mobile/QuickTestView.tsx +0 -1098
  65. package/components/mobile/RecognitionTypeChart.tsx +0 -124
  66. package/components/mobile/RecognitionView.tsx +0 -809
  67. package/components/mobile/RunDetailView.tsx +0 -261
  68. package/components/mobile/RunHistoryView.tsx +0 -367
  69. package/components/mobile/ScoreRadial.tsx +0 -211
  70. package/components/mobile/StreamingLogPanel.tsx +0 -230
  71. package/components/mobile/SynthesisStrategyChart.tsx +0 -140
  72. package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +0 -52
  73. package/docs/research/ABLATION-MODEL-SELECTION.md +0 -53
  74. package/docs/research/ADVANCED-EVAL-ANALYSIS.md +0 -60
  75. package/docs/research/ANOVA-RESULTS-2026-01-14.md +0 -257
  76. package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +0 -586
  77. package/docs/research/COST-ANALYSIS.md +0 -56
  78. package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +0 -340
  79. package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +0 -291
  80. package/docs/research/EVAL-SYSTEM-ANALYSIS.md +0 -306
  81. package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +0 -301
  82. package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +0 -1988
  83. package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +0 -282
  84. package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +0 -147
  85. package/docs/research/PAPER-EXTENSION-DYADIC.md +0 -204
  86. package/docs/research/PAPER-UNIFIED.md +0 -659
  87. package/docs/research/PAPER-UNIFIED.pdf +0 -0
  88. package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +0 -356
  89. package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +0 -419
  90. package/docs/research/apa.csl +0 -2133
  91. package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +0 -1637
  92. package/docs/research/archive/paper-multiagent-tutor.tex +0 -978
  93. package/docs/research/paper-draft/full-paper.md +0 -136
  94. package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
  95. package/docs/research/paper-draft/references.bib +0 -515
  96. package/docs/research/transcript-baseline.md +0 -139
  97. package/docs/research/transcript-recognition-multiagent.md +0 -187
  98. package/hooks/useEvalData.ts +0 -625
  99. package/server-init.js +0 -45
  100. package/services/benchmarkService.js +0 -1892
  101. package/types.ts +0 -165
  102. package/utils/haptics.ts +0 -45
@@ -847,11 +847,386 @@ function calculateMemoryDelta(before, after) {
847
847
  };
848
848
  }
849
849
 
850
// ============================================================================
// Standalone Learner Response (for evaluation pipeline)
// ============================================================================

// Retry delays for 429 rate limits (matches evaluationRunner pattern)
const LEARNER_RETRY_DELAYS = [2000, 4000, 8000];

/**
 * Call the LLM for a learner agent using the same raw fetch layer as
 * tutorDialogueEngine.callAI — same headers, error handling, and response
 * parsing per provider — so learner and tutor calls go through identical
 * network code paths.
 *
 * Retries with exponential backoff when the error message signals a 429
 * rate limit (contains "429" or "rate limit"); any other error is rethrown
 * immediately.
 *
 * @param {Object} agentConfig - From learnerConfig.getAgentConfig()
 * @param {string} systemPrompt - Static system/persona prompt (cacheable)
 * @param {string} userPrompt - Dynamic per-call user content
 * @param {string} agentRole - For logging (e.g. 'ego', 'superego', 'synthesis')
 * @returns {Promise<Object>} { content, usage: { inputTokens, outputTokens }, latencyMs }
 */
async function callLearnerAI(agentConfig, systemPrompt, userPrompt, agentRole = 'learner') {
  const maxRetries = LEARNER_RETRY_DELAYS.length;
  // One initial attempt plus one retry per configured delay.
  for (let attempt = 0; ; attempt++) {
    try {
      return await _callLearnerAIOnce(agentConfig, systemPrompt, userPrompt, agentRole);
    } catch (error) {
      const message = error?.message ?? '';
      const rateLimited = message.includes('429') || message.toLowerCase().includes('rate limit');
      if (!rateLimited || attempt >= maxRetries) throw error;
      const delay = LEARNER_RETRY_DELAYS[attempt];
      console.warn(`[${agentRole}] Rate limit hit, retrying in ${delay}ms (attempt ${attempt + 1}/${maxRetries})`);
      await new Promise(resolve => setTimeout(resolve, delay));
    }
  }
}
888
+
889
/**
 * Single-attempt LLM call. Mirrors tutorDialogueEngine.callAI per-provider
 * fetch logic: same headers, same body format, same error parsing.
 * Accepts system and user prompts separately for provider-level caching.
 *
 * @param {Object} agentConfig - { provider, providerConfig, model, hyperparameters }
 * @param {string} systemPrompt - System/persona prompt
 * @param {string} userPrompt - Dynamic per-call user content
 * @param {string} agentRole - Label used in log messages
 * @returns {Promise<Object>} { content, usage: { inputTokens, outputTokens }, latencyMs }
 * @throws {Error} If the provider is unconfigured, unsupported, or the API call fails
 */
async function _callLearnerAIOnce(agentConfig, systemPrompt, userPrompt, agentRole) {
  const { provider, providerConfig, model, hyperparameters = {} } = agentConfig;
  let { temperature = 0.7, max_tokens = 300, top_p } = hyperparameters;

  // Thinking models (kimi-k2.5, deepseek-r1, etc.) use reasoning tokens that consume
  // the max_tokens budget. Increase significantly to allow for both reasoning and output.
  const isThinkingModel = model?.includes('kimi-k2') || model?.includes('deepseek-r1');
  if (isThinkingModel && max_tokens < 2000) {
    max_tokens = 2000;
  }

  if (!providerConfig?.isConfigured) {
    throw new Error(`Learner provider ${provider} not configured (missing API key)`);
  }

  const startTime = Date.now();

  // --- Anthropic ---
  if (provider === 'anthropic') {
    const bodyParams = {
      model,
      max_tokens,
      temperature,
      system: systemPrompt,
      messages: [{ role: 'user', content: userPrompt }],
    };
    // Anthropic recommends setting either temperature or top_p, not both.
    if (top_p !== undefined) {
      delete bodyParams.temperature;
      bodyParams.top_p = top_p;
    }

    const res = await fetch(providerConfig.base_url, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'x-api-key': providerConfig.apiKey,
        'anthropic-version': '2023-06-01',
      },
      body: JSON.stringify(bodyParams),
    });

    if (!res.ok) {
      const data = await res.json().catch(() => ({}));
      throw new Error(`Anthropic API error: ${res.status} - ${data?.error?.message || 'Unknown error'}`);
    }

    const data = await res.json();
    return {
      content: data?.content?.[0]?.text?.trim() || '',
      usage: {
        inputTokens: data?.usage?.input_tokens || 0,
        outputTokens: data?.usage?.output_tokens || 0,
      },
      latencyMs: Date.now() - startTime,
    };
  }

  // --- OpenAI ---
  if (provider === 'openai') {
    const res = await fetch(providerConfig.base_url, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        Authorization: `Bearer ${providerConfig.apiKey}`,
      },
      // JSON.stringify drops top_p when it is undefined, so it is safe to pass through.
      body: JSON.stringify({
        model,
        temperature,
        max_tokens,
        top_p,
        messages: [
          { role: 'system', content: systemPrompt },
          { role: 'user', content: userPrompt },
        ],
      }),
    });

    if (!res.ok) {
      const data = await res.json().catch(() => ({}));
      throw new Error(`OpenAI API error: ${res.status} - ${data?.error?.message || 'Unknown error'}`);
    }

    const data = await res.json();
    return {
      content: data?.choices?.[0]?.message?.content?.trim() || '',
      usage: {
        inputTokens: data?.usage?.prompt_tokens || 0,
        outputTokens: data?.usage?.completion_tokens || 0,
      },
      latencyMs: Date.now() - startTime,
    };
  }

  // --- OpenRouter ---
  if (provider === 'openrouter') {
    const res = await fetch(providerConfig.base_url, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        Authorization: `Bearer ${providerConfig.apiKey}`,
        'HTTP-Referer': process.env.OPENROUTER_REFERER || 'https://machine-spirits.com',
        'X-Title': 'Machine Spirits Tutor',
      },
      body: JSON.stringify({
        model,
        temperature,
        max_tokens,
        top_p,
        messages: [
          { role: 'system', content: systemPrompt },
          { role: 'user', content: userPrompt },
        ],
      }),
    });

    if (!res.ok) {
      const data = await res.json().catch(() => ({}));
      throw new Error(`OpenRouter API error: ${res.status} - ${data?.error?.message || 'Unknown error'}`);
    }

    const data = await res.json();
    const content = data?.choices?.[0]?.message?.content?.trim() || '';

    // Empty content usually means the model burned the token budget on reasoning.
    if (!content) {
      console.warn(`[${agentRole}] OpenRouter returned empty content. Model: ${model}, finish_reason: ${data?.choices?.[0]?.finish_reason}`);
    }

    return {
      content,
      usage: {
        inputTokens: data?.usage?.prompt_tokens || 0,
        outputTokens: data?.usage?.completion_tokens || 0,
      },
      latencyMs: Date.now() - startTime,
    };
  }

  // --- Gemini ---
  if (provider === 'gemini') {
    const { GoogleGenAI } = await import('@google/genai');
    const gemini = new GoogleGenAI({ apiKey: providerConfig.apiKey });

    // In @google/genai, systemInstruction belongs inside `config`; a top-level
    // systemInstruction field is ignored, which silently dropped the persona prompt.
    const result = await gemini.models.generateContent({
      model,
      contents: [{ role: 'user', parts: [{ text: userPrompt }] }],
      config: { systemInstruction: systemPrompt, temperature, maxOutputTokens: max_tokens, topP: top_p },
    });

    // `text` is an accessor property in @google/genai; older SDKs exposed it as a
    // method. Support both forms so an SDK change doesn't break response parsing.
    const rawText = typeof result?.text === 'function' ? result.text() : result?.text;
    const content = rawText || result?.response?.text?.() || '';
    const usageMeta = result?.usageMetadata || {};
    return {
      content,
      usage: {
        inputTokens: usageMeta.promptTokenCount || 0,
        outputTokens: usageMeta.candidatesTokenCount || 0,
      },
      latencyMs: Date.now() - startTime,
    };
  }

  // --- Local (LM Studio / Ollama / llama.cpp) ---
  if (provider === 'local') {
    const res = await fetch(providerConfig.base_url, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({
        model,
        temperature,
        max_tokens,
        messages: [
          { role: 'system', content: systemPrompt },
          { role: 'user', content: userPrompt },
        ],
      }),
    });

    if (!res.ok) {
      const data = await res.json().catch(() => ({}));
      throw new Error(`Local LLM error: ${res.status} - ${data?.error?.message || 'Is LM Studio running?'}`);
    }

    const data = await res.json();
    return {
      content: data?.choices?.[0]?.message?.content?.trim() || '',
      usage: {
        inputTokens: data?.usage?.prompt_tokens || 0,
        outputTokens: data?.usage?.completion_tokens || 0,
      },
      latencyMs: Date.now() - startTime,
    };
  }

  throw new Error(`Unsupported learner provider: ${provider}`);
}
1085
+
1086
/**
 * Generate a single learner response for use by the evaluation pipeline.
 * Runs ego→superego→synthesis if profile is multi-agent, or single call if unified.
 *
 * Uses callLearnerAI internally — the same raw fetch layer as the tutor's
 * tutorDialogueEngine.callAI — so learner and tutor LLM calls go through
 * identical provider code paths with identical retry logic.
 *
 * @param {Object} options
 * @param {string} options.tutorMessage - The tutor's message to respond to
 * @param {string} options.topic - Current topic
 * @param {Array} options.conversationHistory - [{role, content}, ...]
 * @param {string} options.learnerProfile - Profile name ('ego_superego' or 'unified')
 * @param {string} options.personaId - Persona identifier (default: 'eager_novice')
 * @param {string|Object} [options.modelOverride] - Optional model override (e.g. 'openrouter.nemotron') applied to all learner agents
 * @returns {Promise<Object>} { message, internalDeliberation, emotionalState, understandingLevel, tokenUsage }
 * @throws {Error} If no learner agent produced any output (e.g. every role in the profile is misconfigured)
 */
export async function generateLearnerResponse(options) {
  const {
    tutorMessage,
    topic,
    conversationHistory = [],
    learnerProfile = 'unified',
    personaId = 'eager_novice',
    modelOverride,
  } = options;

  // Resolve model override once (if provided) so all learner agents use the same model
  let resolvedOverride = null;
  if (modelOverride) {
    const r = learnerConfig.resolveModel(modelOverride);
    const providerConfig = learnerConfig.getProviderConfig(r.provider);
    // Map a short alias to the provider's full model id when the alias is known.
    const modelFullId = providerConfig.models?.[r.model] || r.model;
    resolvedOverride = { provider: r.provider, providerConfig, model: modelFullId, modelAlias: r.model };
  }

  // Overlay the resolved override onto an agent config without mutating the original.
  const applyOverride = (cfg) => {
    if (!resolvedOverride || !cfg) return cfg;
    return { ...cfg, provider: resolvedOverride.provider, providerConfig: resolvedOverride.providerConfig, model: resolvedOverride.model, modelAlias: resolvedOverride.modelAlias };
  };

  const persona = learnerConfig.getPersona(personaId);
  const profile = learnerConfig.getActiveProfile(learnerProfile);
  const agentRoles = learnerConfig.getProfileAgentRoles(profile.name);
  const internalDeliberation = [];
  const tokenUsage = { inputTokens: 0, outputTokens: 0, apiCalls: 0 };

  // Accumulate token usage for one LLM call (shared by all flows below).
  const recordUsage = (response) => {
    tokenUsage.inputTokens += response.usage?.inputTokens || 0;
    tokenUsage.outputTokens += response.usage?.outputTokens || 0;
    tokenUsage.apiCalls++;
  };

  // Build conversation context string from history (most recent 6 turns)
  const conversationContext = conversationHistory
    .slice(-6)
    .map(m => `${m.role.toUpperCase()}: ${m.content}`)
    .join('\n\n');

  // Psychodynamic flow: Ego (initial) → Superego (critique) → Ego (revision/final)
  // This mirrors the tutor architecture where the ego has final authority over output,
  // accepting, rejecting, or modifying the superego's suggestions.

  const hasMultiAgent = agentRoles.includes('ego') && agentRoles.includes('superego');

  if (hasMultiAgent) {
    // === STEP 1: Ego initial reaction ===
    const egoConfig = applyOverride(learnerConfig.getAgentConfig('ego', profile.name));
    const egoContext = `Topic: ${topic}\n\nRecent conversation:\n${conversationContext}\n\nThe tutor just said:\n"${tutorMessage}"\n\nGenerate your initial internal reaction as the learner's ego.`;
    const egoSystemPrompt = buildLearnerPrompt(egoConfig, persona, egoContext);

    const egoInitialResponse = await callLearnerAI(egoConfig, egoSystemPrompt, "React to the tutor's message.", 'learner_ego_initial');
    internalDeliberation.push({ role: 'ego_initial', content: egoInitialResponse.content });
    recordUsage(egoInitialResponse);

    // === STEP 2: Superego critique ===
    const superegoConfig = applyOverride(learnerConfig.getAgentConfig('superego', profile.name));
    const superegoContext = `Topic: ${topic}\n\nRecent conversation:\n${conversationContext}\n\nThe tutor just said:\n"${tutorMessage}"\n\nThe EGO's initial reaction was:\n"${egoInitialResponse.content}"\n\nReview the EGO's response. Is it accurate? What's being missed? What should be reconsidered?`;
    const superegoSystemPrompt = buildLearnerPrompt(superegoConfig, persona, superegoContext);

    const superegoResponse = await callLearnerAI(superegoConfig, superegoSystemPrompt, "Critique the EGO's reaction.", 'learner_superego');
    internalDeliberation.push({ role: 'superego', content: superegoResponse.content });
    recordUsage(superegoResponse);

    // === STEP 3: Ego revision (final authority) ===
    // The ego considers the superego's feedback and decides what to actually say.
    // It may accept, reject, or modify the superego's suggestions.
    const egoRevisionContext = `Topic: ${topic}\n\nRecent conversation:\n${conversationContext}\n\nThe tutor just said:\n"${tutorMessage}"\n\nYour initial reaction was:\n"${egoInitialResponse.content}"\n\nThe SUPEREGO's critique:\n"${superegoResponse.content}"\n\nConsider the superego's feedback. You have final authority — accept, reject, or modify its suggestions as you see fit. Then produce a realistic external response (1-4 sentences) that the learner would actually say to the tutor.`;
    const egoRevisionSystemPrompt = buildLearnerPrompt(egoConfig, persona, egoRevisionContext);

    const egoFinalResponse = await callLearnerAI(egoConfig, egoRevisionSystemPrompt, "Produce your final response to the tutor.", 'learner_ego_revision');
    internalDeliberation.push({ role: 'ego_revision', content: egoFinalResponse.content });
    recordUsage(egoFinalResponse);

    // Log deliberation for debugging/analysis
    if (process.env.LEARNER_DEBUG) {
      console.log('\n┌─────────────────────────────────────────────────────────────');
      console.log('│ LEARNER DELIBERATION (ego→superego→ego_revision)');
      console.log('├─────────────────────────────────────────────────────────────');
      console.log(`│ EGO INITIAL: ${egoInitialResponse.content.substring(0, 200)}...`);
      console.log('├─────────────────────────────────────────────────────────────');
      console.log(`│ SUPEREGO: ${superegoResponse.content.substring(0, 200)}...`);
      console.log('├─────────────────────────────────────────────────────────────');
      console.log(`│ EGO REVISION (FINAL): ${egoFinalResponse.content.substring(0, 200)}...`);
      console.log('└─────────────────────────────────────────────────────────────\n');
    }
  } else {
    // Single-agent (unified) flow — run each role sequentially as before
    for (const role of agentRoles) {
      const agentConfig = applyOverride(learnerConfig.getAgentConfig(role, profile.name));
      if (!agentConfig) continue;

      let roleContext = `Topic: ${topic}\n\nRecent conversation:\n${conversationContext}\n\nThe tutor just said:\n"${tutorMessage}"`;
      roleContext += `\n\nGenerate your internal reaction as this dimension of the learner's experience.`;

      const systemPrompt = buildLearnerPrompt(agentConfig, persona, roleContext);
      const response = await callLearnerAI(agentConfig, systemPrompt, "React to the tutor's message.", `learner_${role}`);

      internalDeliberation.push({ role, content: response.content });
      recordUsage(response);
    }
  }

  // Get final message from the last deliberation step
  // For multi-agent: ego_revision. For unified: the single agent's output.
  const finalDeliberation = internalDeliberation[internalDeliberation.length - 1];
  if (!finalDeliberation) {
    // Previously this fell through to a bare TypeError on `.content`; fail with
    // an actionable message instead (happens when every role config is missing).
    throw new Error(`No learner agent produced output for profile "${profile.name}" (roles: ${agentRoles.join(', ')}) — check learner agent configuration`);
  }

  return {
    message: finalDeliberation.content,
    internalDeliberation,
    emotionalState: detectEmotionalState(internalDeliberation),
    understandingLevel: detectUnderstandingLevel(internalDeliberation),
    tokenUsage,
  };
}
1223
+
850
1224
  // ============================================================================
851
1225
  // Exports
852
1226
  // ============================================================================
853
1227
 
854
1228
  export default {
855
1229
  runInteraction,
1230
+ generateLearnerResponse,
856
1231
  INTERACTION_OUTCOMES,
857
1232
  };
@@ -0,0 +1,18 @@
1
/**
 * Process utility functions shared across eval services and CLI.
 */

/**
 * Check if a process with the given PID is still running.
 * Sends signal 0, which performs existence/permission checks without
 * actually delivering a signal.
 * @param {number} pid - Process ID to check
 * @returns {boolean|null} true if alive, false if dead, null if pid is falsy
 */
export function isPidAlive(pid) {
  if (typeof pid !== 'number' || !pid) return null;
  try {
    process.kill(pid, 0);
    return true;
  } catch (err) {
    // EPERM: the process exists but we lack permission to signal it.
    // Anything else (typically ESRCH) means the process is gone.
    return err.code === 'EPERM';
  }
}
@@ -0,0 +1,98 @@
1
+ /**
2
+ * Progress Logger — JSONL event writer for cross-process eval monitoring.
3
+ *
4
+ * One file per run at logs/eval-progress/<runId>.jsonl.
5
+ * Each line is a self-contained JSON object with timestamp + runId + eventType.
6
+ */
7
+
8
+ import fs from 'fs';
9
+ import path from 'path';
10
+ import { fileURLToPath } from 'url';
11
+
12
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
const ROOT_DIR = path.resolve(__dirname, '..');
const PROGRESS_DIR = path.join(ROOT_DIR, 'logs', 'eval-progress');

/**
 * JSONL event writer for cross-process eval monitoring.
 * One file per run at logs/eval-progress/<runId>.jsonl; each line is a
 * self-contained JSON object carrying timestamp + runId + eventType.
 */
export class ProgressLogger {
  constructor(runId) {
    this.runId = runId;
    // Create the progress directory up front so appends never fail with ENOENT.
    fs.mkdirSync(PROGRESS_DIR, { recursive: true });
    this.filePath = path.join(PROGRESS_DIR, `${runId}.jsonl`);
  }

  /** Append a single JSON line */
  writeEvent(eventType, data = {}) {
    const payload = {
      timestamp: new Date().toISOString(),
      runId: this.runId,
      eventType,
      ...data,
    };
    fs.appendFileSync(this.filePath, `${JSON.stringify(payload)}\n`);
  }

  // ── Convenience methods ──────────────────────────────────────────

  /** Emitted once when a run begins. */
  runStart({ totalTests, totalScenarios, totalConfigurations, scenarios, profiles, description }) {
    this.writeEvent('run_start', { totalTests, totalScenarios, totalConfigurations, scenarios, profiles, description });
  }

  /** Emitted when a single scenario/profile test starts. */
  testStart({ scenarioId, scenarioName, profileName }) {
    this.writeEvent('test_start', { scenarioId, scenarioName, profileName });
  }

  /** Emitted when a single test finishes (success or scored failure). */
  testComplete({ scenarioId, scenarioName, profileName, success, overallScore, baseScore, recognitionScore, latencyMs, completedCount, totalTests }) {
    this.writeEvent('test_complete', {
      scenarioId, scenarioName, profileName,
      success, overallScore, baseScore, recognitionScore, latencyMs,
      completedCount, totalTests,
    });
  }

  /** Emitted when a test aborts with an error. */
  testError({ scenarioId, scenarioName, profileName, errorMessage, completedCount, totalTests }) {
    this.writeEvent('test_error', {
      scenarioId, scenarioName, profileName, errorMessage,
      completedCount, totalTests,
    });
  }

  /** Emitted when all profiles for one scenario have run. */
  scenarioComplete({ scenarioId, scenarioName, profileNames, avgScore, completedScenarios, totalScenarios }) {
    this.writeEvent('scenario_complete', {
      scenarioId, scenarioName, profileNames, avgScore,
      completedScenarios, totalScenarios,
    });
  }

  /** Emitted once when the whole run finishes. */
  runComplete({ totalTests, successfulTests, failedTests, durationMs }) {
    this.writeEvent('run_complete', { totalTests, successfulTests, failedTests, durationMs });
  }
}

/** Resolve the JSONL path for a given runId (may not exist yet). */
export function getProgressLogPath(runId) {
  return path.join(PROGRESS_DIR, `${runId}.jsonl`);
}

/** Read all events from a JSONL progress file. Returns [] if missing. */
export function readProgressLog(runId) {
  const filePath = getProgressLogPath(runId);
  if (!fs.existsSync(filePath)) return [];
  const events = [];
  for (const line of fs.readFileSync(filePath, 'utf-8').split('\n')) {
    if (!line) continue;
    try {
      events.push(JSON.parse(line));
    } catch {
      // Skip partially written or corrupt lines rather than failing the read.
    }
  }
  return events;
}

export default { ProgressLogger, getProgressLogPath, readProgressLog };
@@ -2,18 +2,17 @@
2
2
  * Prompt Recommendation Service
3
3
  *
4
4
  * Analyzes evaluation results and generates recommendations to improve
5
- * tutor prompts. Uses a powerful evaluator model to analyze failures
5
+ * tutor prompts. Uses a powerful recommender model to analyze failures
6
6
  * and weaknesses from weaker tutor models.
7
7
  *
8
- * Evaluator configuration is loaded from config/evaluation-rubric.yaml
8
+ * Recommender configuration is loaded from config/evaluation-rubric.yaml
9
9
  * Provider details are resolved from config/providers.yaml
10
10
  */
11
11
 
12
12
  import fs from 'fs';
13
13
  import path from 'path';
14
14
  import { fileURLToPath } from 'url';
15
- import Anthropic from '@anthropic-ai/sdk';
16
- import { tutorApiService as tutorApi, tutorConfigLoader as configLoader } from '@machinespirits/tutor-core';
15
+ import * as evalConfigLoader from './evalConfigLoader.js';
17
16
 
18
17
  const __filename = fileURLToPath(import.meta.url);
19
18
  const __dirname = path.dirname(__filename);
@@ -22,12 +21,11 @@ const PROMPTS_DIR = path.join(ROOT_DIR, 'prompts');
22
21
 
23
22
  /**
24
23
  * Get recommender config, resolving model references via providers.yaml
25
- * Uses 'recommender' config from evaluation-rubric.yaml (falls back to 'evaluator')
24
+ * Uses 'recommender' config from evaluation-rubric.yaml
26
25
  */
27
- function getEvaluatorConfig() {
28
- const rubric = tutorApi.loadRubric();
29
- // Prefer 'recommender' for prompt analysis, fall back to legacy 'evaluator'
30
- const evalConfig = rubric?.recommender || rubric?.evaluator;
26
+ function getRecommenderConfig() {
27
+ const rubric = evalConfigLoader.loadRubric();
28
+ const evalConfig = rubric?.recommender;
31
29
 
32
30
  if (!evalConfig?.model) {
33
31
  console.warn('[promptRecommendation] No recommender in evaluation-rubric.yaml, using defaults');
@@ -40,7 +38,7 @@ function getEvaluatorConfig() {
40
38
 
41
39
  // Try to resolve primary model
42
40
  try {
43
- const resolved = configLoader.resolveModel(evalConfig.model);
41
+ const resolved = evalConfigLoader.resolveModel(evalConfig.model);
44
42
  if (resolved.isConfigured) {
45
43
  return {
46
44
  provider: resolved.provider,
@@ -57,7 +55,7 @@ function getEvaluatorConfig() {
57
55
  // Try fallback
58
56
  if (evalConfig.fallback?.model) {
59
57
  try {
60
- const fallback = configLoader.resolveModel(evalConfig.fallback.model);
58
+ const fallback = evalConfigLoader.resolveModel(evalConfig.fallback.model);
61
59
  if (fallback.isConfigured) {
62
60
  console.log(`[promptRecommendation] Using fallback: ${fallback.provider}/${fallback.model}`);
63
61
  return {
@@ -74,7 +72,7 @@ function getEvaluatorConfig() {
74
72
  }
75
73
 
76
74
  // Return primary anyway - will fail with helpful error
77
- const resolved = configLoader.resolveModel(evalConfig.model);
75
+ const resolved = evalConfigLoader.resolveModel(evalConfig.model);
78
76
  return {
79
77
  provider: resolved.provider,
80
78
  model: resolved.model,
@@ -157,7 +155,7 @@ function analyzeResults(results) {
157
155
  }
158
156
 
159
157
  /**
160
- * Build the analysis prompt for the evaluator
158
+ * Build the analysis prompt for the recommender
161
159
  */
162
160
  function buildAnalysisPrompt(analysis, egoPrompt, superegoPrompt, profileName) {
163
161
  const sections = [];
@@ -273,14 +271,14 @@ Be specific and actionable. Quote exact text to change when possible.
273
271
  }
274
272
 
275
273
  /**
276
- * Call the evaluator model to generate recommendations
274
+ * Call the recommender model to generate recommendations
277
275
  * Uses config from evaluation-rubric.yaml
278
276
  */
279
- async function callEvaluator(prompt, options = {}) {
277
+ async function callRecommender(prompt, options = {}) {
280
278
  const { budget = false } = options;
281
279
 
282
280
  // Get config from yaml (handles fallbacks automatically)
283
- const config = getEvaluatorConfig();
281
+ const config = getRecommenderConfig();
284
282
  const { provider, model, hyperparameters } = config;
285
283
  const maxTokens = hyperparameters?.max_tokens ?? 4000;
286
284
  const temperature = hyperparameters?.temperature ?? 0.3;
@@ -298,6 +296,13 @@ async function callEvaluator(prompt, options = {}) {
298
296
  throw new Error('ANTHROPIC_API_KEY not set');
299
297
  }
300
298
 
299
+ let Anthropic;
300
+ try {
301
+ Anthropic = (await import('@anthropic-ai/sdk')).default;
302
+ } catch {
303
+ throw new Error('@anthropic-ai/sdk is not installed. Install it to use the Anthropic provider for recommendations.');
304
+ }
305
+
301
306
  const client = new Anthropic({ apiKey });
302
307
 
303
308
  const response = await client.messages.create({
@@ -377,9 +382,9 @@ async function callOpenRouterEvaluator(prompt, model, options = {}) {
377
382
  * @param {string} options.profileName - Profile that was evaluated
378
383
  * @param {string} options.egoPromptFile - Ego prompt file to analyze
379
384
  * @param {string} options.superegoPromptFile - Superego prompt file to analyze
380
- * @param {string} options.evaluatorModel - Model to use for analysis (default: claude-sonnet-4)
381
- * @param {string} options.evaluatorProvider - Provider: 'anthropic' or 'openrouter'
382
- * @param {boolean} options.budget - Use budget evaluator model
385
+ * @param {string} options.recommenderModel - Model to use for analysis (default: claude-sonnet-4)
386
+ * @param {string} options.recommenderProvider - Provider: 'anthropic' or 'openrouter'
387
+ * @param {boolean} options.budget - Use budget recommender model
383
388
  * @returns {Promise<Object>} Recommendations
384
389
  */
385
390
  export async function generateRecommendations(options = {}) {
@@ -388,8 +393,8 @@ export async function generateRecommendations(options = {}) {
388
393
  profileName = 'unknown',
389
394
  egoPromptFile = 'tutor-ego.md',
390
395
  superegoPromptFile = 'tutor-superego.md',
391
- evaluatorModel = null,
392
- evaluatorProvider = 'anthropic',
396
+ recommenderModel = null,
397
+ recommenderProvider = 'anthropic',
393
398
  budget = false,
394
399
  } = options;
395
400
 
@@ -421,18 +426,18 @@ export async function generateRecommendations(options = {}) {
421
426
  // Build analysis prompt
422
427
  const analysisPrompt = buildAnalysisPrompt(analysis, egoPrompt, superegoPrompt, profileName);
423
428
 
424
- // Get evaluator config from yaml
425
- const evalConfig = getEvaluatorConfig();
429
+ // Get recommender config from yaml
430
+ const evalConfig = getRecommenderConfig();
426
431
  console.log(`\nGenerating recommendations using ${evalConfig.provider}/${evalConfig.model}...`);
427
432
 
428
- const evalResult = await callEvaluator(analysisPrompt);
433
+ const evalResult = await callRecommender(analysisPrompt);
429
434
 
430
435
  return {
431
436
  success: true,
432
437
  needsImprovement: true,
433
438
  analysis,
434
439
  recommendations: evalResult.content,
435
- evaluatorModel: evalResult.model,
440
+ recommenderModel: evalResult.model,
436
441
  usage: {
437
442
  inputTokens: evalResult.inputTokens,
438
443
  outputTokens: evalResult.outputTokens,
@@ -479,7 +484,7 @@ export function formatRecommendations(result) {
479
484
  lines.push(result.recommendations);
480
485
  lines.push('');
481
486
  lines.push('─'.repeat(80));
482
- lines.push(`Evaluator: ${result.evaluatorModel}`);
487
+ lines.push(`Recommender: ${result.recommenderModel}`);
483
488
  lines.push(`Tokens: ${result.usage.inputTokens} in / ${result.usage.outputTokens} out`);
484
489
  lines.push('═'.repeat(80));
485
490