@machinespirits/eval 0.1.2 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +161 -0
- package/config/eval-settings.yaml +18 -0
- package/config/evaluation-rubric-learner.yaml +277 -0
- package/config/evaluation-rubric.yaml +613 -0
- package/config/interaction-eval-scenarios.yaml +93 -50
- package/config/learner-agents.yaml +124 -193
- package/config/machinespirits-eval.code-workspace +11 -0
- package/config/providers.yaml +60 -0
- package/config/suggestion-scenarios.yaml +1399 -0
- package/config/tutor-agents.yaml +716 -0
- package/docs/EVALUATION-VARIABLES.md +589 -0
- package/docs/REPLICATION-PLAN.md +577 -0
- package/index.js +15 -6
- package/package.json +16 -22
- package/routes/evalRoutes.js +88 -36
- package/scripts/analyze-judge-reliability.js +401 -0
- package/scripts/analyze-run.js +97 -0
- package/scripts/analyze-run.mjs +282 -0
- package/scripts/analyze-validation-failures.js +141 -0
- package/scripts/check-run.mjs +17 -0
- package/scripts/code-impasse-strategies.js +1132 -0
- package/scripts/compare-runs.js +44 -0
- package/scripts/compare-suggestions.js +80 -0
- package/scripts/compare-transformation.js +116 -0
- package/scripts/dig-into-run.js +158 -0
- package/scripts/eval-cli.js +2626 -0
- package/scripts/generate-paper-figures.py +452 -0
- package/scripts/qualitative-analysis-ai.js +1313 -0
- package/scripts/qualitative-analysis.js +688 -0
- package/scripts/seed-db.js +87 -0
- package/scripts/show-failed-suggestions.js +64 -0
- package/scripts/validate-content.js +192 -0
- package/server.js +3 -2
- package/services/__tests__/evalConfigLoader.test.js +338 -0
- package/services/anovaStats.js +499 -0
- package/services/contentResolver.js +407 -0
- package/services/dialogueTraceAnalyzer.js +454 -0
- package/services/evalConfigLoader.js +625 -0
- package/services/evaluationRunner.js +2171 -270
- package/services/evaluationStore.js +564 -29
- package/services/learnerConfigLoader.js +75 -5
- package/services/learnerRubricEvaluator.js +284 -0
- package/services/learnerTutorInteractionEngine.js +375 -0
- package/services/processUtils.js +18 -0
- package/services/progressLogger.js +98 -0
- package/services/promptRecommendationService.js +31 -26
- package/services/promptRewriter.js +427 -0
- package/services/rubricEvaluator.js +543 -70
- package/services/streamingReporter.js +104 -0
- package/services/turnComparisonAnalyzer.js +494 -0
- package/components/MobileEvalDashboard.tsx +0 -267
- package/components/comparison/DeltaAnalysisTable.tsx +0 -137
- package/components/comparison/ProfileComparisonCard.tsx +0 -176
- package/components/comparison/RecognitionABMode.tsx +0 -385
- package/components/comparison/RecognitionMetricsPanel.tsx +0 -135
- package/components/comparison/WinnerIndicator.tsx +0 -64
- package/components/comparison/index.ts +0 -5
- package/components/mobile/BottomSheet.tsx +0 -233
- package/components/mobile/DimensionBreakdown.tsx +0 -210
- package/components/mobile/DocsView.tsx +0 -363
- package/components/mobile/LogsView.tsx +0 -481
- package/components/mobile/PsychodynamicQuadrant.tsx +0 -261
- package/components/mobile/QuickTestView.tsx +0 -1098
- package/components/mobile/RecognitionTypeChart.tsx +0 -124
- package/components/mobile/RecognitionView.tsx +0 -809
- package/components/mobile/RunDetailView.tsx +0 -261
- package/components/mobile/RunHistoryView.tsx +0 -367
- package/components/mobile/ScoreRadial.tsx +0 -211
- package/components/mobile/StreamingLogPanel.tsx +0 -230
- package/components/mobile/SynthesisStrategyChart.tsx +0 -140
- package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +0 -52
- package/docs/research/ABLATION-MODEL-SELECTION.md +0 -53
- package/docs/research/ADVANCED-EVAL-ANALYSIS.md +0 -60
- package/docs/research/ANOVA-RESULTS-2026-01-14.md +0 -257
- package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +0 -586
- package/docs/research/COST-ANALYSIS.md +0 -56
- package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +0 -340
- package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +0 -291
- package/docs/research/EVAL-SYSTEM-ANALYSIS.md +0 -306
- package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +0 -301
- package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +0 -1988
- package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +0 -282
- package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +0 -147
- package/docs/research/PAPER-EXTENSION-DYADIC.md +0 -204
- package/docs/research/PAPER-UNIFIED.md +0 -659
- package/docs/research/PAPER-UNIFIED.pdf +0 -0
- package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +0 -356
- package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +0 -419
- package/docs/research/apa.csl +0 -2133
- package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +0 -1637
- package/docs/research/archive/paper-multiagent-tutor.tex +0 -978
- package/docs/research/paper-draft/full-paper.md +0 -136
- package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
- package/docs/research/paper-draft/references.bib +0 -515
- package/docs/research/transcript-baseline.md +0 -139
- package/docs/research/transcript-recognition-multiagent.md +0 -187
- package/hooks/useEvalData.ts +0 -625
- package/server-init.js +0 -45
- package/services/benchmarkService.js +0 -1892
- package/types.ts +0 -165
- package/utils/haptics.ts +0 -45
|
@@ -847,11 +847,386 @@ function calculateMemoryDelta(before, after) {
|
|
|
847
847
|
};
|
|
848
848
|
}
|
|
849
849
|
|
|
850
|
+
// ============================================================================
|
|
851
|
+
// Standalone Learner Response (for evaluation pipeline)
|
|
852
|
+
// ============================================================================
|
|
853
|
+
|
|
854
|
+
// Retry delays for 429 rate limits (matches evaluationRunner pattern)
|
|
855
|
+
const LEARNER_RETRY_DELAYS = [2000, 4000, 8000];
|
|
856
|
+
|
|
857
|
+
/**
 * Call the LLM for a learner agent, retrying when the provider reports a
 * rate limit.
 *
 * Every attempt is delegated to _callLearnerAIOnce. When an attempt fails with
 * a rate-limit error the call is retried after the next delay listed in
 * LEARNER_RETRY_DELAYS. Any other error, or a rate-limit error once all delays
 * have been used, is rethrown unchanged.
 *
 * @param {Object} agentConfig - From learnerConfig.getAgentConfig()
 * @param {string} systemPrompt - Static system/persona prompt (cacheable)
 * @param {string} userPrompt - Dynamic per-call user content
 * @param {string} agentRole - For logging (e.g. 'ego', 'superego', 'synthesis')
 * @returns {Promise<Object>} { content, usage: { inputTokens, outputTokens }, latencyMs }
 * @throws {Error} The error of the last failed attempt.
 */
async function callLearnerAI(agentConfig, systemPrompt, userPrompt, agentRole = 'learner') {
  const maxRetries = LEARNER_RETRY_DELAYS.length;

  // Match "429" only as a standalone token: a plain substring test would also
  // retry unrelated failures whose message merely contains those digits
  // (e.g. "400 - prompt is 14290 tokens").
  // NOTE(review): the numeric `status` check is for SDK errors that expose the
  // HTTP status as a property — confirm against the SDKs in use.
  const isRateLimitError = (error) => {
    if (error?.status === 429) return true;
    const message = typeof error?.message === 'string' ? error.message : '';
    return /\b429\b/.test(message) || message.toLowerCase().includes('rate limit');
  };

  for (let attempt = 0; ; attempt++) {
    try {
      return await _callLearnerAIOnce(agentConfig, systemPrompt, userPrompt, agentRole);
    } catch (error) {
      if (attempt >= maxRetries || !isRateLimitError(error)) throw error;

      const delay = LEARNER_RETRY_DELAYS[attempt];
      console.warn(`[${agentRole}] Rate limit hit, retrying in ${delay}ms (attempt ${attempt + 1}/${maxRetries})`);
      await new Promise((resolve) => setTimeout(resolve, delay));
    }
  }
}
|
|
888
|
+
|
|
889
|
+
/**
 * POST a JSON payload and return the parsed JSON response body.
 * On a non-2xx status, throws `"<errorLabel>: <status> - <detail>"`, where
 * detail is the response's `error.message` or `fallbackDetail`.
 */
async function _postLearnerJson(url, extraHeaders, payload, errorLabel, fallbackDetail = 'Unknown error') {
  const res = await fetch(url, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json', ...extraHeaders },
    body: JSON.stringify(payload),
  });

  if (!res.ok) {
    // The error body is best-effort: if it is not JSON, use the fallback detail.
    const errorBody = await res.json().catch(() => ({}));
    throw new Error(`${errorLabel}: ${res.status} - ${errorBody?.error?.message || fallbackDetail}`);
  }

  return res.json();
}

/** Extract the trimmed assistant text from a chat-completions style response. */
function _chatCompletionText(data) {
  return data?.choices?.[0]?.message?.content?.trim() || '';
}

/** Map chat-completions style usage counters onto { inputTokens, outputTokens }. */
function _chatCompletionUsage(data) {
  return {
    inputTokens: data?.usage?.prompt_tokens || 0,
    outputTokens: data?.usage?.completion_tokens || 0,
  };
}

/**
 * Read the reply text from a Gemini result. Accepts `text` as either a string
 * property or a method, on the result itself or on `result.response`, so a
 * string-valued `text` no longer throws "text is not a function".
 */
function _geminiText(result) {
  for (const holder of [result, result?.response]) {
    const text = holder?.text;
    const value = typeof text === 'function' ? text.call(holder) : text;
    if (typeof value === 'string' && value) return value;
  }
  return '';
}

/**
 * Single-attempt LLM call for a learner agent (no retry).
 *
 * Dispatches on `agentConfig.provider` ('anthropic', 'openai', 'openrouter',
 * 'gemini', 'local') and normalizes every provider's reply to the same shape.
 * System and user prompts are sent separately.
 *
 * @param {Object} agentConfig - { provider, providerConfig, model, hyperparameters }
 * @param {string} systemPrompt - System/persona prompt
 * @param {string} userPrompt - Per-call user content
 * @param {string} agentRole - Label used in log output
 * @returns {Promise<Object>} { content, usage: { inputTokens, outputTokens }, latencyMs }
 * @throws {Error} If the provider is not configured, is unsupported, or the
 *   provider responds with a non-2xx status.
 */
async function _callLearnerAIOnce(agentConfig, systemPrompt, userPrompt, agentRole) {
  const { provider, providerConfig, model, hyperparameters = {} } = agentConfig;
  const { temperature = 0.7, top_p } = hyperparameters;
  let { max_tokens = 300 } = hyperparameters;

  // Thinking models (kimi-k2.5, deepseek-r1, etc.) use reasoning tokens that consume
  // the max_tokens budget. Raise the floor so both reasoning and output fit.
  const isThinkingModel = model?.includes('kimi-k2') || model?.includes('deepseek-r1');
  if (isThinkingModel && max_tokens < 2000) {
    max_tokens = 2000;
  }

  if (!providerConfig?.isConfigured) {
    throw new Error(`Learner provider ${provider} not configured (missing API key)`);
  }

  const startTime = Date.now();
  const chatMessages = [
    { role: 'system', content: systemPrompt },
    { role: 'user', content: userPrompt },
  ];

  switch (provider) {
    case 'anthropic': {
      const bodyParams = {
        model,
        max_tokens,
        temperature,
        system: systemPrompt,
        messages: [{ role: 'user', content: userPrompt }],
      };
      // When top_p is configured it replaces temperature instead of accompanying it.
      if (top_p !== undefined) {
        delete bodyParams.temperature;
        bodyParams.top_p = top_p;
      }

      const data = await _postLearnerJson(
        providerConfig.base_url,
        { 'x-api-key': providerConfig.apiKey, 'anthropic-version': '2023-06-01' },
        bodyParams,
        'Anthropic API error',
      );
      return {
        content: data?.content?.[0]?.text?.trim() || '',
        usage: {
          inputTokens: data?.usage?.input_tokens || 0,
          outputTokens: data?.usage?.output_tokens || 0,
        },
        latencyMs: Date.now() - startTime,
      };
    }

    case 'openai': {
      const data = await _postLearnerJson(
        providerConfig.base_url,
        { Authorization: `Bearer ${providerConfig.apiKey}` },
        { model, temperature, max_tokens, top_p, messages: chatMessages },
        'OpenAI API error',
      );
      return {
        content: _chatCompletionText(data),
        usage: _chatCompletionUsage(data),
        latencyMs: Date.now() - startTime,
      };
    }

    case 'openrouter': {
      const data = await _postLearnerJson(
        providerConfig.base_url,
        {
          Authorization: `Bearer ${providerConfig.apiKey}`,
          'HTTP-Referer': process.env.OPENROUTER_REFERER || 'https://machine-spirits.com',
          'X-Title': 'Machine Spirits Tutor',
        },
        { model, temperature, max_tokens, top_p, messages: chatMessages },
        'OpenRouter API error',
      );
      const content = _chatCompletionText(data);

      // An empty reply is surfaced in the log together with the finish reason.
      if (!content) {
        console.warn(`[${agentRole}] OpenRouter returned empty content. Model: ${model}, finish_reason: ${data?.choices?.[0]?.finish_reason}`);
      }

      return {
        content,
        usage: _chatCompletionUsage(data),
        latencyMs: Date.now() - startTime,
      };
    }

    case 'gemini': {
      const { GoogleGenAI } = await import('@google/genai');
      const gemini = new GoogleGenAI({ apiKey: providerConfig.apiKey });

      // systemInstruction belongs inside `config` per the @google/genai SDK docs.
      const result = await gemini.models.generateContent({
        model,
        contents: [{ role: 'user', parts: [{ text: userPrompt }] }],
        config: {
          systemInstruction: systemPrompt,
          temperature,
          maxOutputTokens: max_tokens,
          topP: top_p,
        },
      });

      return {
        content: _geminiText(result),
        usage: {
          inputTokens: result?.usageMetadata?.promptTokenCount || 0,
          outputTokens: result?.usageMetadata?.candidatesTokenCount || 0,
        },
        latencyMs: Date.now() - startTime,
      };
    }

    // Local (LM Studio / Ollama / llama.cpp): no auth header, no top_p.
    case 'local': {
      const data = await _postLearnerJson(
        providerConfig.base_url,
        {},
        { model, temperature, max_tokens, messages: chatMessages },
        'Local LLM error',
        'Is LM Studio running?',
      );
      return {
        content: _chatCompletionText(data),
        usage: _chatCompletionUsage(data),
        latencyMs: Date.now() - startTime,
      };
    }

    default:
      throw new Error(`Unsupported learner provider: ${provider}`);
  }
}
|
|
1085
|
+
|
|
1086
|
+
/**
 * Generate a single learner response for use by the evaluation pipeline.
 *
 * When the profile's agent roles include both 'ego' and 'superego', three LLM
 * calls run in sequence: ego initial reaction → superego critique → ego
 * revision (the ego has final authority over what is said). Otherwise each
 * role of the profile is called once, in order. The returned message is the
 * content of the last deliberation step.
 *
 * All LLM calls go through callLearnerAI.
 *
 * @param {Object} options
 * @param {string} options.tutorMessage - The tutor's message to respond to
 * @param {string} options.topic - Current topic
 * @param {Array} options.conversationHistory - [{role, content}, ...]; only the last 6 entries are used
 * @param {string} options.learnerProfile - Profile name ('ego_superego' or 'unified')
 * @param {string} options.personaId - Persona identifier (default: 'eager_novice')
 * @param {string|Object} [options.modelOverride] - Optional model override (e.g. 'openrouter.nemotron') applied to all learner agents
 * @returns {Promise<Object>} { message, internalDeliberation, emotionalState, understandingLevel, tokenUsage }
 * @throws {Error} If the profile yields no deliberation step at all (no roles,
 *   or no role with an agent configuration), or if an LLM call fails.
 */
export async function generateLearnerResponse(options) {
  const {
    tutorMessage,
    topic,
    conversationHistory = [],
    learnerProfile = 'unified',
    personaId = 'eager_novice',
    modelOverride,
  } = options;

  // Resolve the model override once (if provided) so all learner agents use the same model.
  let resolvedOverride = null;
  if (modelOverride) {
    const resolved = learnerConfig.resolveModel(modelOverride);
    const providerConfig = learnerConfig.getProviderConfig(resolved.provider);
    resolvedOverride = {
      provider: resolved.provider,
      providerConfig,
      // Map the alias to the provider's full model id when one is listed.
      model: providerConfig.models?.[resolved.model] || resolved.model,
      modelAlias: resolved.model,
    };
  }
  const applyOverride = (cfg) => (resolvedOverride && cfg ? { ...cfg, ...resolvedOverride } : cfg);

  const persona = learnerConfig.getPersona(personaId);
  const profile = learnerConfig.getActiveProfile(learnerProfile);
  const agentRoles = learnerConfig.getProfileAgentRoles(profile.name);
  const internalDeliberation = [];
  const tokenUsage = { inputTokens: 0, outputTokens: 0, apiCalls: 0 };

  // Conversation context string built from the six most recent history entries.
  const conversationContext = conversationHistory
    .slice(-6)
    .map((m) => `${m.role.toUpperCase()}: ${m.content}`)
    .join('\n\n');

  // Opening shared by every role's context.
  const sharedContext = `Topic: ${topic}\n\nRecent conversation:\n${conversationContext}\n\nThe tutor just said:\n"${tutorMessage}"`;

  // Run one deliberation step: build the prompt, call the model, then record
  // the content and token usage. Returns the raw model response.
  const runStep = async (config, context, instruction, logLabel, deliberationRole) => {
    const systemPrompt = buildLearnerPrompt(config, persona, context);
    const response = await callLearnerAI(config, systemPrompt, instruction, logLabel);

    internalDeliberation.push({ role: deliberationRole, content: response.content });
    tokenUsage.inputTokens += response.usage?.inputTokens || 0;
    tokenUsage.outputTokens += response.usage?.outputTokens || 0;
    tokenUsage.apiCalls++;
    return response;
  };

  // Psychodynamic flow: Ego (initial) → Superego (critique) → Ego (revision/final).
  const hasMultiAgent = agentRoles.includes('ego') && agentRoles.includes('superego');

  if (hasMultiAgent) {
    // === STEP 1: Ego initial reaction ===
    const egoConfig = applyOverride(learnerConfig.getAgentConfig('ego', profile.name));
    const egoInitialResponse = await runStep(
      egoConfig,
      `${sharedContext}\n\nGenerate your initial internal reaction as the learner's ego.`,
      "React to the tutor's message.",
      'learner_ego_initial',
      'ego_initial',
    );

    // === STEP 2: Superego critique ===
    const superegoConfig = applyOverride(learnerConfig.getAgentConfig('superego', profile.name));
    const superegoResponse = await runStep(
      superegoConfig,
      `${sharedContext}\n\nThe EGO's initial reaction was:\n"${egoInitialResponse.content}"\n\nReview the EGO's response. Is it accurate? What's being missed? What should be reconsidered?`,
      "Critique the EGO's reaction.",
      'learner_superego',
      'superego',
    );

    // === STEP 3: Ego revision (final authority) ===
    // The ego considers the superego's feedback and decides what to actually say.
    const egoFinalResponse = await runStep(
      egoConfig,
      `${sharedContext}\n\nYour initial reaction was:\n"${egoInitialResponse.content}"\n\nThe SUPEREGO's critique:\n"${superegoResponse.content}"\n\nConsider the superego's feedback. You have final authority — accept, reject, or modify its suggestions as you see fit. Then produce a realistic external response (1-4 sentences) that the learner would actually say to the tutor.`,
      'Produce your final response to the tutor.',
      'learner_ego_revision',
      'ego_revision',
    );

    // Log deliberation for debugging/analysis
    if (process.env.LEARNER_DEBUG) {
      console.log('\n┌─────────────────────────────────────────────────────────────');
      console.log('│ LEARNER DELIBERATION (ego→superego→ego_revision)');
      console.log('├─────────────────────────────────────────────────────────────');
      console.log(`│ EGO INITIAL: ${egoInitialResponse.content.substring(0, 200)}...`);
      console.log('├─────────────────────────────────────────────────────────────');
      console.log(`│ SUPEREGO: ${superegoResponse.content.substring(0, 200)}...`);
      console.log('├─────────────────────────────────────────────────────────────');
      console.log(`│ EGO REVISION (FINAL): ${egoFinalResponse.content.substring(0, 200)}...`);
      console.log('└─────────────────────────────────────────────────────────────\n');
    }
  } else {
    // Single-agent (unified) flow — each configured role is called once, in order.
    for (const role of agentRoles) {
      const agentConfig = applyOverride(learnerConfig.getAgentConfig(role, profile.name));
      if (!agentConfig) continue; // roles without a configuration are skipped

      await runStep(
        agentConfig,
        `${sharedContext}\n\nGenerate your internal reaction as this dimension of the learner's experience.`,
        "React to the tutor's message.",
        `learner_${role}`,
        role,
      );
    }
  }

  // Without any deliberation step there is no message to return; fail with a
  // clear error instead of a TypeError on an undefined entry.
  if (internalDeliberation.length === 0) {
    throw new Error(`Learner profile "${profile.name}" produced no response: no agent role with a usable configuration`);
  }

  // Multi-agent: the ego revision. Unified: the last agent's output.
  const finalDeliberation = internalDeliberation[internalDeliberation.length - 1];

  return {
    message: finalDeliberation.content,
    internalDeliberation,
    emotionalState: detectEmotionalState(internalDeliberation),
    understandingLevel: detectUnderstandingLevel(internalDeliberation),
    tokenUsage,
  };
}
|
|
1223
|
+
|
|
850
1224
|
// ============================================================================
|
|
851
1225
|
// Exports
|
|
852
1226
|
// ============================================================================
|
|
853
1227
|
|
|
854
1228
|
export default {
|
|
855
1229
|
runInteraction,
|
|
1230
|
+
generateLearnerResponse,
|
|
856
1231
|
INTERACTION_OUTCOMES,
|
|
857
1232
|
};
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Process utility functions shared across eval services and CLI.
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
/**
 * Check if a process with the given PID is still running.
 *
 * Only positive integers are probed. Zero and negative values are rejected
 * because process.kill() does not treat them as a single process id, so the
 * result would not describe one process.
 *
 * @param {number} pid - Process ID to check
 * @returns {boolean|null} true if alive, false if dead, null if pid is not a
 *   positive integer (including falsy values and non-numbers)
 */
export function isPidAlive(pid) {
  if (typeof pid !== 'number' || !Number.isInteger(pid) || pid <= 0) return null;

  try {
    process.kill(pid, 0); // Signal 0 = check existence without killing
    return true;
  } catch (e) {
    // EPERM means the process exists but we are not allowed to signal it.
    return e.code === 'EPERM';
  }
}
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Progress Logger — JSONL event writer for cross-process eval monitoring.
|
|
3
|
+
*
|
|
4
|
+
* One file per run at logs/eval-progress/<runId>.jsonl.
|
|
5
|
+
* Each line is a self-contained JSON object with timestamp + runId + eventType.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import fs from 'fs';
|
|
9
|
+
import path from 'path';
|
|
10
|
+
import { fileURLToPath } from 'url';
|
|
11
|
+
|
|
12
|
+
const __filename = fileURLToPath(import.meta.url);
|
|
13
|
+
const __dirname = path.dirname(__filename);
|
|
14
|
+
const ROOT_DIR = path.resolve(__dirname, '..');
|
|
15
|
+
const PROGRESS_DIR = path.join(ROOT_DIR, 'logs', 'eval-progress');
|
|
16
|
+
|
|
17
|
+
/**
 * Appends progress events for one evaluation run to
 * `<PROGRESS_DIR>/<runId>.jsonl`, one JSON object per line.
 */
export class ProgressLogger {
  /**
   * @param {string} runId - Run identifier; also used as the log file's base name
   */
  constructor(runId) {
    this.runId = runId;
    // Ensure directory exists
    fs.mkdirSync(PROGRESS_DIR, { recursive: true });
    this.filePath = path.join(PROGRESS_DIR, `${runId}.jsonl`);
  }

  /**
   * Append a single JSON line.
   *
   * Every line carries `timestamp`, `runId` and `eventType`. These envelope
   * fields always win over same-named keys in `data`, so a payload cannot
   * overwrite them.
   *
   * @param {string} eventType - Event name (e.g. 'run_start')
   * @param {Object} [data] - Extra fields merged into the event
   */
  writeEvent(eventType, data = {}) {
    const envelope = {
      timestamp: new Date().toISOString(),
      runId: this.runId,
      eventType,
    };
    // Spreading the envelope first keeps its keys at the front of the line;
    // spreading it again last restores any value the payload replaced.
    const event = { ...envelope, ...data, ...envelope };
    fs.appendFileSync(this.filePath, `${JSON.stringify(event)}\n`);
  }

  // ── Convenience methods ──────────────────────────────────────────
  // Each one writes a fixed event type and forwards only the listed fields.

  /** Write a 'run_start' event. */
  runStart({ totalTests, totalScenarios, totalConfigurations, scenarios, profiles, description }) {
    this.writeEvent('run_start', {
      totalTests,
      totalScenarios,
      totalConfigurations,
      scenarios,
      profiles,
      description,
    });
  }

  /** Write a 'test_start' event. */
  testStart({ scenarioId, scenarioName, profileName }) {
    this.writeEvent('test_start', { scenarioId, scenarioName, profileName });
  }

  /** Write a 'test_complete' event. */
  testComplete({ scenarioId, scenarioName, profileName, success, overallScore, baseScore, recognitionScore, latencyMs, completedCount, totalTests }) {
    this.writeEvent('test_complete', {
      scenarioId,
      scenarioName,
      profileName,
      success,
      overallScore,
      baseScore,
      recognitionScore,
      latencyMs,
      completedCount,
      totalTests,
    });
  }

  /** Write a 'test_error' event. */
  testError({ scenarioId, scenarioName, profileName, errorMessage, completedCount, totalTests }) {
    this.writeEvent('test_error', {
      scenarioId,
      scenarioName,
      profileName,
      errorMessage,
      completedCount,
      totalTests,
    });
  }

  /** Write a 'scenario_complete' event. */
  scenarioComplete({ scenarioId, scenarioName, profileNames, avgScore, completedScenarios, totalScenarios }) {
    this.writeEvent('scenario_complete', {
      scenarioId,
      scenarioName,
      profileNames,
      avgScore,
      completedScenarios,
      totalScenarios,
    });
  }

  /** Write a 'run_complete' event. */
  runComplete({ totalTests, successfulTests, failedTests, durationMs }) {
    this.writeEvent('run_complete', {
      totalTests,
      successfulTests,
      failedTests,
      durationMs,
    });
  }
}
|
|
81
|
+
|
|
82
|
+
/**
 * Build the path of the JSONL progress file for a run.
 * The path is only computed; the file itself may not exist yet.
 *
 * @param {string} runId - Run identifier
 * @returns {string} `<PROGRESS_DIR>/<runId>.jsonl`
 */
export function getProgressLogPath(runId) {
  const fileName = `${runId}.jsonl`;
  return path.join(PROGRESS_DIR, fileName);
}
|
|
86
|
+
|
|
87
|
+
/**
 * Read all events from a run's JSONL progress file.
 *
 * The file is read in a single step rather than checked for existence first,
 * so a file that appears or disappears between the two calls cannot cause a
 * spurious failure. Blank lines and lines that are not valid JSON (for example
 * a line still being written by another process) are skipped.
 *
 * @param {string} runId - Run identifier
 * @returns {Object[]} Parsed events in file order; [] if the file is missing
 * @throws {Error} Any read error other than "file not found"
 */
export function readProgressLog(runId) {
  const filePath = path.join(PROGRESS_DIR, `${runId}.jsonl`);

  let raw;
  try {
    raw = fs.readFileSync(filePath, 'utf-8');
  } catch (err) {
    if (err.code === 'ENOENT') return [];
    throw err;
  }

  const events = [];
  for (const line of raw.split('\n')) {
    if (!line) continue;
    try {
      const parsed = JSON.parse(line);
      // Falsy JSON values (null, 0, false, "") are not events; drop them.
      if (parsed) events.push(parsed);
    } catch {
      // Unparseable line: skip it and keep the rest of the log readable.
    }
  }
  return events;
}
|
|
97
|
+
|
|
98
|
+
export default { ProgressLogger, getProgressLogPath, readProgressLog };
|
|
@@ -2,18 +2,17 @@
|
|
|
2
2
|
* Prompt Recommendation Service
|
|
3
3
|
*
|
|
4
4
|
* Analyzes evaluation results and generates recommendations to improve
|
|
5
|
-
* tutor prompts. Uses a powerful
|
|
5
|
+
* tutor prompts. Uses a powerful recommender model to analyze failures
|
|
6
6
|
* and weaknesses from weaker tutor models.
|
|
7
7
|
*
|
|
8
|
-
*
|
|
8
|
+
* Recommender configuration is loaded from config/evaluation-rubric.yaml
|
|
9
9
|
* Provider details are resolved from config/providers.yaml
|
|
10
10
|
*/
|
|
11
11
|
|
|
12
12
|
import fs from 'fs';
|
|
13
13
|
import path from 'path';
|
|
14
14
|
import { fileURLToPath } from 'url';
|
|
15
|
-
import
|
|
16
|
-
import { tutorApiService as tutorApi, tutorConfigLoader as configLoader } from '@machinespirits/tutor-core';
|
|
15
|
+
import * as evalConfigLoader from './evalConfigLoader.js';
|
|
17
16
|
|
|
18
17
|
const __filename = fileURLToPath(import.meta.url);
|
|
19
18
|
const __dirname = path.dirname(__filename);
|
|
@@ -22,12 +21,11 @@ const PROMPTS_DIR = path.join(ROOT_DIR, 'prompts');
|
|
|
22
21
|
|
|
23
22
|
/**
|
|
24
23
|
* Get recommender config, resolving model references via providers.yaml
|
|
25
|
-
* Uses 'recommender' config from evaluation-rubric.yaml
|
|
24
|
+
* Uses 'recommender' config from evaluation-rubric.yaml
|
|
26
25
|
*/
|
|
27
|
-
function
|
|
28
|
-
const rubric =
|
|
29
|
-
|
|
30
|
-
const evalConfig = rubric?.recommender || rubric?.evaluator;
|
|
26
|
+
function getRecommenderConfig() {
|
|
27
|
+
const rubric = evalConfigLoader.loadRubric();
|
|
28
|
+
const evalConfig = rubric?.recommender;
|
|
31
29
|
|
|
32
30
|
if (!evalConfig?.model) {
|
|
33
31
|
console.warn('[promptRecommendation] No recommender in evaluation-rubric.yaml, using defaults');
|
|
@@ -40,7 +38,7 @@ function getEvaluatorConfig() {
|
|
|
40
38
|
|
|
41
39
|
// Try to resolve primary model
|
|
42
40
|
try {
|
|
43
|
-
const resolved =
|
|
41
|
+
const resolved = evalConfigLoader.resolveModel(evalConfig.model);
|
|
44
42
|
if (resolved.isConfigured) {
|
|
45
43
|
return {
|
|
46
44
|
provider: resolved.provider,
|
|
@@ -57,7 +55,7 @@ function getEvaluatorConfig() {
|
|
|
57
55
|
// Try fallback
|
|
58
56
|
if (evalConfig.fallback?.model) {
|
|
59
57
|
try {
|
|
60
|
-
const fallback =
|
|
58
|
+
const fallback = evalConfigLoader.resolveModel(evalConfig.fallback.model);
|
|
61
59
|
if (fallback.isConfigured) {
|
|
62
60
|
console.log(`[promptRecommendation] Using fallback: ${fallback.provider}/${fallback.model}`);
|
|
63
61
|
return {
|
|
@@ -74,7 +72,7 @@ function getEvaluatorConfig() {
|
|
|
74
72
|
}
|
|
75
73
|
|
|
76
74
|
// Return primary anyway - will fail with helpful error
|
|
77
|
-
const resolved =
|
|
75
|
+
const resolved = evalConfigLoader.resolveModel(evalConfig.model);
|
|
78
76
|
return {
|
|
79
77
|
provider: resolved.provider,
|
|
80
78
|
model: resolved.model,
|
|
@@ -157,7 +155,7 @@ function analyzeResults(results) {
|
|
|
157
155
|
}
|
|
158
156
|
|
|
159
157
|
/**
|
|
160
|
-
* Build the analysis prompt for the
|
|
158
|
+
* Build the analysis prompt for the recommender
|
|
161
159
|
*/
|
|
162
160
|
function buildAnalysisPrompt(analysis, egoPrompt, superegoPrompt, profileName) {
|
|
163
161
|
const sections = [];
|
|
@@ -273,14 +271,14 @@ Be specific and actionable. Quote exact text to change when possible.
|
|
|
273
271
|
}
|
|
274
272
|
|
|
275
273
|
/**
|
|
276
|
-
* Call the
|
|
274
|
+
* Call the recommender model to generate recommendations
|
|
277
275
|
* Uses config from evaluation-rubric.yaml
|
|
278
276
|
*/
|
|
279
|
-
async function
|
|
277
|
+
async function callRecommender(prompt, options = {}) {
|
|
280
278
|
const { budget = false } = options;
|
|
281
279
|
|
|
282
280
|
// Get config from yaml (handles fallbacks automatically)
|
|
283
|
-
const config =
|
|
281
|
+
const config = getRecommenderConfig();
|
|
284
282
|
const { provider, model, hyperparameters } = config;
|
|
285
283
|
const maxTokens = hyperparameters?.max_tokens ?? 4000;
|
|
286
284
|
const temperature = hyperparameters?.temperature ?? 0.3;
|
|
@@ -298,6 +296,13 @@ async function callEvaluator(prompt, options = {}) {
|
|
|
298
296
|
throw new Error('ANTHROPIC_API_KEY not set');
|
|
299
297
|
}
|
|
300
298
|
|
|
299
|
+
let Anthropic;
|
|
300
|
+
try {
|
|
301
|
+
Anthropic = (await import('@anthropic-ai/sdk')).default;
|
|
302
|
+
} catch {
|
|
303
|
+
throw new Error('@anthropic-ai/sdk is not installed. Install it to use the Anthropic provider for recommendations.');
|
|
304
|
+
}
|
|
305
|
+
|
|
301
306
|
const client = new Anthropic({ apiKey });
|
|
302
307
|
|
|
303
308
|
const response = await client.messages.create({
|
|
@@ -377,9 +382,9 @@ async function callOpenRouterEvaluator(prompt, model, options = {}) {
|
|
|
377
382
|
* @param {string} options.profileName - Profile that was evaluated
|
|
378
383
|
* @param {string} options.egoPromptFile - Ego prompt file to analyze
|
|
379
384
|
* @param {string} options.superegoPromptFile - Superego prompt file to analyze
|
|
380
|
-
* @param {string} options.
|
|
381
|
-
* @param {string} options.
|
|
382
|
-
* @param {boolean} options.budget - Use budget
|
|
385
|
+
* @param {string} options.recommenderModel - Model to use for analysis (default: claude-sonnet-4)
|
|
386
|
+
* @param {string} options.recommenderProvider - Provider: 'anthropic' or 'openrouter'
|
|
387
|
+
* @param {boolean} options.budget - Use budget recommender model
|
|
383
388
|
* @returns {Promise<Object>} Recommendations
|
|
384
389
|
*/
|
|
385
390
|
export async function generateRecommendations(options = {}) {
|
|
@@ -388,8 +393,8 @@ export async function generateRecommendations(options = {}) {
|
|
|
388
393
|
profileName = 'unknown',
|
|
389
394
|
egoPromptFile = 'tutor-ego.md',
|
|
390
395
|
superegoPromptFile = 'tutor-superego.md',
|
|
391
|
-
|
|
392
|
-
|
|
396
|
+
recommenderModel = null,
|
|
397
|
+
recommenderProvider = 'anthropic',
|
|
393
398
|
budget = false,
|
|
394
399
|
} = options;
|
|
395
400
|
|
|
@@ -421,18 +426,18 @@ export async function generateRecommendations(options = {}) {
|
|
|
421
426
|
// Build analysis prompt
|
|
422
427
|
const analysisPrompt = buildAnalysisPrompt(analysis, egoPrompt, superegoPrompt, profileName);
|
|
423
428
|
|
|
424
|
-
// Get
|
|
425
|
-
const evalConfig =
|
|
429
|
+
// Get recommender config from yaml
|
|
430
|
+
const evalConfig = getRecommenderConfig();
|
|
426
431
|
console.log(`\nGenerating recommendations using ${evalConfig.provider}/${evalConfig.model}...`);
|
|
427
432
|
|
|
428
|
-
const evalResult = await
|
|
433
|
+
const evalResult = await callRecommender(analysisPrompt);
|
|
429
434
|
|
|
430
435
|
return {
|
|
431
436
|
success: true,
|
|
432
437
|
needsImprovement: true,
|
|
433
438
|
analysis,
|
|
434
439
|
recommendations: evalResult.content,
|
|
435
|
-
|
|
440
|
+
recommenderModel: evalResult.model,
|
|
436
441
|
usage: {
|
|
437
442
|
inputTokens: evalResult.inputTokens,
|
|
438
443
|
outputTokens: evalResult.outputTokens,
|
|
@@ -479,7 +484,7 @@ export function formatRecommendations(result) {
|
|
|
479
484
|
lines.push(result.recommendations);
|
|
480
485
|
lines.push('');
|
|
481
486
|
lines.push('─'.repeat(80));
|
|
482
|
-
lines.push(`
|
|
487
|
+
lines.push(`Recommender: ${result.recommenderModel}`);
|
|
483
488
|
lines.push(`Tokens: ${result.usage.inputTokens} in / ${result.usage.outputTokens} out`);
|
|
484
489
|
lines.push('═'.repeat(80));
|
|
485
490
|
|