@machinespirits/eval 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +91 -9
- package/config/eval-settings.yaml +3 -3
- package/config/paper-manifest.json +486 -0
- package/config/providers.yaml +9 -6
- package/config/tutor-agents.yaml +2261 -0
- package/content/README.md +23 -0
- package/content/courses/479/course.md +53 -0
- package/content/courses/479/lecture-1.md +361 -0
- package/content/courses/479/lecture-2.md +360 -0
- package/content/courses/479/lecture-3.md +655 -0
- package/content/courses/479/lecture-4.md +530 -0
- package/content/courses/479/lecture-5.md +326 -0
- package/content/courses/479/lecture-6.md +346 -0
- package/content/courses/479/lecture-7.md +326 -0
- package/content/courses/479/lecture-8.md +273 -0
- package/content/courses/479/roadmap-slides.md +656 -0
- package/content/manifest.yaml +8 -0
- package/docs/research/build.sh +44 -20
- package/docs/research/figures/figure10.png +0 -0
- package/docs/research/figures/figure11.png +0 -0
- package/docs/research/figures/figure3.png +0 -0
- package/docs/research/figures/figure4.png +0 -0
- package/docs/research/figures/figure5.png +0 -0
- package/docs/research/figures/figure6.png +0 -0
- package/docs/research/figures/figure7.png +0 -0
- package/docs/research/figures/figure8.png +0 -0
- package/docs/research/figures/figure9.png +0 -0
- package/docs/research/header.tex +23 -2
- package/docs/research/paper-full.md +941 -285
- package/docs/research/paper-short.md +216 -585
- package/docs/research/references.bib +132 -0
- package/docs/research/slides-header.tex +188 -0
- package/docs/research/slides-pptx.md +363 -0
- package/docs/research/slides.md +531 -0
- package/docs/research/style-reference-pptx.py +199 -0
- package/package.json +6 -5
- package/scripts/analyze-eval-results.js +69 -17
- package/scripts/analyze-mechanism-traces.js +763 -0
- package/scripts/analyze-modulation-learning.js +498 -0
- package/scripts/analyze-prosthesis.js +144 -0
- package/scripts/analyze-run.js +264 -79
- package/scripts/assess-transcripts.js +853 -0
- package/scripts/browse-transcripts.js +854 -0
- package/scripts/check-parse-failures.js +73 -0
- package/scripts/code-dialectical-modulation.js +1320 -0
- package/scripts/download-data.sh +55 -0
- package/scripts/eval-cli.js +106 -18
- package/scripts/generate-paper-figures.js +663 -0
- package/scripts/generate-paper-figures.py +577 -76
- package/scripts/generate-paper-tables.js +299 -0
- package/scripts/qualitative-analysis-ai.js +3 -3
- package/scripts/render-sequence-diagram.js +694 -0
- package/scripts/test-latency.js +210 -0
- package/scripts/test-rate-limit.js +95 -0
- package/scripts/test-token-budget.js +332 -0
- package/scripts/validate-paper-manifest.js +670 -0
- package/services/__tests__/evalConfigLoader.test.js +2 -2
- package/services/__tests__/learnerRubricEvaluator.test.js +361 -0
- package/services/__tests__/learnerTutorInteractionEngine.test.js +326 -0
- package/services/evaluationRunner.js +975 -98
- package/services/evaluationStore.js +12 -4
- package/services/learnerTutorInteractionEngine.js +27 -2
- package/services/mockProvider.js +133 -0
- package/services/promptRewriter.js +1471 -5
- package/services/rubricEvaluator.js +55 -2
- package/services/transcriptFormatter.js +675 -0
- package/docs/EVALUATION-VARIABLES.md +0 -589
- package/docs/REPLICATION-PLAN.md +0 -577
- package/scripts/analyze-run.mjs +0 -282
- package/scripts/compare-runs.js +0 -44
- package/scripts/compare-suggestions.js +0 -80
- package/scripts/dig-into-run.js +0 -158
- package/scripts/show-failed-suggestions.js +0 -64
- /package/scripts/{check-run.mjs → check-run.js} +0 -0
|
@@ -21,10 +21,13 @@ import { generateLearnerResponse } from './learnerTutorInteractionEngine.js';
|
|
|
21
21
|
import * as turnComparisonAnalyzer from './turnComparisonAnalyzer.js';
|
|
22
22
|
import * as dialogueTraceAnalyzer from './dialogueTraceAnalyzer.js';
|
|
23
23
|
import * as promptRewriter from './promptRewriter.js';
|
|
24
|
+
import { mockGenerateResult, mockJudgeResult } from './mockProvider.js';
|
|
25
|
+
import { formatEntry, formatTranscript, formatCompactLine } from './transcriptFormatter.js';
|
|
24
26
|
|
|
25
27
|
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
|
26
28
|
const EVAL_ROOT = path.resolve(__dirname, '..');
|
|
27
29
|
const LOGS_DIR = path.join(EVAL_ROOT, 'logs', 'tutor-dialogues');
|
|
30
|
+
const TRANSCRIPTS_DIR = path.join(EVAL_ROOT, 'logs', 'transcripts');
|
|
28
31
|
|
|
29
32
|
// Redirect tutor-core logs to this repo's logs/ directory (if available)
|
|
30
33
|
import('@machinespirits/tutor-core').then(mod => {
|
|
@@ -68,6 +71,33 @@ const EVAL_ONLY_PROFILES = [
|
|
|
68
71
|
'cell_17_placebo_multi_unified', 'cell_18_placebo_multi_psycho',
|
|
69
72
|
'cell_19_memory_single_unified', 'cell_20_recog_nomem_single_unified',
|
|
70
73
|
'cell_21_recog_multi_unified_rewrite',
|
|
74
|
+
'cell_22_base_suspicious_unified', 'cell_23_recog_suspicious_unified',
|
|
75
|
+
'cell_24_base_adversary_unified', 'cell_25_recog_adversary_unified',
|
|
76
|
+
'cell_26_base_advocate_unified', 'cell_27_recog_advocate_unified',
|
|
77
|
+
'cell_28_base_dialectical_suspicious_unified', 'cell_29_recog_dialectical_suspicious_unified',
|
|
78
|
+
'cell_30_base_dialectical_adversary_unified', 'cell_31_recog_dialectical_adversary_unified',
|
|
79
|
+
'cell_32_base_dialectical_advocate_unified', 'cell_33_recog_dialectical_advocate_unified',
|
|
80
|
+
'cell_34_base_dialectical_suspicious_unified_full', 'cell_35_recog_dialectical_suspicious_unified_full',
|
|
81
|
+
'cell_36_base_dialectical_adversary_unified_full', 'cell_37_recog_dialectical_adversary_unified_full',
|
|
82
|
+
'cell_38_base_dialectical_advocate_unified_full', 'cell_39_recog_dialectical_advocate_unified_full',
|
|
83
|
+
'cell_40_base_dialectical_suspicious_unified_superego', 'cell_41_recog_dialectical_suspicious_unified_superego',
|
|
84
|
+
'cell_42_base_dialectical_adversary_unified_superego', 'cell_43_recog_dialectical_adversary_unified_superego',
|
|
85
|
+
'cell_44_base_dialectical_advocate_unified_superego', 'cell_45_recog_dialectical_advocate_unified_superego',
|
|
86
|
+
'cell_46_base_dialectical_suspicious_unified_quantitative', 'cell_47_recog_dialectical_suspicious_unified_quantitative',
|
|
87
|
+
'cell_48_base_dialectical_suspicious_unified_erosion', 'cell_49_recog_dialectical_suspicious_unified_erosion',
|
|
88
|
+
'cell_50_base_dialectical_suspicious_unified_intersubjective', 'cell_51_recog_dialectical_suspicious_unified_intersubjective',
|
|
89
|
+
'cell_52_base_dialectical_suspicious_unified_combined', 'cell_53_recog_dialectical_suspicious_unified_combined',
|
|
90
|
+
'cell_54_base_dialectical_profile_tutor', 'cell_55_recog_dialectical_profile_tutor',
|
|
91
|
+
'cell_56_base_dialectical_profile_bidirectional', 'cell_57_recog_dialectical_profile_bidirectional',
|
|
92
|
+
'cell_58_recog_dialectical_profile_bidirectional_full', 'cell_59_recog_dialectical_profile_bidirectional_strategy',
|
|
93
|
+
'cell_60_base_dialectical_selfreflect_psycho', 'cell_61_recog_dialectical_selfreflect_psycho',
|
|
94
|
+
'cell_62_base_dialectical_profile_bidirectional_psycho', 'cell_63_recog_dialectical_profile_bidirectional_psycho',
|
|
95
|
+
'cell_64_recog_dialectical_intersubjective_psycho', 'cell_65_recog_dialectical_combined_psycho',
|
|
96
|
+
'cell_66_recog_dialectical_profile_prosthesis_descriptive',
|
|
97
|
+
'cell_67_recog_dialectical_profile_prosthesis_prescriptive',
|
|
98
|
+
'cell_68_recog_dialectical_profile_prosthesis_adversary',
|
|
99
|
+
'cell_69_base_dialectical_intersubjective_psycho', 'cell_70_base_dialectical_combined_psycho',
|
|
100
|
+
'cell_71_naive_single_unified',
|
|
71
101
|
];
|
|
72
102
|
|
|
73
103
|
/**
|
|
@@ -95,10 +125,24 @@ export function resolveEvalProfile(profileName) {
|
|
|
95
125
|
resolvedProfileName = 'placebo';
|
|
96
126
|
} else if (promptType === 'hardwired') {
|
|
97
127
|
resolvedProfileName = 'hardwired';
|
|
128
|
+
} else if (promptType === 'naive') {
|
|
129
|
+
resolvedProfileName = 'naive';
|
|
98
130
|
} else if (promptType === 'memory') {
|
|
99
131
|
resolvedProfileName = 'memory';
|
|
100
132
|
} else if (promptType === 'recognition_nomem') {
|
|
101
133
|
resolvedProfileName = 'recognition_nomem';
|
|
134
|
+
} else if (promptType === 'divergent_suspicious') {
|
|
135
|
+
resolvedProfileName = recognitionMode ? 'suspicious_recognition' : 'suspicious';
|
|
136
|
+
} else if (promptType === 'divergent_adversary') {
|
|
137
|
+
resolvedProfileName = recognitionMode ? 'adversary_recognition' : 'adversary';
|
|
138
|
+
} else if (promptType === 'divergent_advocate') {
|
|
139
|
+
resolvedProfileName = recognitionMode ? 'advocate_recognition' : 'advocate';
|
|
140
|
+
} else if (promptType === 'dialectical_suspicious') {
|
|
141
|
+
resolvedProfileName = recognitionMode ? 'dialectical_suspicious_recognition' : 'dialectical_suspicious';
|
|
142
|
+
} else if (promptType === 'dialectical_adversary') {
|
|
143
|
+
resolvedProfileName = recognitionMode ? 'dialectical_adversary_recognition' : 'dialectical_adversary';
|
|
144
|
+
} else if (promptType === 'dialectical_advocate') {
|
|
145
|
+
resolvedProfileName = recognitionMode ? 'dialectical_advocate_recognition' : 'dialectical_advocate';
|
|
102
146
|
} else if (recognitionMode) {
|
|
103
147
|
resolvedProfileName = 'recognition';
|
|
104
148
|
} else {
|
|
@@ -158,13 +202,23 @@ function resolveConfigModels(config) {
|
|
|
158
202
|
// Extract factorial factor tags and learner architecture from profile
|
|
159
203
|
const rawProfile = evalConfigLoader.loadTutorAgents()?.profiles?.[resolved.profileName];
|
|
160
204
|
if (rawProfile?.factors) {
|
|
161
|
-
resolved.factors = rawProfile.factors;
|
|
205
|
+
resolved.factors = { ...rawProfile.factors };
|
|
206
|
+
// Normalize prompt_type → recognition boolean for DB storage
|
|
207
|
+
if (resolved.factors.prompt_type && resolved.factors.recognition == null) {
|
|
208
|
+
resolved.factors.recognition = resolved.factors.prompt_type === 'recognition';
|
|
209
|
+
}
|
|
162
210
|
}
|
|
163
211
|
if (rawProfile?.learner_architecture) {
|
|
164
212
|
resolved.learnerArchitecture = rawProfile.learner_architecture;
|
|
165
213
|
}
|
|
166
214
|
}
|
|
167
215
|
|
|
216
|
+
// Apply CLI --max-tokens override (overrides ego max_tokens hyperparameter)
|
|
217
|
+
if (config.maxTokensOverride) {
|
|
218
|
+
if (!resolved.hyperparameters) resolved.hyperparameters = {};
|
|
219
|
+
resolved.hyperparameters = { ...resolved.hyperparameters, max_tokens: config.maxTokensOverride };
|
|
220
|
+
}
|
|
221
|
+
|
|
168
222
|
// Apply CLI --model override (replaces ego and superego models, preserves factorial metadata)
|
|
169
223
|
if (config.modelOverride) {
|
|
170
224
|
try {
|
|
@@ -433,6 +487,8 @@ function buildMultiTurnContext(options) {
|
|
|
433
487
|
conversationHistory = [],
|
|
434
488
|
currentTurn,
|
|
435
489
|
previousSuggestion,
|
|
490
|
+
priorSuperegoAssessments = [],
|
|
491
|
+
learnerTrajectory = null,
|
|
436
492
|
} = options;
|
|
437
493
|
|
|
438
494
|
const contextParts = [];
|
|
@@ -449,6 +505,26 @@ function buildMultiTurnContext(options) {
|
|
|
449
505
|
}
|
|
450
506
|
}
|
|
451
507
|
|
|
508
|
+
// Cross-turn superego memory: accumulated feedback from prior turns' internal
|
|
509
|
+
// deliberation. Visible to both ego (full context) and superego (via
|
|
510
|
+
// extractStructuredSummary fallback). Enables the superego to detect whether
|
|
511
|
+
// its prior feedback was incorporated and escalate if needed.
|
|
512
|
+
if (priorSuperegoAssessments.length > 0) {
|
|
513
|
+
contextParts.push('\n### Prior Superego Assessment');
|
|
514
|
+
for (const assessment of priorSuperegoAssessments) {
|
|
515
|
+
contextParts.push(formatSuperegoAssessment(assessment));
|
|
516
|
+
}
|
|
517
|
+
}
|
|
518
|
+
|
|
519
|
+
// Structured learner trajectory: pre-processed resistance/engagement signals
|
|
520
|
+
// derived from conversation history and score trajectory. Enables the superego
|
|
521
|
+
// to distinguish "learner asked a new question" from "learner is repeating the
|
|
522
|
+
// same confusion because our approach isn't working."
|
|
523
|
+
if (learnerTrajectory) {
|
|
524
|
+
contextParts.push('\n### Learner Trajectory Assessment');
|
|
525
|
+
contextParts.push(formatLearnerTrajectory(learnerTrajectory));
|
|
526
|
+
}
|
|
527
|
+
|
|
452
528
|
// Note: "Previous Tutor Suggestion" block removed — it duplicated the last
|
|
453
529
|
// entry already present in conversation history above.
|
|
454
530
|
|
|
@@ -464,6 +540,210 @@ function buildMultiTurnContext(options) {
|
|
|
464
540
|
return contextParts.join('\n');
|
|
465
541
|
}
|
|
466
542
|
|
|
543
|
+
/**
 * Decode the raw contents of a JSON string literal (the text between the
 * quotes) into its plain-text value. Falls back to the raw text when the
 * fragment is not valid JSON (e.g. it contains a literal newline).
 */
function decodeJsonStringFragment(raw) {
  try {
    return JSON.parse(`"${raw}"`);
  } catch {
    return raw;
  }
}

/**
 * Extract superego feedback from a single turn's dialogue trace entries.
 * Returns a structured assessment object for cross-turn memory.
 *
 * @param {number} turnIndex - Zero-based index of the turn being summarised.
 * @param {Array<object>} traceEntries - Trace entries for the turn; only those
 *   with `agent === 'superego'` are considered.
 * @returns {object|null} `{ turnIndex, rejections, approvals, interventionTypes,
 *   finalApproved, confidence, feedback }`, or null when the turn has no
 *   superego entries (or `traceEntries` is not an array).
 */
function extractTurnSuperegoAssessment(turnIndex, traceEntries) {
  const superegoEntries = Array.isArray(traceEntries)
    ? traceEntries.filter(e => e?.agent === 'superego')
    : [];
  if (superegoEntries.length === 0) return null;

  const lastEntry = superegoEntries[superegoEntries.length - 1];
  const totalRejections = superegoEntries.filter(e => e.approved === false).length;
  const totalApprovals = superegoEntries.filter(e => e.approved === true).length;
  const interventionTypes = superegoEntries
    .map(e => e.interventionType)
    .filter(Boolean);

  // Prefer the explicit feedback field of the last entry. Non-string values
  // are ignored so the truncation below cannot throw.
  let feedbackText = typeof lastEntry.feedback === 'string' ? lastEntry.feedback : '';
  if (!feedbackText && typeof lastEntry.detail === 'string') {
    // Fall back to a "feedback" property embedded in the serialized detail.
    // The pattern accepts escaped characters (e.g. \") so the capture is not
    // cut short at the first quote inside the feedback text.
    const match = lastEntry.detail.match(/"feedback"\s*:\s*"((?:[^"\\]|\\[\s\S])*)"/);
    if (match) feedbackText = decodeJsonStringFragment(match[1]);
  }

  return {
    turnIndex,
    rejections: totalRejections,
    approvals: totalApprovals,
    interventionTypes,
    finalApproved: lastEntry.approved,
    confidence: lastEntry.confidence,
    // Cap the length so accumulated assessments stay small in the context.
    feedback: feedbackText.substring(0, 300),
  };
}
|
|
575
|
+
|
|
576
|
+
/**
 * Render one superego assessment as a markdown snippet for context injection.
 *
 * Produces a heading for the (1-based) turn, an outcome line, and — when
 * present — the distinct intervention types and the key feedback text.
 *
 * @param {object} assessment - Assessment with `turnIndex`, `finalApproved`,
 *   `rejections`, `interventionTypes` and `feedback` properties.
 * @returns {string} Newline-joined markdown lines, starting with a blank line.
 */
function formatSuperegoAssessment(assessment) {
  const { turnIndex, finalApproved, rejections, interventionTypes, feedback } = assessment;
  const outcome = finalApproved ? 'approved' : 'rejected';

  const parts = [
    `\n**Turn ${turnIndex + 1} internal critique:**`,
    `- Outcome: ${outcome} after ${rejections} rejection(s)`,
  ];

  if (interventionTypes.length > 0) {
    // Report each intervention type once, in first-seen order.
    const distinctTypes = Array.from(new Set(interventionTypes));
    parts.push(`- Interventions: ${distinctTypes.join(', ')}`);
  }

  if (feedback) {
    parts.push(`- Key concern: "${feedback}"`);
  }

  return parts.join('\n');
}
|
|
591
|
+
|
|
592
|
+
/**
 * Analyze learner trajectory across turns to produce structured resistance signals.
 *
 * Signals are heuristic and derived only from the inputs:
 *  - score trajectory (from `turnResults[].turnScore`),
 *  - learner message lengths (a proxy for engagement),
 *  - confusion / pushback phrasing in learner messages,
 *  - word overlap between consecutive learner questions.
 *
 * @param {Array<object>} turnResults - Per-turn results; `turnScore` is read when present.
 * @param {Array<object>} conversationHistory - History entries; `learnerMessage`
 *   strings are read when present.
 * @returns {object|null} Trajectory summary, or null if fewer than two turn
 *   results are available. `questionCount` records how many learner messages
 *   contained a question; `questionDiversity` is only meaningful when
 *   `questionCount >= 2` (it stays at its default of 0 otherwise).
 */
function analyzeLearnerTrajectory(turnResults, conversationHistory) {
  if (!Array.isArray(turnResults) || turnResults.length < 2) return null;

  const history = Array.isArray(conversationHistory) ? conversationHistory : [];
  // Only non-empty string messages are considered; anything else would break
  // the length / lowercase handling below.
  const rawLearnerMessages = history
    .map(h => h?.learnerMessage)
    .filter(m => typeof m === 'string' && m.length > 0);
  const messageLengths = rawLearnerMessages.map(m => m.length);
  const learnerMessages = rawLearnerMessages.map(m => m.toLowerCase());

  const trajectory = {
    turnCount: turnResults.length,
    engagementDirection: 'stable',
    resistanceType: null,
    resistanceStrength: 0, // 0-3 scale
    priorApproachEffective: null,
    scoreTrajectory: turnResults
      .filter(t => t?.turnScore != null)
      .map(t => t.turnScore),
    messageLengthTrajectory: messageLengths,
    repeatedConfusion: false,
    questionDiversity: 0,
    questionCount: 0,
  };

  // Engagement direction: compare the average length of the last two
  // messages against the first two.
  if (messageLengths.length >= 3) {
    const earlyAvg = messageLengths.slice(0, 2).reduce((a, b) => a + b, 0) / 2;
    const lateAvg = messageLengths.slice(-2).reduce((a, b) => a + b, 0) / 2;
    if (lateAvg < earlyAvg * 0.6) trajectory.engagementDirection = 'declining';
    else if (lateAvg > earlyAvg * 1.4) trajectory.engagementDirection = 'increasing';
  }

  // Score direction: the prior approach counts as effective when the latest
  // score did not drop relative to the one before it.
  if (trajectory.scoreTrajectory.length >= 2) {
    const last = trajectory.scoreTrajectory[trajectory.scoreTrajectory.length - 1];
    const prev = trajectory.scoreTrajectory[trajectory.scoreTrajectory.length - 2];
    trajectory.priorApproachEffective = last >= prev;
  }

  // Repeated confusion: confusion markers present in both of the last two
  // learner messages.
  if (learnerMessages.length >= 2) {
    const confusionPatterns = [
      /i('m| am) (still )?(confused|lost|not sure|unsure)/i,
      /i don'?t (understand|get|see)/i,
      /what do you mean/i,
      /can you explain/i,
      /i('m| am) not following/i,
    ];

    const confusionCounts = learnerMessages.map(msg =>
      confusionPatterns.filter(p => p.test(msg)).length
    );
    const lastTwoConfusion = confusionCounts.slice(-2);
    if (lastTwoConfusion.length >= 2 && lastTwoConfusion.every(c => c > 0)) {
      trajectory.repeatedConfusion = true;
      trajectory.resistanceType = 'repeated_confusion';
      trajectory.resistanceStrength = 2;
    }
  }

  // Pushback detection on the most recent learner message. An earlier
  // resistance type (repeated confusion) takes precedence.
  const lastMessage = learnerMessages[learnerMessages.length - 1] || '';
  const pushbackPatterns = [
    /\bbut\s+(what about|doesn'?t|isn'?t|that doesn'?t)\b/i,
    /\bi disagree\b/i,
    /\bi don'?t think\b/i,
    /\bthat'?s not (right|correct|what i)\b/i,
    /\byou('re| are) (wrong|missing|not)\b/i,
  ];
  if (pushbackPatterns.some(p => p.test(lastMessage))) {
    trajectory.resistanceType = trajectory.resistanceType || 'pushback';
    trajectory.resistanceStrength = Math.max(trajectory.resistanceStrength, 2);
  }

  // Disengagement detection: very short last message with no question.
  if (messageLengths.length >= 2) {
    const lastLen = messageLengths[messageLengths.length - 1];
    if (lastLen < 30 && !lastMessage.includes('?')) {
      trajectory.resistanceType = trajectory.resistanceType || 'disengagement';
      trajectory.resistanceStrength = Math.max(trajectory.resistanceStrength, 1);
      trajectory.engagementDirection = 'declining';
    }
  }

  // Question diversity: 1 minus the mean word overlap between consecutive
  // questions (words longer than 3 characters). Needs at least two questions;
  // questionCount lets consumers tell "not computed" apart from "no diversity".
  const questions = learnerMessages.filter(m => m.includes('?'));
  trajectory.questionCount = questions.length;
  if (questions.length >= 2) {
    const uniqueQuestionWords = questions.map(q => new Set(q.split(/\s+/).filter(w => w.length > 3)));
    let totalOverlap = 0;
    for (let i = 1; i < uniqueQuestionWords.length; i++) {
      const prev = uniqueQuestionWords[i - 1];
      const curr = uniqueQuestionWords[i];
      const overlap = [...curr].filter(w => prev.has(w)).length / Math.max(curr.size, 1);
      totalOverlap += overlap;
    }
    trajectory.questionDiversity = 1 - (totalOverlap / Math.max(uniqueQuestionWords.length - 1, 1));
  }

  // Cumulative resistance: declining engagement combined with a score drop
  // is treated as the strongest signal.
  if (trajectory.engagementDirection === 'declining' && trajectory.priorApproachEffective === false) {
    trajectory.resistanceStrength = 3;
    trajectory.resistanceType = trajectory.resistanceType || 'cumulative_decline';
  }

  return trajectory;
}

/**
 * Format learner trajectory assessment for context injection.
 *
 * @param {object} trajectory - Object produced by analyzeLearnerTrajectory.
 *   Objects without a `questionCount` field are accepted; for those the
 *   low-diversity warning depends on `questionDiversity` alone.
 * @returns {string} Newline-joined markdown bullet lines.
 */
function formatLearnerTrajectory(trajectory) {
  const lines = [];

  // Engagement direction
  const engagementLabel = trajectory.engagementDirection === 'declining' ? 'DECLINING' :
    trajectory.engagementDirection === 'increasing' ? 'INCREASING' : 'STABLE';
  lines.push(`- Engagement: ${engagementLabel} (over ${trajectory.turnCount} turns)`);

  // Score trajectory
  if (trajectory.scoreTrajectory.length >= 2) {
    const scores = trajectory.scoreTrajectory.map(s => s.toFixed(0)).join(' → ');
    lines.push(`- Score trajectory: ${scores}`);
    lines.push(`- Prior approach effective: ${trajectory.priorApproachEffective ? 'YES' : 'NO'}`);
  }

  // Resistance
  if (trajectory.resistanceType) {
    const strengthLabel = ['none', 'mild', 'moderate', 'strong'][trajectory.resistanceStrength] || 'unknown';
    lines.push(`- Resistance detected: ${trajectory.resistanceType} (${strengthLabel})`);
  }

  // Specific signals
  if (trajectory.repeatedConfusion) {
    lines.push(`- WARNING: Learner expressed confusion in consecutive turns — prior explanation did not land`);
  }

  // Diversity is only computed from two or more questions; without this guard
  // the default value of 0 would be reported as "low diversity" for learners
  // who asked no questions at all.
  const diversityComputed = trajectory.questionCount == null || trajectory.questionCount >= 2;
  if (diversityComputed && trajectory.questionDiversity < 0.3 && trajectory.turnCount >= 3) {
    lines.push(`- WARNING: Learner questions show low diversity — they may be stuck on the same concept`);
  }

  return lines.join('\n');
}
|
|
746
|
+
|
|
467
747
|
/**
|
|
468
748
|
* Format a previous turn for inclusion in context
|
|
469
749
|
*/
|
|
@@ -649,26 +929,69 @@ async function generateAndEvaluateTurn(context, resolvedConfig, turnMeta, option
|
|
|
649
929
|
log = () => {},
|
|
650
930
|
scenarioId = '',
|
|
651
931
|
systemPromptExtension = null,
|
|
932
|
+
superegoPromptExtension = null, // Dynamic disposition adjustments for superego
|
|
652
933
|
learnerId = null, // For Writing Pad memory persistence
|
|
934
|
+
dialecticalNegotiation = false, // Phase 2: AI-powered dialectical struggle
|
|
935
|
+
behavioralOverrides = null, // Quantitative params from superego self-reflection
|
|
936
|
+
dryRun = false,
|
|
653
937
|
} = options;
|
|
654
938
|
|
|
939
|
+
// Dry-run mode: return canned results without any API calls
|
|
940
|
+
if (dryRun) {
|
|
941
|
+
log('[dry-run] Generating mock suggestions (no API call)', 'info');
|
|
942
|
+
const genResult = mockGenerateResult(resolvedConfig, turnMeta);
|
|
943
|
+
const suggestion = genResult.suggestions?.[0];
|
|
944
|
+
const validation = suggestion
|
|
945
|
+
? rubricEvaluator.quickValidate(suggestion, {
|
|
946
|
+
requiredElements: turnMeta.requiredElements,
|
|
947
|
+
requiredElementsAny: turnMeta.requiredElementsAny,
|
|
948
|
+
forbiddenElements: turnMeta.forbiddenElements,
|
|
949
|
+
})
|
|
950
|
+
: { passesRequired: false, passesForbidden: true, requiredMissing: ['No suggestions generated'] };
|
|
951
|
+
|
|
952
|
+
let rubricResult = null;
|
|
953
|
+
let turnScore = null;
|
|
954
|
+
let scoringMethod = 'skipped';
|
|
955
|
+
if (!skipRubricEval && suggestion) {
|
|
956
|
+
log('[dry-run] Generating mock judge scores (no API call)', 'info');
|
|
957
|
+
rubricResult = mockJudgeResult(resolvedConfig, scenarioId + Date.now());
|
|
958
|
+
turnScore = rubricResult.overallScore;
|
|
959
|
+
scoringMethod = 'rubric';
|
|
960
|
+
}
|
|
961
|
+
|
|
962
|
+
return { genResult, suggestion, validation, rubricResult, turnScore, scoringMethod };
|
|
963
|
+
}
|
|
964
|
+
|
|
655
965
|
// Generate suggestions via tutor API with retry logic
|
|
966
|
+
// Note: retryWithBackoff handles thrown errors, but tutorApi.generateSuggestions()
|
|
967
|
+
// catches its own errors and returns { success: false }. We need to also handle
|
|
968
|
+
// 429 rate limit errors returned in the result (not thrown).
|
|
656
969
|
const genResult = await retryWithBackoff(
|
|
657
|
-
() =>
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
|
|
666
|
-
|
|
667
|
-
|
|
668
|
-
|
|
669
|
-
|
|
670
|
-
|
|
671
|
-
|
|
970
|
+
async () => {
|
|
971
|
+
const result = await tutorApi.generateSuggestions(context, {
|
|
972
|
+
provider: resolvedConfig.provider,
|
|
973
|
+
model: resolvedConfig.model,
|
|
974
|
+
egoModel: resolvedConfig.egoModel,
|
|
975
|
+
superegoModel: resolvedConfig.superegoModel || null,
|
|
976
|
+
profileName: resolvedConfig.profileName,
|
|
977
|
+
hyperparameters: resolvedConfig.hyperparameters || {},
|
|
978
|
+
trace: true,
|
|
979
|
+
superegoStrategy,
|
|
980
|
+
outputSize,
|
|
981
|
+
useDialogue,
|
|
982
|
+
maxRounds,
|
|
983
|
+
systemPromptExtension,
|
|
984
|
+
superegoPromptExtension, // Dynamic disposition adjustments for superego
|
|
985
|
+
learnerId, // Activates Writing Pad three-layer memory
|
|
986
|
+
dialecticalNegotiation, // Phase 2: AI-powered dialectical struggle
|
|
987
|
+
behavioralOverrides, // Quantitative params from superego self-reflection
|
|
988
|
+
});
|
|
989
|
+
// Re-throw 429 errors so retryWithBackoff can handle them
|
|
990
|
+
if (!result.success && result.error && (result.error.includes('429') || result.error.toLowerCase().includes('rate limit'))) {
|
|
991
|
+
throw new Error(result.error);
|
|
992
|
+
}
|
|
993
|
+
return result;
|
|
994
|
+
},
|
|
672
995
|
{ log }
|
|
673
996
|
);
|
|
674
997
|
|
|
@@ -776,9 +1099,13 @@ export async function runEvaluation(options = {}) {
|
|
|
776
1099
|
description = null,
|
|
777
1100
|
verbose = false,
|
|
778
1101
|
scenarioFilter = null, // Cluster filter: 'single-turn', 'multi-turn', or category names
|
|
779
|
-
modelOverride = null, // CLI --model override (e.g. "openrouter.nemotron")
|
|
780
|
-
egoModelOverride = null, // CLI --ego-model override (replaces only ego model)
|
|
781
|
-
superegoModelOverride = null, // CLI --superego-model override (replaces only superego model)
|
|
1102
|
+
modelOverride = null, // CLI --model override (e.g. "openrouter.nemotron") — ALL agents
|
|
1103
|
+
egoModelOverride = null, // CLI --ego-model override (replaces only tutor ego model)
|
|
1104
|
+
superegoModelOverride = null, // CLI --superego-model override (replaces only tutor superego model)
|
|
1105
|
+
learnerModelOverride = null, // CLI --learner-model override (replaces all learner agent models)
|
|
1106
|
+
dryRun = false, // Use mock data instead of API calls
|
|
1107
|
+
transcriptMode = false, // Write play-format transcript files during multi-turn runs
|
|
1108
|
+
maxTokensOverride = null, // CLI --max-tokens override (replaces ego max_tokens hyperparameter)
|
|
782
1109
|
} = options;
|
|
783
1110
|
|
|
784
1111
|
const log = verbose ? console.log : () => {};
|
|
@@ -856,6 +1183,7 @@ export async function runEvaluation(options = {}) {
|
|
|
856
1183
|
const effectiveModelOverride = modelOverride || yamlOverrides.modelOverride;
|
|
857
1184
|
const effectiveEgoModelOverride = egoModelOverride || yamlOverrides.egoModelOverride;
|
|
858
1185
|
const effectiveSuperegoModelOverride = superegoModelOverride || yamlOverrides.superegoModelOverride;
|
|
1186
|
+
const effectiveLearnerModelOverride = learnerModelOverride || null;
|
|
859
1187
|
|
|
860
1188
|
if (effectiveModelOverride) {
|
|
861
1189
|
targetConfigs = targetConfigs.map(c => ({ ...c, modelOverride: effectiveModelOverride }));
|
|
@@ -866,6 +1194,12 @@ export async function runEvaluation(options = {}) {
|
|
|
866
1194
|
if (effectiveSuperegoModelOverride) {
|
|
867
1195
|
targetConfigs = targetConfigs.map(c => ({ ...c, superegoModelOverride: effectiveSuperegoModelOverride }));
|
|
868
1196
|
}
|
|
1197
|
+
if (effectiveLearnerModelOverride) {
|
|
1198
|
+
targetConfigs = targetConfigs.map(c => ({ ...c, learnerModelOverride: effectiveLearnerModelOverride }));
|
|
1199
|
+
}
|
|
1200
|
+
if (maxTokensOverride) {
|
|
1201
|
+
targetConfigs = targetConfigs.map(c => ({ ...c, maxTokensOverride }));
|
|
1202
|
+
}
|
|
869
1203
|
|
|
870
1204
|
if (targetConfigs.length === 0) {
|
|
871
1205
|
throw new Error('No configurations to test');
|
|
@@ -888,6 +1222,8 @@ export async function runEvaluation(options = {}) {
|
|
|
888
1222
|
modelOverride: effectiveModelOverride || null,
|
|
889
1223
|
egoModelOverride: effectiveEgoModelOverride || null,
|
|
890
1224
|
superegoModelOverride: effectiveSuperegoModelOverride || null,
|
|
1225
|
+
learnerModelOverride: effectiveLearnerModelOverride || null,
|
|
1226
|
+
maxTokensOverride: maxTokensOverride || null,
|
|
891
1227
|
// Store scenario IDs and profile names for accurate resume
|
|
892
1228
|
scenarioIds: targetScenarios.map(s => s.id),
|
|
893
1229
|
profileNames: targetConfigs.map(c => c.profileName).filter(Boolean),
|
|
@@ -1002,6 +1338,9 @@ export async function runEvaluation(options = {}) {
|
|
|
1002
1338
|
const result = await runSingleTest(scenario, config, {
|
|
1003
1339
|
skipRubricEval,
|
|
1004
1340
|
verbose,
|
|
1341
|
+
dryRun,
|
|
1342
|
+
transcriptMode,
|
|
1343
|
+
runId: run.id,
|
|
1005
1344
|
});
|
|
1006
1345
|
|
|
1007
1346
|
// Store result (better-sqlite3 is synchronous, thread-safe for concurrent writes)
|
|
@@ -1071,30 +1410,38 @@ export async function runEvaluation(options = {}) {
|
|
|
1071
1410
|
completedTests++;
|
|
1072
1411
|
log(` ${formatProgress(completedTests, totalTests, runStartTime)} ${profileLabel} / ${scenario.id}: ERROR - ${error.message}`);
|
|
1073
1412
|
|
|
1074
|
-
//
|
|
1075
|
-
//
|
|
1076
|
-
|
|
1077
|
-
|
|
1078
|
-
|
|
1079
|
-
|
|
1080
|
-
|
|
1081
|
-
|
|
1082
|
-
|
|
1083
|
-
|
|
1084
|
-
: config.
|
|
1085
|
-
|
|
1086
|
-
|
|
1087
|
-
: config.
|
|
1088
|
-
|
|
1089
|
-
|
|
1090
|
-
|
|
1091
|
-
|
|
1092
|
-
|
|
1093
|
-
|
|
1094
|
-
|
|
1095
|
-
|
|
1096
|
-
|
|
1097
|
-
|
|
1413
|
+
// Only store failed results for permanent errors (bad config, invalid scenario).
|
|
1414
|
+
// Skip storing for retriable/transient errors (rate limits, model unavailable, timeouts)
|
|
1415
|
+
// so that `resume` can retry them without needing manual cleanup.
|
|
1416
|
+
const errMsg = error.message || '';
|
|
1417
|
+
const isTransient = /429|rate limit|too many requests|503|502|timeout|ECONNREFUSED|ECONNRESET|ETIMEDOUT|terminated|unavailable|failed to generate suggestions/i.test(errMsg);
|
|
1418
|
+
|
|
1419
|
+
if (!isTransient) {
|
|
1420
|
+
const failedResult = {
|
|
1421
|
+
scenarioId: scenario.id,
|
|
1422
|
+
scenarioName: scenario.name || scenario.id,
|
|
1423
|
+
profileName: config.profileName,
|
|
1424
|
+
provider: config.provider || config.ego?.provider || 'unknown',
|
|
1425
|
+
model: config.model || config.ego?.model || 'unknown',
|
|
1426
|
+
egoModel: config.egoModel
|
|
1427
|
+
? `${config.egoModel.provider}.${config.egoModel.model}`
|
|
1428
|
+
: config.ego ? `${config.ego.provider}.${config.ego.model}` : null,
|
|
1429
|
+
superegoModel: config.superegoModel
|
|
1430
|
+
? `${config.superegoModel.provider}.${config.superegoModel.model}`
|
|
1431
|
+
: config.superego ? `${config.superego.provider}.${config.superego.model}` : null,
|
|
1432
|
+
factors: config.factors || null,
|
|
1433
|
+
learnerArchitecture: config.learnerArchitecture || null,
|
|
1434
|
+
success: false,
|
|
1435
|
+
errorMessage: error.message,
|
|
1436
|
+
};
|
|
1437
|
+
try {
|
|
1438
|
+
evaluationStore.storeResult(run.id, failedResult);
|
|
1439
|
+
results.push(failedResult);
|
|
1440
|
+
} catch (storeErr) {
|
|
1441
|
+
log(` [WARNING] Failed to store error result: ${storeErr.message}`);
|
|
1442
|
+
}
|
|
1443
|
+
} else {
|
|
1444
|
+
log(` [SKIPPED] Transient error, not storing empty row (resumable): ${errMsg.substring(0, 100)}`);
|
|
1098
1445
|
}
|
|
1099
1446
|
|
|
1100
1447
|
// Emit test_error event
|
|
@@ -1183,7 +1530,7 @@ export async function runEvaluation(options = {}) {
|
|
|
1183
1530
|
* Handles both single-turn and multi-turn scenarios
|
|
1184
1531
|
*/
|
|
1185
1532
|
async function runSingleTest(scenario, config, options = {}) {
|
|
1186
|
-
const { skipRubricEval = false, outputSize = 'normal', verbose = false, onLog, superegoStrategy = null, judgeOverride = null } = options;
|
|
1533
|
+
const { skipRubricEval = false, outputSize = 'normal', verbose = false, onLog, superegoStrategy = null, judgeOverride = null, dryRun = false } = options;
|
|
1187
1534
|
|
|
1188
1535
|
// Create a log function that calls both console and onLog callback
|
|
1189
1536
|
const log = (message, level = 'info') => {
|
|
@@ -1214,7 +1561,7 @@ async function runSingleTest(scenario, config, options = {}) {
|
|
|
1214
1561
|
* Run a single-turn test
|
|
1215
1562
|
*/
|
|
1216
1563
|
async function runSingleTurnTest(scenario, config, fullScenario, options = {}) {
|
|
1217
|
-
const { skipRubricEval = false, outputSize = 'normal', verbose = false, log = () => {}, superegoStrategy = null, judgeOverride = null } = options;
|
|
1564
|
+
const { skipRubricEval = false, outputSize = 'normal', verbose = false, log = () => {}, superegoStrategy = null, judgeOverride = null, dryRun = false } = options;
|
|
1218
1565
|
|
|
1219
1566
|
// Resolve model aliases through eval's providers.yaml
|
|
1220
1567
|
const resolvedConfig = resolveConfigModels(config);
|
|
@@ -1260,7 +1607,7 @@ async function runSingleTurnTest(scenario, config, fullScenario, options = {}) {
|
|
|
1260
1607
|
requiredElementsAny: fullScenario.required_elements_any,
|
|
1261
1608
|
forbiddenElements: fullScenario.forbidden_elements,
|
|
1262
1609
|
},
|
|
1263
|
-
{ skipRubricEval, outputSize, superegoStrategy, judgeOverride, useDialogue, maxRounds, log, scenarioId: scenario.id }
|
|
1610
|
+
{ skipRubricEval, outputSize, superegoStrategy, judgeOverride, useDialogue, maxRounds, log, scenarioId: scenario.id, dryRun }
|
|
1264
1611
|
);
|
|
1265
1612
|
|
|
1266
1613
|
if (!genResult.success) {
|
|
@@ -1296,7 +1643,7 @@ async function runSingleTurnTest(scenario, config, fullScenario, options = {}) {
|
|
|
1296
1643
|
superegoModel: resolvedConfig.superegoModel
|
|
1297
1644
|
? `${resolvedConfig.superegoModel.provider}.${resolvedConfig.superegoModel.model}`
|
|
1298
1645
|
: null,
|
|
1299
|
-
hyperparameters: config.hyperparameters,
|
|
1646
|
+
hyperparameters: resolvedConfig.hyperparameters || config.hyperparameters,
|
|
1300
1647
|
suggestions: genResult.suggestions,
|
|
1301
1648
|
success: true,
|
|
1302
1649
|
latencyMs: genResult.metadata?.latencyMs,
|
|
@@ -1346,7 +1693,7 @@ async function runSingleTurnTest(scenario, config, fullScenario, options = {}) {
|
|
|
1346
1693
|
* This eliminates the separate multiTurnRunner orchestration.
|
|
1347
1694
|
*/
|
|
1348
1695
|
async function runMultiTurnTest(scenario, config, fullScenario, options = {}) {
|
|
1349
|
-
const { skipRubricEval = false, outputSize = 'normal', verbose = false, log = () => {}, superegoStrategy = null, judgeOverride = null } = options;
|
|
1696
|
+
const { skipRubricEval = false, outputSize = 'normal', verbose = false, log = () => {}, superegoStrategy = null, judgeOverride = null, dryRun = false, transcriptMode = false, runId = null } = options;
|
|
1350
1697
|
|
|
1351
1698
|
log(`[evaluationRunner] Running multi-turn scenario: ${scenario.id}`);
|
|
1352
1699
|
|
|
@@ -1371,6 +1718,41 @@ async function runMultiTurnTest(scenario, config, fullScenario, options = {}) {
|
|
|
1371
1718
|
const learnerId = `eval-learner-${dialogueId}-${scenario.id.replace(/[^a-zA-Z0-9]/g, '')}`;
|
|
1372
1719
|
log(`[evaluationRunner] Generated learnerId for Writing Pad: ${learnerId}`, 'info');
|
|
1373
1720
|
|
|
1721
|
+
// Set up transcript file for incremental writing (tail -f friendly)
|
|
1722
|
+
let transcriptPath = null;
|
|
1723
|
+
if (transcriptMode) {
|
|
1724
|
+
const effectiveRunId = runId || 'live';
|
|
1725
|
+
const transcriptDir = path.join(TRANSCRIPTS_DIR, effectiveRunId);
|
|
1726
|
+
if (!fs.existsSync(transcriptDir)) fs.mkdirSync(transcriptDir, { recursive: true });
|
|
1727
|
+
const safeName = `${config.profileName}--${scenario.id}`.replace(/[^a-zA-Z0-9_-]/g, '_');
|
|
1728
|
+
transcriptPath = path.join(transcriptDir, `${safeName}.txt`);
|
|
1729
|
+
// Write header
|
|
1730
|
+
const totalTurnCount = 1 + (fullScenario.turns || []).length;
|
|
1731
|
+
const header = `\n${(fullScenario.name || scenario.id).toUpperCase()} (${totalTurnCount}-turn)\n${config.profileName}\n${'─'.repeat(40)}\n\n`;
|
|
1732
|
+
fs.writeFileSync(transcriptPath, header);
|
|
1733
|
+
log(`[evaluationRunner] Transcript: ${transcriptPath}`, 'info');
|
|
1734
|
+
}
|
|
1735
|
+
|
|
1736
|
+
// Helper: append trace entries added since the last flush to the transcript file, and echo a compact line for each to the console
|
|
1737
|
+
let lastTranscriptIdx = 0;
|
|
1738
|
+
/**
 * Write the trace entries added since the previous flush to the transcript
 * file, printing a compact form of each entry to the console along the way.
 * Does nothing unless transcript mode is on and a transcript path is set.
 */
function flushTranscript() {
  if (!(transcriptMode && transcriptPath)) return;

  const pending = consolidatedTrace.slice(lastTranscriptIdx);
  if (pending.length === 0) return;

  // Remember how far we have flushed so the next call starts after these entries.
  lastTranscriptIdx = consolidatedTrace.length;

  const chunks = [];
  for (let i = 0; i < pending.length; i += 1) {
    const traceEntry = pending[i];

    const playText = formatEntry(traceEntry, { detail: 'play' });
    if (playText) {
      chunks.push(playText + '\n');
    }

    // Console gets the compact one-line form of the same entry.
    const summary = formatCompactLine(traceEntry);
    if (summary) {
      console.log(summary);
    }
  }

  if (chunks.length === 0) return;

  // Each chunk already ends in a newline and chunks are joined with another,
  // which leaves a blank line between consecutive entries in the file.
  fs.appendFileSync(transcriptPath, chunks.join('\n'));
}
|
|
1755
|
+
|
|
1374
1756
|
// Deep-clone turns to prevent mutation of shared scenario objects across profiles
|
|
1375
1757
|
const turns = JSON.parse(JSON.stringify(fullScenario.turns || []));
|
|
1376
1758
|
const turnResults = [];
|
|
@@ -1384,14 +1766,58 @@ async function runMultiTurnTest(scenario, config, fullScenario, options = {}) {
|
|
|
1384
1766
|
let conversationHistory = [];
|
|
1385
1767
|
let previousSuggestion = null;
|
|
1386
1768
|
const consolidatedTrace = [];
|
|
1769
|
+
const priorSuperegoAssessments = []; // Cross-turn superego memory
|
|
1387
1770
|
|
|
1388
|
-
|
|
1389
|
-
|
|
1390
|
-
// Check if prompt rewriting is enabled for this profile
|
|
1771
|
+
// Check profile-level feature flags
|
|
1391
1772
|
const rawProfile = evalConfigLoader.loadTutorAgents()?.profiles?.[config.profileName];
|
|
1773
|
+
|
|
1774
|
+
// Apply CLI model override to rawProfile so prompt rewriter calls use the correct model.
|
|
1775
|
+
// Without this, --model/--ego-model only affects tutor-core's generateSuggestions,
|
|
1776
|
+
// while promptRewriter functions (self-reflection, profiling, etc.) still use the YAML model.
|
|
1777
|
+
if (config.modelOverride || config.egoModelOverride) {
|
|
1778
|
+
const overrideModel = config.egoModelOverride || config.modelOverride;
|
|
1779
|
+
try {
|
|
1780
|
+
const r = evalConfigLoader.resolveModel(overrideModel);
|
|
1781
|
+
if (rawProfile?.ego) {
|
|
1782
|
+
rawProfile.ego = { ...rawProfile.ego, provider: r.provider, model: r.model };
|
|
1783
|
+
}
|
|
1784
|
+
// Also update top-level model for functions that read config.model
|
|
1785
|
+
if (rawProfile) rawProfile.model = r.model;
|
|
1786
|
+
} catch { /* leave rawProfile as-is if resolution fails */ }
|
|
1787
|
+
}
|
|
1788
|
+
if (config.modelOverride || config.superegoModelOverride) {
|
|
1789
|
+
const overrideModel = config.superegoModelOverride || config.modelOverride;
|
|
1790
|
+
try {
|
|
1791
|
+
const r = evalConfigLoader.resolveModel(overrideModel);
|
|
1792
|
+
if (rawProfile?.superego) {
|
|
1793
|
+
rawProfile.superego = { ...rawProfile.superego, provider: r.provider, model: r.model };
|
|
1794
|
+
}
|
|
1795
|
+
} catch { /* leave rawProfile as-is if resolution fails */ }
|
|
1796
|
+
}
|
|
1797
|
+
|
|
1798
|
+
const dialecticalNegotiation = rawProfile?.dialectical_negotiation ?? false;
|
|
1392
1799
|
const promptRewritingEnabled = rawProfile?.prompt_rewriting?.enabled ?? false;
|
|
1393
1800
|
const promptRewritingStrategy = rawProfile?.prompt_rewriting?.strategy ?? 'template';
|
|
1801
|
+
const superegoDispositionRewriting = rawProfile?.superego_disposition_rewriting ?? false;
|
|
1802
|
+
const quantitativeDispositionEnabled = rawProfile?.prompt_rewriting?.quantitative_disposition ?? false;
|
|
1803
|
+
const promptErosionEnabled = rawProfile?.prompt_rewriting?.prompt_erosion?.enabled ?? false;
|
|
1804
|
+
const intersubjectiveEnabled = rawProfile?.prompt_rewriting?.intersubjective ?? false;
|
|
1805
|
+
const otherEgoProfilingEnabled = rawProfile?.other_ego_profiling?.enabled ?? false;
|
|
1806
|
+
const otherEgoBidirectional = rawProfile?.other_ego_profiling?.bidirectional ?? false;
|
|
1807
|
+
const strategyPlanningEnabled = rawProfile?.other_ego_profiling?.strategy_planning ?? false;
|
|
1808
|
+
|
|
1809
|
+
const sharedTurnOptions = { skipRubricEval, outputSize, superegoStrategy, judgeOverride, useDialogue, maxRounds, log, scenarioId: scenario.id, learnerId, dialecticalNegotiation, dryRun };
|
|
1394
1810
|
let sessionEvolution = null;
|
|
1811
|
+
let superegoEvolution = null;
|
|
1812
|
+
let behavioralOverrides = null; // Parsed quantitative params from superego self-reflection
|
|
1813
|
+
let tutorProfileOfLearner = null; // Other-ego: tutor's mental model of learner
|
|
1814
|
+
let learnerProfileOfTutor = null; // Other-ego: learner's mental model of tutor
|
|
1815
|
+
let strategyPlan = null; // Other-ego: ego's explicit strategy plan
|
|
1816
|
+
|
|
1817
|
+
// Per-dialogue rejection budget: limits total superego rejections across all turns
|
|
1818
|
+
// to prevent worst-case cascade (e.g., 3 rejections × 5 turns = 15 total)
|
|
1819
|
+
let rejectionBudget = rawProfile?.dialogue?.rejection_budget ?? null; // null = unlimited (backwards-compatible)
|
|
1820
|
+
let totalRejections = 0;
|
|
1395
1821
|
|
|
1396
1822
|
// 4. Loop through turns (initial turn 0 + follow-up turns)
|
|
1397
1823
|
const totalTurnCount = 1 + turns.length;
|
|
@@ -1401,6 +1827,19 @@ async function runMultiTurnTest(scenario, config, fullScenario, options = {}) {
|
|
|
1401
1827
|
|
|
1402
1828
|
log(`[evaluationRunner] Turn ${turnIdx}/${totalTurnCount - 1}${isInitialTurn ? ' (initial)' : ` (${turnDef.id})`}`, 'info');
|
|
1403
1829
|
|
|
1830
|
+
// Update run metadata with current turn progress for `runs` command
|
|
1831
|
+
if (runId) {
|
|
1832
|
+
evaluationStore.updateRun(runId, {
|
|
1833
|
+
metadata: {
|
|
1834
|
+
turnProgress: {
|
|
1835
|
+
current: turnIdx + 1,
|
|
1836
|
+
total: totalTurnCount,
|
|
1837
|
+
scenarioId: scenario.id,
|
|
1838
|
+
}
|
|
1839
|
+
}
|
|
1840
|
+
});
|
|
1841
|
+
}
|
|
1842
|
+
|
|
1404
1843
|
// Show learner action in transcript mode (for follow-up turns)
|
|
1405
1844
|
if (!isInitialTurn && dialogueEngine.isTranscriptMode()) {
|
|
1406
1845
|
dialogueEngine.transcript('LEARNER ACTION', formatLearnerActionForTranscript(turnDef));
|
|
@@ -1420,11 +1859,16 @@ async function runMultiTurnTest(scenario, config, fullScenario, options = {}) {
|
|
|
1420
1859
|
learnerMessage: turnDef.action_details?.message,
|
|
1421
1860
|
});
|
|
1422
1861
|
|
|
1862
|
+
// Build learner trajectory assessment from accumulated turn data
|
|
1863
|
+
const learnerTrajectory = analyzeLearnerTrajectory(turnResults, conversationHistory);
|
|
1864
|
+
|
|
1423
1865
|
contextStr = buildMultiTurnContext({
|
|
1424
1866
|
originalContext: fullScenario.learner_context,
|
|
1425
1867
|
conversationHistory,
|
|
1426
1868
|
currentTurn: turnDef,
|
|
1427
1869
|
previousSuggestion,
|
|
1870
|
+
priorSuperegoAssessments,
|
|
1871
|
+
learnerTrajectory,
|
|
1428
1872
|
});
|
|
1429
1873
|
}
|
|
1430
1874
|
|
|
@@ -1455,11 +1899,46 @@ async function runMultiTurnTest(scenario, config, fullScenario, options = {}) {
|
|
|
1455
1899
|
: (turnDef.forbidden_elements || []),
|
|
1456
1900
|
};
|
|
1457
1901
|
|
|
1902
|
+
// Build the ego prompt extension: erosion frame + session evolution (reflections)
|
|
1903
|
+
let fullEgoExtension = sessionEvolution;
|
|
1904
|
+
if (promptErosionEnabled && turnIdx > 0) {
|
|
1905
|
+
const erosionFrame = promptRewriter.buildPromptErosionFrame(turnIdx, rawProfile);
|
|
1906
|
+
if (erosionFrame) {
|
|
1907
|
+
// Erosion frame goes BEFORE reflections, so the model sees authority calibration first
|
|
1908
|
+
fullEgoExtension = erosionFrame + (sessionEvolution ? '\n\n' + sessionEvolution : '');
|
|
1909
|
+
log(`[evaluationRunner] Prompt erosion frame applied for turn ${turnIdx} (rate=${rawProfile.prompt_rewriting?.prompt_erosion?.rate ?? 0.2})`, 'info');
|
|
1910
|
+
}
|
|
1911
|
+
}
|
|
1912
|
+
|
|
1913
|
+
// Append other-ego profile and strategy plan to ego extension
|
|
1914
|
+
// Injection order: erosion frame → self-reflection → other-ego profile → strategy plan
|
|
1915
|
+
if (otherEgoProfilingEnabled && tutorProfileOfLearner) {
|
|
1916
|
+
const profileBlock = promptRewriter.formatProfileForInjection(tutorProfileOfLearner, 'learner');
|
|
1917
|
+
fullEgoExtension = (fullEgoExtension ? fullEgoExtension + '\n\n' : '') + profileBlock;
|
|
1918
|
+
}
|
|
1919
|
+
if (strategyPlanningEnabled && strategyPlan) {
|
|
1920
|
+
fullEgoExtension = (fullEgoExtension ? fullEgoExtension + '\n\n' : '') + strategyPlan;
|
|
1921
|
+
}
|
|
1922
|
+
|
|
1923
|
+
// Build the superego prompt extension: erosion frame + superego evolution (reflections)
|
|
1924
|
+
let fullSuperegoExtension = superegoEvolution;
|
|
1925
|
+
if (promptErosionEnabled && turnIdx > 0 && superegoEvolution) {
|
|
1926
|
+
const erosionFrame = promptRewriter.buildPromptErosionFrame(turnIdx, rawProfile);
|
|
1927
|
+
if (erosionFrame) {
|
|
1928
|
+
fullSuperegoExtension = erosionFrame + '\n\n' + superegoEvolution;
|
|
1929
|
+
}
|
|
1930
|
+
}
|
|
1931
|
+
|
|
1458
1932
|
// Call the SAME generation+evaluation code path as single-turn
|
|
1459
1933
|
// Pass dialogue context so the judge can see the full exchange
|
|
1934
|
+
// When rejection budget is exhausted, also skip outer superego review loop (maxRounds: 0)
|
|
1935
|
+
const budgetExhausted = rejectionBudget !== null && totalRejections >= rejectionBudget;
|
|
1460
1936
|
const turnOptions = {
|
|
1461
1937
|
...sharedTurnOptions,
|
|
1462
|
-
...(
|
|
1938
|
+
...(fullEgoExtension ? { systemPromptExtension: fullEgoExtension } : {}),
|
|
1939
|
+
...(fullSuperegoExtension ? { superegoPromptExtension: fullSuperegoExtension } : {}),
|
|
1940
|
+
...(behavioralOverrides ? { behavioralOverrides } : {}),
|
|
1941
|
+
...(budgetExhausted ? { maxRounds: 0 } : {}),
|
|
1463
1942
|
conversationHistory: conversationHistory.length > 0 ? conversationHistory : null,
|
|
1464
1943
|
consolidatedTrace: consolidatedTrace.length > 0 ? consolidatedTrace : null,
|
|
1465
1944
|
};
|
|
@@ -1468,7 +1947,7 @@ async function runMultiTurnTest(scenario, config, fullScenario, options = {}) {
|
|
|
1468
1947
|
|
|
1469
1948
|
if (!genResult.success) {
|
|
1470
1949
|
const turnId = isInitialTurn ? 'initial' : turnDef.id;
|
|
1471
|
-
throw new Error(`Multi-turn scenario ${scenario.id}: Turn ${turnIdx} (${turnId}) failed to generate suggestions`);
|
|
1950
|
+
throw new Error(`Multi-turn scenario ${scenario.id}: Turn ${turnIdx} (${turnId}) failed to generate suggestions: ${genResult.error || 'unknown error'}`);
|
|
1472
1951
|
}
|
|
1473
1952
|
|
|
1474
1953
|
// Accumulate dialogue traces
|
|
@@ -1506,6 +1985,39 @@ async function runMultiTurnTest(scenario, config, fullScenario, options = {}) {
|
|
|
1506
1985
|
}
|
|
1507
1986
|
}
|
|
1508
1987
|
|
|
1988
|
+
// Flush transcript: ego/superego exchange for this turn
|
|
1989
|
+
flushTranscript();
|
|
1990
|
+
|
|
1991
|
+
// Accumulate cross-turn superego memory from this turn's trace
|
|
1992
|
+
if (genResult.dialogueTrace && genResult.dialogueTrace.length > 0) {
|
|
1993
|
+
const assessment = extractTurnSuperegoAssessment(turnIdx, genResult.dialogueTrace);
|
|
1994
|
+
if (assessment) {
|
|
1995
|
+
priorSuperegoAssessments.push(assessment);
|
|
1996
|
+
}
|
|
1997
|
+
}
|
|
1998
|
+
|
|
1999
|
+
// Track rejection budget across turns: count superego rejections in this turn's trace
|
|
2000
|
+
if (rejectionBudget !== null && genResult.dialogueTrace) {
|
|
2001
|
+
const turnRejections = genResult.dialogueTrace.filter(
|
|
2002
|
+
entry => entry.agent === 'superego' && entry.action === 'review' && entry.approved === false
|
|
2003
|
+
).length;
|
|
2004
|
+
totalRejections += turnRejections;
|
|
2005
|
+
|
|
2006
|
+
if (totalRejections >= rejectionBudget) {
|
|
2007
|
+
// Budget exhausted: force approve-only mode for remaining turns
|
|
2008
|
+
behavioralOverrides = { ...(behavioralOverrides || {}), max_rejections: 0 };
|
|
2009
|
+
log(`[evaluationRunner] Rejection budget exhausted (${totalRejections}/${rejectionBudget}): forcing approve-only for remaining turns`, 'info');
|
|
2010
|
+
consolidatedTrace.push({
|
|
2011
|
+
agent: 'rejection_budget',
|
|
2012
|
+
action: 'exhausted',
|
|
2013
|
+
turnIndex: turnIdx,
|
|
2014
|
+
contextSummary: `Budget exhausted: ${totalRejections}/${rejectionBudget} rejections used`,
|
|
2015
|
+
detail: `Total rejections across ${turnIdx + 1} turns: ${totalRejections}. Remaining turns will auto-approve.`,
|
|
2016
|
+
timestamp: new Date().toISOString(),
|
|
2017
|
+
});
|
|
2018
|
+
}
|
|
2019
|
+
}
|
|
2020
|
+
|
|
1509
2021
|
// Collect per-turn result
|
|
1510
2022
|
turnResults.push({
|
|
1511
2023
|
turnIndex: turnIdx,
|
|
@@ -1546,41 +2058,368 @@ async function runMultiTurnTest(scenario, config, fullScenario, options = {}) {
|
|
|
1546
2058
|
// Update for next iteration
|
|
1547
2059
|
previousSuggestion = suggestion;
|
|
1548
2060
|
|
|
1549
|
-
//
|
|
1550
|
-
|
|
1551
|
-
|
|
1552
|
-
|
|
1553
|
-
|
|
1554
|
-
|
|
2061
|
+
// ── Between-turn processing ──────────────────────────────────────────
|
|
2062
|
+
// Parallelized into groups by dependency:
|
|
2063
|
+
// Group 1 (independent): ego self-refl, superego self-refl, tutor profile, learner profile
|
|
2064
|
+
// Group 2 (depends on group 1): intersubjective, quantitative parse, strategy plan
|
|
2065
|
+
// Group 3 (depends on group 2): learner generation
|
|
2066
|
+
// This collapses ~6-8 sequential LLM calls into ~3 parallel rounds.
|
|
2067
|
+
|
|
2068
|
+
if (turnIdx < totalTurnCount - 1) {
|
|
2069
|
+
const betweenTurnStart = Date.now();
|
|
2070
|
+
|
|
2071
|
+
// ── Group 1: Independent LLM calls in parallel ──────────────────
|
|
2072
|
+
const group1Promises = [];
|
|
2073
|
+
const group1Labels = [];
|
|
2074
|
+
|
|
2075
|
+
// Ego self-reflection / prompt rewriting
|
|
2076
|
+
if (promptRewritingEnabled) {
|
|
2077
|
+
if (promptRewritingStrategy === 'self_reflection') {
|
|
2078
|
+
group1Promises.push(
|
|
2079
|
+
promptRewriter.synthesizeEgoSelfReflection({
|
|
2080
|
+
turnResults,
|
|
2081
|
+
consolidatedTrace,
|
|
2082
|
+
conversationHistory,
|
|
2083
|
+
config: rawProfile,
|
|
2084
|
+
}).catch(error => {
|
|
2085
|
+
log(`[evaluationRunner] Ego self-reflection failed, will fall back to template: ${error.message}`, 'warn');
|
|
2086
|
+
return null;
|
|
2087
|
+
})
|
|
2088
|
+
);
|
|
2089
|
+
group1Labels.push('ego_self_reflection');
|
|
2090
|
+
} else if (promptRewritingStrategy === 'llm') {
|
|
2091
|
+
group1Promises.push(
|
|
2092
|
+
promptRewriter.synthesizeDirectivesLLM({
|
|
2093
|
+
turnResults,
|
|
2094
|
+
consolidatedTrace,
|
|
2095
|
+
conversationHistory,
|
|
2096
|
+
config: rawProfile,
|
|
2097
|
+
}).catch(error => {
|
|
2098
|
+
log(`[evaluationRunner] LLM rewriter failed, will fall back to template: ${error.message}`, 'warn');
|
|
2099
|
+
return null;
|
|
2100
|
+
})
|
|
2101
|
+
);
|
|
2102
|
+
group1Labels.push('llm_rewrite');
|
|
2103
|
+
}
|
|
2104
|
+
}
|
|
2105
|
+
|
|
2106
|
+
// Superego self-reflection / disposition rewriting
|
|
2107
|
+
if (superegoDispositionRewriting) {
|
|
2108
|
+
if (promptRewritingStrategy === 'self_reflection') {
|
|
2109
|
+
group1Promises.push(
|
|
2110
|
+
promptRewriter.synthesizeSupergoSelfReflection({
|
|
2111
|
+
turnResults,
|
|
2112
|
+
consolidatedTrace,
|
|
2113
|
+
conversationHistory,
|
|
2114
|
+
priorSuperegoAssessments,
|
|
2115
|
+
config: rawProfile,
|
|
2116
|
+
}).catch(error => {
|
|
2117
|
+
log(`[evaluationRunner] Superego self-reflection failed: ${error.message}`, 'warn');
|
|
2118
|
+
return null;
|
|
2119
|
+
})
|
|
2120
|
+
);
|
|
2121
|
+
group1Labels.push('superego_self_reflection');
|
|
2122
|
+
} else {
|
|
2123
|
+
group1Promises.push(
|
|
2124
|
+
promptRewriter.synthesizeSuperegoDisposition({
|
|
2125
|
+
turnResults,
|
|
2126
|
+
consolidatedTrace,
|
|
2127
|
+
conversationHistory,
|
|
2128
|
+
priorSuperegoAssessments,
|
|
2129
|
+
config: rawProfile,
|
|
2130
|
+
}).catch(error => {
|
|
2131
|
+
log(`[evaluationRunner] Superego disposition rewriting failed: ${error.message}`, 'warn');
|
|
2132
|
+
return null;
|
|
2133
|
+
})
|
|
2134
|
+
);
|
|
2135
|
+
group1Labels.push('superego_disposition');
|
|
2136
|
+
}
|
|
2137
|
+
}
|
|
2138
|
+
|
|
2139
|
+
// Tutor profiles learner (Theory of Mind)
|
|
2140
|
+
if (otherEgoProfilingEnabled) {
|
|
2141
|
+
group1Promises.push(
|
|
2142
|
+
promptRewriter.synthesizeTutorProfileOfLearner({
|
|
1555
2143
|
turnResults,
|
|
1556
2144
|
consolidatedTrace,
|
|
1557
2145
|
conversationHistory,
|
|
2146
|
+
priorProfile: tutorProfileOfLearner,
|
|
1558
2147
|
config: rawProfile,
|
|
1559
|
-
})
|
|
2148
|
+
}).catch(error => {
|
|
2149
|
+
log(`[evaluationRunner] Tutor profile of learner failed: ${error.message}`, 'warn');
|
|
2150
|
+
return null;
|
|
2151
|
+
})
|
|
2152
|
+
);
|
|
2153
|
+
group1Labels.push('tutor_profile');
|
|
2154
|
+
}
|
|
2155
|
+
|
|
2156
|
+
// Learner profiles tutor (bidirectional Theory of Mind)
|
|
2157
|
+
if (otherEgoProfilingEnabled && otherEgoBidirectional) {
|
|
2158
|
+
group1Promises.push(
|
|
2159
|
+
promptRewriter.synthesizeLearnerProfileOfTutor({
|
|
2160
|
+
turnResults,
|
|
2161
|
+
consolidatedTrace,
|
|
2162
|
+
conversationHistory,
|
|
2163
|
+
priorProfile: learnerProfileOfTutor,
|
|
2164
|
+
config: rawProfile,
|
|
2165
|
+
}).catch(error => {
|
|
2166
|
+
log(`[evaluationRunner] Learner profile of tutor failed: ${error.message}`, 'warn');
|
|
2167
|
+
return null;
|
|
2168
|
+
})
|
|
2169
|
+
);
|
|
2170
|
+
group1Labels.push('learner_profile');
|
|
2171
|
+
}
|
|
2172
|
+
|
|
2173
|
+
// Fire all group 1 calls in parallel
|
|
2174
|
+
const group1Results = await Promise.all(group1Promises);
|
|
2175
|
+
const group1Map = {};
|
|
2176
|
+
group1Labels.forEach((label, i) => { group1Map[label] = group1Results[i]; });
|
|
2177
|
+
|
|
2178
|
+
// ── Process group 1 results ─────────────────────────────────────
|
|
2179
|
+
|
|
2180
|
+
// Ego self-reflection / prompt rewriting result
|
|
2181
|
+
if (promptRewritingEnabled) {
|
|
2182
|
+
if (promptRewritingStrategy === 'self_reflection') {
|
|
2183
|
+
const egoReflResult = group1Map['ego_self_reflection'];
|
|
2184
|
+
sessionEvolution = egoReflResult?.text ?? null;
|
|
2185
|
+
if (sessionEvolution) {
|
|
2186
|
+
log(`[evaluationRunner] Ego self-reflection generated for turn ${turnIdx + 1}`, 'info');
|
|
2187
|
+
consolidatedTrace.push({
|
|
2188
|
+
agent: 'ego_self_reflection',
|
|
2189
|
+
action: 'rewrite',
|
|
2190
|
+
turnIndex: turnIdx,
|
|
2191
|
+
contextSummary: `Ego self-reflection generated for turn ${turnIdx + 1}`,
|
|
2192
|
+
detail: sessionEvolution,
|
|
2193
|
+
metrics: egoReflResult?.metrics ?? null,
|
|
2194
|
+
timestamp: new Date().toISOString(),
|
|
2195
|
+
});
|
|
2196
|
+
} else {
|
|
2197
|
+
log(`[evaluationRunner] Ego self-reflection returned empty, falling back to template for turn ${turnIdx + 1}`, 'warn');
|
|
2198
|
+
sessionEvolution = promptRewriter.synthesizeDirectives({
|
|
2199
|
+
turnResults,
|
|
2200
|
+
consolidatedTrace,
|
|
2201
|
+
conversationHistory,
|
|
2202
|
+
});
|
|
2203
|
+
}
|
|
2204
|
+
} else if (promptRewritingStrategy === 'llm') {
|
|
2205
|
+
const llmResult = group1Map['llm_rewrite'];
|
|
2206
|
+
sessionEvolution = llmResult?.text ?? null;
|
|
1560
2207
|
if (sessionEvolution) {
|
|
1561
2208
|
log(`[evaluationRunner] LLM rewriter generated directives for turn ${turnIdx + 1}`, 'info');
|
|
2209
|
+
} else {
|
|
2210
|
+
log(`[evaluationRunner] LLM rewriter returned empty, falling back to template for turn ${turnIdx + 1}`, 'warn');
|
|
2211
|
+
sessionEvolution = promptRewriter.synthesizeDirectives({
|
|
2212
|
+
turnResults,
|
|
2213
|
+
consolidatedTrace,
|
|
2214
|
+
conversationHistory,
|
|
2215
|
+
});
|
|
1562
2216
|
}
|
|
1563
|
-
}
|
|
1564
|
-
|
|
2217
|
+
} else {
|
|
2218
|
+
// Template-based directive synthesis (deterministic, no LLM call)
|
|
1565
2219
|
sessionEvolution = promptRewriter.synthesizeDirectives({
|
|
1566
2220
|
turnResults,
|
|
1567
2221
|
consolidatedTrace,
|
|
1568
2222
|
conversationHistory,
|
|
1569
2223
|
});
|
|
1570
2224
|
}
|
|
1571
|
-
|
|
1572
|
-
|
|
1573
|
-
|
|
1574
|
-
turnResults,
|
|
1575
|
-
consolidatedTrace,
|
|
1576
|
-
conversationHistory,
|
|
1577
|
-
});
|
|
2225
|
+
if (sessionEvolution) {
|
|
2226
|
+
log(`[evaluationRunner] Prompt rewriter (${promptRewritingStrategy}) generated ${sessionEvolution.split('\n').length - 2} directives for turn ${turnIdx + 1}`, 'info');
|
|
2227
|
+
}
|
|
1578
2228
|
}
|
|
1579
|
-
|
|
1580
|
-
|
|
2229
|
+
|
|
2230
|
+
// Superego self-reflection / disposition result
|
|
2231
|
+
if (superegoDispositionRewriting) {
|
|
2232
|
+
if (promptRewritingStrategy === 'self_reflection') {
|
|
2233
|
+
const seReflResult = group1Map['superego_self_reflection'];
|
|
2234
|
+
superegoEvolution = seReflResult?.text ?? null;
|
|
2235
|
+
if (superegoEvolution) {
|
|
2236
|
+
log(`[evaluationRunner] Superego self-reflection generated for turn ${turnIdx + 1}`, 'info');
|
|
2237
|
+
consolidatedTrace.push({
|
|
2238
|
+
agent: 'superego_self_reflection',
|
|
2239
|
+
action: 'rewrite',
|
|
2240
|
+
turnIndex: turnIdx,
|
|
2241
|
+
contextSummary: `Superego self-reflection generated for turn ${turnIdx + 1}`,
|
|
2242
|
+
detail: superegoEvolution,
|
|
2243
|
+
metrics: seReflResult?.metrics ?? null,
|
|
2244
|
+
timestamp: new Date().toISOString(),
|
|
2245
|
+
});
|
|
2246
|
+
} else {
|
|
2247
|
+
// Self-reflection returned empty — fall back to LLM disposition rewriting
|
|
2248
|
+
log(`[evaluationRunner] Superego self-reflection returned empty, falling back to LLM disposition for turn ${turnIdx + 1}`, 'warn');
|
|
2249
|
+
try {
|
|
2250
|
+
const dispFallback = await promptRewriter.synthesizeSuperegoDisposition({
|
|
2251
|
+
turnResults,
|
|
2252
|
+
consolidatedTrace,
|
|
2253
|
+
conversationHistory,
|
|
2254
|
+
priorSuperegoAssessments,
|
|
2255
|
+
config: rawProfile,
|
|
2256
|
+
});
|
|
2257
|
+
superegoEvolution = dispFallback?.text ?? null;
|
|
2258
|
+
} catch (error) {
|
|
2259
|
+
log(`[evaluationRunner] Superego disposition fallback also failed: ${error.message}`, 'warn');
|
|
2260
|
+
}
|
|
2261
|
+
}
|
|
2262
|
+
} else {
|
|
2263
|
+
const dispResult = group1Map['superego_disposition'];
|
|
2264
|
+
superegoEvolution = dispResult?.text ?? null;
|
|
2265
|
+
if (superegoEvolution) {
|
|
2266
|
+
log(`[evaluationRunner] Superego disposition rewriter generated evolution for turn ${turnIdx + 1}`, 'info');
|
|
2267
|
+
consolidatedTrace.push({
|
|
2268
|
+
agent: 'superego_disposition',
|
|
2269
|
+
action: 'rewrite',
|
|
2270
|
+
turnIndex: turnIdx,
|
|
2271
|
+
contextSummary: `Disposition evolution generated for turn ${turnIdx + 1}`,
|
|
2272
|
+
detail: superegoEvolution,
|
|
2273
|
+
metrics: dispResult?.metrics ?? null,
|
|
2274
|
+
timestamp: new Date().toISOString(),
|
|
2275
|
+
});
|
|
2276
|
+
}
|
|
2277
|
+
}
|
|
1581
2278
|
}
|
|
2279
|
+
|
|
2280
|
+
// Tutor profile of learner result
|
|
2281
|
+
if (otherEgoProfilingEnabled) {
|
|
2282
|
+
const tutorProfResult = group1Map['tutor_profile'];
|
|
2283
|
+
if (tutorProfResult?.text) {
|
|
2284
|
+
tutorProfileOfLearner = tutorProfResult.text;
|
|
2285
|
+
log(`[evaluationRunner] Tutor profile of learner generated for turn ${turnIdx + 1}`, 'info');
|
|
2286
|
+
consolidatedTrace.push({
|
|
2287
|
+
agent: 'tutor_other_ego',
|
|
2288
|
+
action: 'profile_learner',
|
|
2289
|
+
turnIndex: turnIdx,
|
|
2290
|
+
contextSummary: `Tutor built mental model of learner after turn ${turnIdx + 1}`,
|
|
2291
|
+
detail: tutorProfileOfLearner,
|
|
2292
|
+
metrics: tutorProfResult.metrics ?? null,
|
|
2293
|
+
timestamp: new Date().toISOString(),
|
|
2294
|
+
});
|
|
2295
|
+
}
|
|
2296
|
+
}
|
|
2297
|
+
|
|
2298
|
+
// Learner profile of tutor result
|
|
2299
|
+
if (otherEgoProfilingEnabled && otherEgoBidirectional) {
|
|
2300
|
+
const learnerProfResult = group1Map['learner_profile'];
|
|
2301
|
+
if (learnerProfResult?.text) {
|
|
2302
|
+
learnerProfileOfTutor = learnerProfResult.text;
|
|
2303
|
+
log(`[evaluationRunner] Learner profile of tutor generated for turn ${turnIdx + 1}`, 'info');
|
|
2304
|
+
consolidatedTrace.push({
|
|
2305
|
+
agent: 'learner_other_ego',
|
|
2306
|
+
action: 'profile_tutor',
|
|
2307
|
+
turnIndex: turnIdx,
|
|
2308
|
+
contextSummary: `Learner built mental model of tutor after turn ${turnIdx + 1}`,
|
|
2309
|
+
detail: learnerProfileOfTutor,
|
|
2310
|
+
metrics: learnerProfResult.metrics ?? null,
|
|
2311
|
+
timestamp: new Date().toISOString(),
|
|
2312
|
+
});
|
|
2313
|
+
}
|
|
2314
|
+
}
|
|
2315
|
+
|
|
2316
|
+
// ── Group 2: Dependent on group 1 results ──────────────────────
|
|
2317
|
+
const group2Promises = [];
|
|
2318
|
+
const group2Labels = [];
|
|
2319
|
+
|
|
2320
|
+
// Parse quantitative behavioral parameters (sync — no LLM call)
|
|
2321
|
+
if (quantitativeDispositionEnabled && superegoEvolution) {
|
|
2322
|
+
const parsed = promptRewriter.parseBehavioralParameters(superegoEvolution);
|
|
2323
|
+
if (parsed) {
|
|
2324
|
+
behavioralOverrides = parsed;
|
|
2325
|
+
log(`[evaluationRunner] Behavioral overrides parsed: threshold=${parsed.rejection_threshold}, max_rejections=${parsed.max_rejections}, priority=[${parsed.priority_criteria.join(',')}], deprioritized=[${parsed.deprioritized_criteria.join(',')}]`, 'info');
|
|
2326
|
+
consolidatedTrace.push({
|
|
2327
|
+
agent: 'behavioral_overrides',
|
|
2328
|
+
action: 'parse',
|
|
2329
|
+
turnIndex: turnIdx,
|
|
2330
|
+
contextSummary: `Quantitative behavioral params: threshold=${parsed.rejection_threshold}, max=${parsed.max_rejections}`,
|
|
2331
|
+
detail: JSON.stringify(parsed),
|
|
2332
|
+
timestamp: new Date().toISOString(),
|
|
2333
|
+
});
|
|
2334
|
+
} else {
|
|
2335
|
+
log(`[evaluationRunner] No behavioral parameters found in superego reflection for turn ${turnIdx + 1} (quantitative_disposition enabled but no <behavioral_parameters> block)`, 'warn');
|
|
2336
|
+
}
|
|
2337
|
+
}
|
|
2338
|
+
|
|
2339
|
+
// Intersubjective recognition (depends on ego + superego self-reflections)
|
|
2340
|
+
if (intersubjectiveEnabled && superegoEvolution) {
|
|
2341
|
+
group2Promises.push(
|
|
2342
|
+
promptRewriter.synthesizeEgoResponseToSuperego({
|
|
2343
|
+
superegoReflection: superegoEvolution,
|
|
2344
|
+
egoReflection: sessionEvolution,
|
|
2345
|
+
turnResults,
|
|
2346
|
+
conversationHistory,
|
|
2347
|
+
config: rawProfile,
|
|
2348
|
+
}).catch(error => {
|
|
2349
|
+
log(`[evaluationRunner] Intersubjective ego response failed: ${error.message}`, 'warn');
|
|
2350
|
+
return null;
|
|
2351
|
+
})
|
|
2352
|
+
);
|
|
2353
|
+
group2Labels.push('intersubjective');
|
|
2354
|
+
}
|
|
2355
|
+
|
|
2356
|
+
// Strategy planning (depends on tutor profile)
|
|
2357
|
+
if (strategyPlanningEnabled && tutorProfileOfLearner) {
|
|
2358
|
+
group2Promises.push(
|
|
2359
|
+
promptRewriter.synthesizeStrategyPlan({
|
|
2360
|
+
learnerProfile: tutorProfileOfLearner,
|
|
2361
|
+
turnResults,
|
|
2362
|
+
conversationHistory,
|
|
2363
|
+
config: rawProfile,
|
|
2364
|
+
}).catch(error => {
|
|
2365
|
+
log(`[evaluationRunner] Strategy plan failed: ${error.message}`, 'warn');
|
|
2366
|
+
return null;
|
|
2367
|
+
})
|
|
2368
|
+
);
|
|
2369
|
+
group2Labels.push('strategy');
|
|
2370
|
+
}
|
|
2371
|
+
|
|
2372
|
+
// Fire group 2 in parallel (intersubjective + strategy are independent of each other)
|
|
2373
|
+
if (group2Promises.length > 0) {
|
|
2374
|
+
const group2Results = await Promise.all(group2Promises);
|
|
2375
|
+
const group2Map = {};
|
|
2376
|
+
group2Labels.forEach((label, i) => { group2Map[label] = group2Results[i]; });
|
|
2377
|
+
|
|
2378
|
+
// Process intersubjective result
|
|
2379
|
+
if (group2Map['intersubjective']) {
|
|
2380
|
+
const egoResponseText = group2Map['intersubjective']?.text ?? null;
|
|
2381
|
+
if (egoResponseText) {
|
|
2382
|
+
sessionEvolution = sessionEvolution
|
|
2383
|
+
? sessionEvolution + '\n\n' + egoResponseText
|
|
2384
|
+
: egoResponseText;
|
|
2385
|
+
log(`[evaluationRunner] Intersubjective ego response to superego generated for turn ${turnIdx + 1}`, 'info');
|
|
2386
|
+
consolidatedTrace.push({
|
|
2387
|
+
agent: 'ego_intersubjective',
|
|
2388
|
+
action: 'respond_to_critic',
|
|
2389
|
+
turnIndex: turnIdx,
|
|
2390
|
+
contextSummary: `Ego responded to superego's self-reflection for turn ${turnIdx + 1}`,
|
|
2391
|
+
detail: egoResponseText,
|
|
2392
|
+
metrics: group2Map['intersubjective']?.metrics ?? null,
|
|
2393
|
+
timestamp: new Date().toISOString(),
|
|
2394
|
+
});
|
|
2395
|
+
}
|
|
2396
|
+
}
|
|
2397
|
+
|
|
2398
|
+
// Process strategy plan result
|
|
2399
|
+
if (group2Map['strategy']) {
|
|
2400
|
+
strategyPlan = group2Map['strategy']?.text ?? null;
|
|
2401
|
+
if (strategyPlan) {
|
|
2402
|
+
log(`[evaluationRunner] Strategy plan generated for turn ${turnIdx + 1}`, 'info');
|
|
2403
|
+
consolidatedTrace.push({
|
|
2404
|
+
agent: 'ego_strategy',
|
|
2405
|
+
action: 'plan',
|
|
2406
|
+
turnIndex: turnIdx,
|
|
2407
|
+
contextSummary: `Ego formulated strategy plan for turn ${turnIdx + 1}`,
|
|
2408
|
+
detail: strategyPlan,
|
|
2409
|
+
metrics: group2Map['strategy']?.metrics ?? null,
|
|
2410
|
+
timestamp: new Date().toISOString(),
|
|
2411
|
+
});
|
|
2412
|
+
}
|
|
2413
|
+
}
|
|
2414
|
+
}
|
|
2415
|
+
|
|
2416
|
+
const betweenTurnMs = Date.now() - betweenTurnStart;
|
|
2417
|
+
log(`[evaluationRunner] Between-turn processing completed in ${(betweenTurnMs / 1000).toFixed(1)}s (${group1Labels.length} parallel group-1, ${group2Labels.length} parallel group-2)`, 'info');
|
|
1582
2418
|
}
|
|
1583
2419
|
|
|
2420
|
+
// Flush transcript: reflections (self-reflection, disposition, profiling, etc.)
|
|
2421
|
+
flushTranscript();
|
|
2422
|
+
|
|
1584
2423
|
// Generate LLM learner response for next turn if ego_superego architecture
|
|
1585
2424
|
// Note: check includes() to handle both 'ego_superego' and 'ego_superego_recognition'
|
|
1586
2425
|
if (resolvedConfig.learnerArchitecture?.includes('ego_superego') && turnIdx < totalTurnCount - 1) {
|
|
@@ -1595,7 +2434,10 @@ async function runMultiTurnTest(scenario, config, fullScenario, options = {}) {
|
|
|
1595
2434
|
})),
|
|
1596
2435
|
learnerProfile: resolvedConfig.learnerArchitecture,
|
|
1597
2436
|
personaId: fullScenario.learner_persona || 'eager_novice',
|
|
1598
|
-
modelOverride: config.modelOverride || null,
|
|
2437
|
+
modelOverride: config.learnerModelOverride || config.modelOverride || null,
|
|
2438
|
+
profileContext: (otherEgoBidirectional && learnerProfileOfTutor)
|
|
2439
|
+
? promptRewriter.formatProfileForInjection(learnerProfileOfTutor, 'tutor')
|
|
2440
|
+
: null,
|
|
1599
2441
|
});
|
|
1600
2442
|
|
|
1601
2443
|
// Override scripted message with LLM-generated one
|
|
@@ -1633,10 +2475,32 @@ async function runMultiTurnTest(scenario, config, fullScenario, options = {}) {
|
|
|
1633
2475
|
}
|
|
1634
2476
|
|
|
1635
2477
|
log(`[evaluationRunner] Generated LLM learner response (ego_superego): "${learnerResponse.message.substring(0, 80)}..."`, 'info');
|
|
2478
|
+
|
|
2479
|
+
// Flush transcript: learner deliberation
|
|
2480
|
+
flushTranscript();
|
|
1636
2481
|
}
|
|
1637
2482
|
}
|
|
1638
2483
|
}
|
|
1639
2484
|
|
|
2485
|
+
// Clear turn progress from run metadata now that all turns are complete
|
|
2486
|
+
if (runId) {
|
|
2487
|
+
evaluationStore.updateRun(runId, {
|
|
2488
|
+
metadata: { turnProgress: null }
|
|
2489
|
+
});
|
|
2490
|
+
}
|
|
2491
|
+
|
|
2492
|
+
// Write complete transcript file at end (for post-hoc viewing)
|
|
2493
|
+
if (transcriptMode && transcriptPath) {
|
|
2494
|
+
const fullTranscript = formatTranscript(consolidatedTrace, {
|
|
2495
|
+
detail: 'play',
|
|
2496
|
+
scenarioName: fullScenario.name || scenario.id,
|
|
2497
|
+
profileName: config.profileName,
|
|
2498
|
+
totalTurns: turnResults.length,
|
|
2499
|
+
});
|
|
2500
|
+
fs.writeFileSync(transcriptPath, fullTranscript);
|
|
2501
|
+
log(`[evaluationRunner] Transcript written: ${transcriptPath}`, 'info');
|
|
2502
|
+
}
|
|
2503
|
+
|
|
1640
2504
|
// 5. Aggregate scores across turns
|
|
1641
2505
|
const validTurnScores = turnResults.filter(t => t.turnScore !== null).map(t => t.turnScore);
|
|
1642
2506
|
const overallScore = validTurnScores.length > 0
|
|
@@ -1792,7 +2656,7 @@ async function runMultiTurnTest(scenario, config, fullScenario, options = {}) {
|
|
|
1792
2656
|
superegoModel: resolvedConfig.superegoModel
|
|
1793
2657
|
? `${resolvedConfig.superegoModel.provider}.${resolvedConfig.superegoModel.model}`
|
|
1794
2658
|
: null,
|
|
1795
|
-
hyperparameters: config.hyperparameters,
|
|
2659
|
+
hyperparameters: resolvedConfig.hyperparameters || config.hyperparameters,
|
|
1796
2660
|
suggestions: turnResults.map(t => t.suggestion).filter(Boolean),
|
|
1797
2661
|
success: true,
|
|
1798
2662
|
latencyMs: totalLatencyMs,
|
|
@@ -1876,6 +2740,7 @@ export async function resumeEvaluation(options = {}) {
|
|
|
1876
2740
|
const runsPerConfig = metadata.runsPerConfig || 1;
|
|
1877
2741
|
const skipRubricEval = metadata.skipRubricEval || false;
|
|
1878
2742
|
const modelOverride = metadata.modelOverride || null;
|
|
2743
|
+
const learnerModelOverride = metadata.learnerModelOverride || null;
|
|
1879
2744
|
|
|
1880
2745
|
// 3. Get existing results for completion checking
|
|
1881
2746
|
const existingResults = evaluationStore.getResults(runId);
|
|
@@ -1917,10 +2782,13 @@ export async function resumeEvaluation(options = {}) {
|
|
|
1917
2782
|
label: name,
|
|
1918
2783
|
}));
|
|
1919
2784
|
|
|
1920
|
-
// 6. Re-apply
|
|
2785
|
+
// 6. Re-apply model overrides if present in metadata
|
|
1921
2786
|
if (modelOverride) {
|
|
1922
2787
|
targetConfigs = targetConfigs.map(c => ({ ...c, modelOverride }));
|
|
1923
2788
|
}
|
|
2789
|
+
if (learnerModelOverride) {
|
|
2790
|
+
targetConfigs = targetConfigs.map(c => ({ ...c, learnerModelOverride }));
|
|
2791
|
+
}
|
|
1924
2792
|
|
|
1925
2793
|
// 6. Count successful results per (profile, scenario) combo and fill up to runsPerConfig.
|
|
1926
2794
|
// Failed results are excluded so they get retried.
|
|
@@ -1971,6 +2839,7 @@ export async function resumeEvaluation(options = {}) {
|
|
|
1971
2839
|
console.log(` Profiles: ${profileNames.join(', ')}`);
|
|
1972
2840
|
console.log(` Scenarios: ${targetScenarios.length}`);
|
|
1973
2841
|
if (modelOverride) console.log(` Model override: ${modelOverride}`);
|
|
2842
|
+
if (learnerModelOverride) console.log(` Learner model override: ${learnerModelOverride}`);
|
|
1974
2843
|
|
|
1975
2844
|
// Initialize content resolver (same as runEvaluation)
|
|
1976
2845
|
const contentConfig = evalConfigLoader.getContentConfig();
|
|
@@ -2128,29 +2997,36 @@ export async function resumeEvaluation(options = {}) {
|
|
|
2128
2997
|
completedTests++;
|
|
2129
2998
|
log(` ${formatProgress(completedTests, totalRemainingTests, runStartTime)} ${profileLabel} / ${scenario.id}: ERROR - ${error.message}`);
|
|
2130
2999
|
|
|
2131
|
-
//
|
|
2132
|
-
const
|
|
2133
|
-
|
|
2134
|
-
|
|
2135
|
-
|
|
2136
|
-
|
|
2137
|
-
|
|
2138
|
-
|
|
2139
|
-
|
|
2140
|
-
: config.
|
|
2141
|
-
|
|
2142
|
-
|
|
2143
|
-
|
|
2144
|
-
|
|
2145
|
-
|
|
2146
|
-
|
|
2147
|
-
|
|
2148
|
-
|
|
2149
|
-
|
|
2150
|
-
|
|
2151
|
-
|
|
2152
|
-
|
|
2153
|
-
|
|
3000
|
+
// Only store failed results for permanent errors — skip transient/retriable ones
|
|
3001
|
+
const errMsg = error.message || '';
|
|
3002
|
+
const isTransient = /429|rate limit|too many requests|503|502|timeout|ECONNREFUSED|ECONNRESET|ETIMEDOUT|terminated|unavailable|failed to generate suggestions/i.test(errMsg);
|
|
3003
|
+
|
|
3004
|
+
if (!isTransient) {
|
|
3005
|
+
const failedResult = {
|
|
3006
|
+
scenarioId: scenario.id,
|
|
3007
|
+
scenarioName: scenario.name || scenario.id,
|
|
3008
|
+
profileName: config.profileName,
|
|
3009
|
+
provider: config.provider || config.ego?.provider || 'unknown',
|
|
3010
|
+
model: config.model || config.ego?.model || 'unknown',
|
|
3011
|
+
egoModel: config.egoModel
|
|
3012
|
+
? `${config.egoModel.provider}.${config.egoModel.model}`
|
|
3013
|
+
: config.ego ? `${config.ego.provider}.${config.ego.model}` : null,
|
|
3014
|
+
superegoModel: config.superegoModel
|
|
3015
|
+
? `${config.superegoModel.provider}.${config.superegoModel.model}`
|
|
3016
|
+
: config.superego ? `${config.superego.provider}.${config.superego.model}` : null,
|
|
3017
|
+
factors: config.factors || null,
|
|
3018
|
+
learnerArchitecture: config.learnerArchitecture || null,
|
|
3019
|
+
success: false,
|
|
3020
|
+
errorMessage: error.message,
|
|
3021
|
+
};
|
|
3022
|
+
try {
|
|
3023
|
+
evaluationStore.storeResult(runId, failedResult);
|
|
3024
|
+
results.push(failedResult);
|
|
3025
|
+
} catch (storeErr) {
|
|
3026
|
+
log(` [WARNING] Failed to store error result: ${storeErr.message}`);
|
|
3027
|
+
}
|
|
3028
|
+
} else {
|
|
3029
|
+
log(` [SKIPPED] Transient error, not storing empty row (resumable): ${errMsg.substring(0, 100)}`);
|
|
2154
3030
|
}
|
|
2155
3031
|
|
|
2156
3032
|
progressLogger.testError({
|
|
@@ -2287,6 +3163,7 @@ export async function quickTest(config, options = {}) {
|
|
|
2287
3163
|
onLog,
|
|
2288
3164
|
superegoStrategy = null, // Superego intervention strategy
|
|
2289
3165
|
judgeOverride = null, // Override judge model for this run
|
|
3166
|
+
dryRun = false,
|
|
2290
3167
|
} = options;
|
|
2291
3168
|
|
|
2292
3169
|
const scenarios = [evalConfigLoader.listScenarios().find(s => s.id === scenarioId)].filter(Boolean);
|
|
@@ -2294,7 +3171,7 @@ export async function quickTest(config, options = {}) {
|
|
|
2294
3171
|
throw new Error(`Scenario not found: ${scenarioId}`);
|
|
2295
3172
|
}
|
|
2296
3173
|
|
|
2297
|
-
const result = await runSingleTest(scenarios[0], config, { verbose, skipRubricEval, outputSize, onLog, superegoStrategy, judgeOverride });
|
|
3174
|
+
const result = await runSingleTest(scenarios[0], config, { verbose, skipRubricEval, outputSize, onLog, superegoStrategy, judgeOverride, dryRun });
|
|
2298
3175
|
return result;
|
|
2299
3176
|
}
|
|
2300
3177
|
|