@machinespirits/eval 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. package/README.md +91 -9
  2. package/config/eval-settings.yaml +3 -3
  3. package/config/paper-manifest.json +486 -0
  4. package/config/providers.yaml +9 -6
  5. package/config/tutor-agents.yaml +2261 -0
  6. package/content/README.md +23 -0
  7. package/content/courses/479/course.md +53 -0
  8. package/content/courses/479/lecture-1.md +361 -0
  9. package/content/courses/479/lecture-2.md +360 -0
  10. package/content/courses/479/lecture-3.md +655 -0
  11. package/content/courses/479/lecture-4.md +530 -0
  12. package/content/courses/479/lecture-5.md +326 -0
  13. package/content/courses/479/lecture-6.md +346 -0
  14. package/content/courses/479/lecture-7.md +326 -0
  15. package/content/courses/479/lecture-8.md +273 -0
  16. package/content/courses/479/roadmap-slides.md +656 -0
  17. package/content/manifest.yaml +8 -0
  18. package/docs/research/build.sh +44 -20
  19. package/docs/research/figures/figure10.png +0 -0
  20. package/docs/research/figures/figure11.png +0 -0
  21. package/docs/research/figures/figure3.png +0 -0
  22. package/docs/research/figures/figure4.png +0 -0
  23. package/docs/research/figures/figure5.png +0 -0
  24. package/docs/research/figures/figure6.png +0 -0
  25. package/docs/research/figures/figure7.png +0 -0
  26. package/docs/research/figures/figure8.png +0 -0
  27. package/docs/research/figures/figure9.png +0 -0
  28. package/docs/research/header.tex +23 -2
  29. package/docs/research/paper-full.md +941 -285
  30. package/docs/research/paper-short.md +216 -585
  31. package/docs/research/references.bib +132 -0
  32. package/docs/research/slides-header.tex +188 -0
  33. package/docs/research/slides-pptx.md +363 -0
  34. package/docs/research/slides.md +531 -0
  35. package/docs/research/style-reference-pptx.py +199 -0
  36. package/package.json +6 -5
  37. package/scripts/analyze-eval-results.js +69 -17
  38. package/scripts/analyze-mechanism-traces.js +763 -0
  39. package/scripts/analyze-modulation-learning.js +498 -0
  40. package/scripts/analyze-prosthesis.js +144 -0
  41. package/scripts/analyze-run.js +264 -79
  42. package/scripts/assess-transcripts.js +853 -0
  43. package/scripts/browse-transcripts.js +854 -0
  44. package/scripts/check-parse-failures.js +73 -0
  45. package/scripts/code-dialectical-modulation.js +1320 -0
  46. package/scripts/download-data.sh +55 -0
  47. package/scripts/eval-cli.js +106 -18
  48. package/scripts/generate-paper-figures.js +663 -0
  49. package/scripts/generate-paper-figures.py +577 -76
  50. package/scripts/generate-paper-tables.js +299 -0
  51. package/scripts/qualitative-analysis-ai.js +3 -3
  52. package/scripts/render-sequence-diagram.js +694 -0
  53. package/scripts/test-latency.js +210 -0
  54. package/scripts/test-rate-limit.js +95 -0
  55. package/scripts/test-token-budget.js +332 -0
  56. package/scripts/validate-paper-manifest.js +670 -0
  57. package/services/__tests__/evalConfigLoader.test.js +2 -2
  58. package/services/__tests__/learnerRubricEvaluator.test.js +361 -0
  59. package/services/__tests__/learnerTutorInteractionEngine.test.js +326 -0
  60. package/services/evaluationRunner.js +975 -98
  61. package/services/evaluationStore.js +12 -4
  62. package/services/learnerTutorInteractionEngine.js +27 -2
  63. package/services/mockProvider.js +133 -0
  64. package/services/promptRewriter.js +1471 -5
  65. package/services/rubricEvaluator.js +55 -2
  66. package/services/transcriptFormatter.js +675 -0
  67. package/docs/EVALUATION-VARIABLES.md +0 -589
  68. package/docs/REPLICATION-PLAN.md +0 -577
  69. package/scripts/analyze-run.mjs +0 -282
  70. package/scripts/compare-runs.js +0 -44
  71. package/scripts/compare-suggestions.js +0 -80
  72. package/scripts/dig-into-run.js +0 -158
  73. package/scripts/show-failed-suggestions.js +0 -64
  74. /package/scripts/{check-run.mjs → check-run.js} +0 -0
@@ -21,10 +21,13 @@ import { generateLearnerResponse } from './learnerTutorInteractionEngine.js';
21
21
  import * as turnComparisonAnalyzer from './turnComparisonAnalyzer.js';
22
22
  import * as dialogueTraceAnalyzer from './dialogueTraceAnalyzer.js';
23
23
  import * as promptRewriter from './promptRewriter.js';
24
+ import { mockGenerateResult, mockJudgeResult } from './mockProvider.js';
25
+ import { formatEntry, formatTranscript, formatCompactLine } from './transcriptFormatter.js';
24
26
 
25
27
  const __dirname = path.dirname(fileURLToPath(import.meta.url));
26
28
  const EVAL_ROOT = path.resolve(__dirname, '..');
27
29
  const LOGS_DIR = path.join(EVAL_ROOT, 'logs', 'tutor-dialogues');
30
+ const TRANSCRIPTS_DIR = path.join(EVAL_ROOT, 'logs', 'transcripts');
28
31
 
29
32
  // Redirect tutor-core logs to this repo's logs/ directory (if available)
30
33
  import('@machinespirits/tutor-core').then(mod => {
@@ -68,6 +71,33 @@ const EVAL_ONLY_PROFILES = [
68
71
  'cell_17_placebo_multi_unified', 'cell_18_placebo_multi_psycho',
69
72
  'cell_19_memory_single_unified', 'cell_20_recog_nomem_single_unified',
70
73
  'cell_21_recog_multi_unified_rewrite',
74
+ 'cell_22_base_suspicious_unified', 'cell_23_recog_suspicious_unified',
75
+ 'cell_24_base_adversary_unified', 'cell_25_recog_adversary_unified',
76
+ 'cell_26_base_advocate_unified', 'cell_27_recog_advocate_unified',
77
+ 'cell_28_base_dialectical_suspicious_unified', 'cell_29_recog_dialectical_suspicious_unified',
78
+ 'cell_30_base_dialectical_adversary_unified', 'cell_31_recog_dialectical_adversary_unified',
79
+ 'cell_32_base_dialectical_advocate_unified', 'cell_33_recog_dialectical_advocate_unified',
80
+ 'cell_34_base_dialectical_suspicious_unified_full', 'cell_35_recog_dialectical_suspicious_unified_full',
81
+ 'cell_36_base_dialectical_adversary_unified_full', 'cell_37_recog_dialectical_adversary_unified_full',
82
+ 'cell_38_base_dialectical_advocate_unified_full', 'cell_39_recog_dialectical_advocate_unified_full',
83
+ 'cell_40_base_dialectical_suspicious_unified_superego', 'cell_41_recog_dialectical_suspicious_unified_superego',
84
+ 'cell_42_base_dialectical_adversary_unified_superego', 'cell_43_recog_dialectical_adversary_unified_superego',
85
+ 'cell_44_base_dialectical_advocate_unified_superego', 'cell_45_recog_dialectical_advocate_unified_superego',
86
+ 'cell_46_base_dialectical_suspicious_unified_quantitative', 'cell_47_recog_dialectical_suspicious_unified_quantitative',
87
+ 'cell_48_base_dialectical_suspicious_unified_erosion', 'cell_49_recog_dialectical_suspicious_unified_erosion',
88
+ 'cell_50_base_dialectical_suspicious_unified_intersubjective', 'cell_51_recog_dialectical_suspicious_unified_intersubjective',
89
+ 'cell_52_base_dialectical_suspicious_unified_combined', 'cell_53_recog_dialectical_suspicious_unified_combined',
90
+ 'cell_54_base_dialectical_profile_tutor', 'cell_55_recog_dialectical_profile_tutor',
91
+ 'cell_56_base_dialectical_profile_bidirectional', 'cell_57_recog_dialectical_profile_bidirectional',
92
+ 'cell_58_recog_dialectical_profile_bidirectional_full', 'cell_59_recog_dialectical_profile_bidirectional_strategy',
93
+ 'cell_60_base_dialectical_selfreflect_psycho', 'cell_61_recog_dialectical_selfreflect_psycho',
94
+ 'cell_62_base_dialectical_profile_bidirectional_psycho', 'cell_63_recog_dialectical_profile_bidirectional_psycho',
95
+ 'cell_64_recog_dialectical_intersubjective_psycho', 'cell_65_recog_dialectical_combined_psycho',
96
+ 'cell_66_recog_dialectical_profile_prosthesis_descriptive',
97
+ 'cell_67_recog_dialectical_profile_prosthesis_prescriptive',
98
+ 'cell_68_recog_dialectical_profile_prosthesis_adversary',
99
+ 'cell_69_base_dialectical_intersubjective_psycho', 'cell_70_base_dialectical_combined_psycho',
100
+ 'cell_71_naive_single_unified',
71
101
  ];
72
102
 
73
103
  /**
@@ -95,10 +125,24 @@ export function resolveEvalProfile(profileName) {
95
125
  resolvedProfileName = 'placebo';
96
126
  } else if (promptType === 'hardwired') {
97
127
  resolvedProfileName = 'hardwired';
128
+ } else if (promptType === 'naive') {
129
+ resolvedProfileName = 'naive';
98
130
  } else if (promptType === 'memory') {
99
131
  resolvedProfileName = 'memory';
100
132
  } else if (promptType === 'recognition_nomem') {
101
133
  resolvedProfileName = 'recognition_nomem';
134
+ } else if (promptType === 'divergent_suspicious') {
135
+ resolvedProfileName = recognitionMode ? 'suspicious_recognition' : 'suspicious';
136
+ } else if (promptType === 'divergent_adversary') {
137
+ resolvedProfileName = recognitionMode ? 'adversary_recognition' : 'adversary';
138
+ } else if (promptType === 'divergent_advocate') {
139
+ resolvedProfileName = recognitionMode ? 'advocate_recognition' : 'advocate';
140
+ } else if (promptType === 'dialectical_suspicious') {
141
+ resolvedProfileName = recognitionMode ? 'dialectical_suspicious_recognition' : 'dialectical_suspicious';
142
+ } else if (promptType === 'dialectical_adversary') {
143
+ resolvedProfileName = recognitionMode ? 'dialectical_adversary_recognition' : 'dialectical_adversary';
144
+ } else if (promptType === 'dialectical_advocate') {
145
+ resolvedProfileName = recognitionMode ? 'dialectical_advocate_recognition' : 'dialectical_advocate';
102
146
  } else if (recognitionMode) {
103
147
  resolvedProfileName = 'recognition';
104
148
  } else {
@@ -158,13 +202,23 @@ function resolveConfigModels(config) {
158
202
  // Extract factorial factor tags and learner architecture from profile
159
203
  const rawProfile = evalConfigLoader.loadTutorAgents()?.profiles?.[resolved.profileName];
160
204
  if (rawProfile?.factors) {
161
- resolved.factors = rawProfile.factors;
205
+ resolved.factors = { ...rawProfile.factors };
206
+ // Normalize prompt_type → recognition boolean for DB storage
207
+ if (resolved.factors.prompt_type && resolved.factors.recognition == null) {
208
+ resolved.factors.recognition = resolved.factors.prompt_type === 'recognition';
209
+ }
162
210
  }
163
211
  if (rawProfile?.learner_architecture) {
164
212
  resolved.learnerArchitecture = rawProfile.learner_architecture;
165
213
  }
166
214
  }
167
215
 
216
+ // Apply CLI --max-tokens override (overrides ego max_tokens hyperparameter)
217
+ if (config.maxTokensOverride) {
218
+ if (!resolved.hyperparameters) resolved.hyperparameters = {};
219
+ resolved.hyperparameters = { ...resolved.hyperparameters, max_tokens: config.maxTokensOverride };
220
+ }
221
+
168
222
  // Apply CLI --model override (replaces ego and superego models, preserves factorial metadata)
169
223
  if (config.modelOverride) {
170
224
  try {
@@ -433,6 +487,8 @@ function buildMultiTurnContext(options) {
433
487
  conversationHistory = [],
434
488
  currentTurn,
435
489
  previousSuggestion,
490
+ priorSuperegoAssessments = [],
491
+ learnerTrajectory = null,
436
492
  } = options;
437
493
 
438
494
  const contextParts = [];
@@ -449,6 +505,26 @@ function buildMultiTurnContext(options) {
449
505
  }
450
506
  }
451
507
 
508
+ // Cross-turn superego memory: accumulated feedback from prior turns' internal
509
+ // deliberation. Visible to both ego (full context) and superego (via
510
+ // extractStructuredSummary fallback). Enables the superego to detect whether
511
+ // its prior feedback was incorporated and escalate if needed.
512
+ if (priorSuperegoAssessments.length > 0) {
513
+ contextParts.push('\n### Prior Superego Assessment');
514
+ for (const assessment of priorSuperegoAssessments) {
515
+ contextParts.push(formatSuperegoAssessment(assessment));
516
+ }
517
+ }
518
+
519
+ // Structured learner trajectory: pre-processed resistance/engagement signals
520
+ // derived from conversation history and score trajectory. Enables the superego
521
+ // to distinguish "learner asked a new question" from "learner is repeating the
522
+ // same confusion because our approach isn't working."
523
+ if (learnerTrajectory) {
524
+ contextParts.push('\n### Learner Trajectory Assessment');
525
+ contextParts.push(formatLearnerTrajectory(learnerTrajectory));
526
+ }
527
+
452
528
  // Note: "Previous Tutor Suggestion" block removed — it duplicated the last
453
529
  // entry already present in conversation history above.
454
530
 
@@ -464,6 +540,210 @@ function buildMultiTurnContext(options) {
464
540
  return contextParts.join('\n');
465
541
  }
466
542
 
/**
 * Extract superego feedback from a single turn's dialogue trace entries.
 * Returns a structured assessment object for cross-turn memory.
 *
 * Only entries whose `agent` is exactly 'superego' are considered. Counts are
 * taken over all of those entries; `finalApproved`, `confidence` and the
 * feedback text come from the last one.
 *
 * @param {number} turnIndex - Index of the turn, copied verbatim into the result.
 * @param {Array<object>} traceEntries - Dialogue trace entries for the turn.
 *   A missing / non-array value is treated as "no entries".
 * @returns {{turnIndex: number, rejections: number, approvals: number,
 *   interventionTypes: Array, finalApproved: *, confidence: *, feedback: string}|null}
 *   The assessment, or null when the trace contains no superego entries.
 */
function extractTurnSuperegoAssessment(turnIndex, traceEntries) {
  const entries = Array.isArray(traceEntries) ? traceEntries : [];
  const superegoEntries = entries.filter((entry) => entry?.agent === 'superego');
  if (superegoEntries.length === 0) return null;

  const lastEntry = superegoEntries[superegoEntries.length - 1];
  // Strict comparisons: entries without an explicit boolean `approved` count as neither.
  const totalRejections = superegoEntries.filter((entry) => entry.approved === false).length;
  const totalApprovals = superegoEntries.filter((entry) => entry.approved === true).length;
  const interventionTypes = superegoEntries
    .map((entry) => entry.interventionType)
    .filter(Boolean);

  // Prefer the structured `feedback` field; non-string values are ignored so the
  // truncation below cannot throw.
  let feedbackText = typeof lastEntry.feedback === 'string' ? lastEntry.feedback : '';

  // Fallback: pull a "feedback": "..." pair out of the textual `detail`.
  // The pattern accepts backslash escapes, so feedback containing \" is not cut
  // off at the first embedded quote.
  if (!feedbackText && typeof lastEntry.detail === 'string') {
    const match = lastEntry.detail.match(/"feedback"\s*:\s*"((?:[^"\\]|\\.)+)"/);
    if (match) {
      try {
        // Decode JSON string escapes (\" \\ \n \uXXXX ...) into plain text.
        feedbackText = JSON.parse(`"${match[1]}"`);
      } catch {
        // Not a valid JSON string body (e.g. a raw newline) — keep the raw capture.
        feedbackText = match[1];
      }
    }
  }

  return {
    turnIndex,
    rejections: totalRejections,
    approvals: totalApprovals,
    interventionTypes,
    finalApproved: lastEntry.approved,
    confidence: lastEntry.confidence,
    // Cap at 300 characters to bound the size of the injected context.
    feedback: feedbackText.substring(0, 300),
  };
}
575
+
/**
 * Render one superego assessment as a short markdown fragment for context injection.
 *
 * The fragment starts with a blank line and a bold "Turn N" heading (N is the
 * 1-based turn number), followed by the outcome, the distinct intervention
 * types (if any) and the quoted feedback (if any).
 *
 * @param {object} assessment - Object carrying turnIndex, finalApproved,
 *   rejections, interventionTypes and feedback.
 * @returns {string} Newline-joined markdown lines.
 */
function formatSuperegoAssessment(assessment) {
  const { turnIndex, finalApproved, rejections, interventionTypes, feedback } = assessment;

  const outcome = finalApproved ? 'approved' : 'rejected';
  const parts = [
    `\n**Turn ${turnIndex + 1} internal critique:**`,
    `- Outcome: ${outcome} after ${rejections} rejection(s)`,
  ];

  if (interventionTypes.length > 0) {
    // De-duplicate while keeping first-seen order.
    const distinctTypes = Array.from(new Set(interventionTypes));
    parts.push(`- Interventions: ${distinctTypes.join(', ')}`);
  }

  if (feedback) {
    parts.push(`- Key concern: "${feedback}"`);
  }

  return parts.join('\n');
}
591
+
/**
 * Analyze learner trajectory across turns to produce structured resistance signals.
 *
 * Signals are heuristic and derived only from the inputs:
 *  - score trajectory from each turn result's `turnScore` (null/undefined skipped),
 *  - learner message lengths as an engagement proxy,
 *  - regex matches for confusion / pushback phrasing,
 *  - word overlap between consecutive learner questions (messages containing '?').
 *
 * @param {Array<object>} turnResults - Per-turn results; `turnScore` is read from each.
 * @param {Array<object>} [conversationHistory=[]] - History entries; only non-empty
 *   string `learnerMessage` values are used.
 * @returns {object|null} Trajectory object, or null when fewer than two turn
 *   results are available (including a missing / non-array `turnResults`).
 *   `questionCount` records how many learner questions fed `questionDiversity`,
 *   so consumers can tell "no data" (count < 2, diversity left at 0) apart from
 *   a genuinely measured low diversity.
 */
function analyzeLearnerTrajectory(turnResults, conversationHistory = []) {
  if (!Array.isArray(turnResults) || turnResults.length < 2) return null;
  const history = Array.isArray(conversationHistory) ? conversationHistory : [];

  const trajectory = {
    turnCount: turnResults.length,
    engagementDirection: 'stable',
    resistanceType: null,
    resistanceStrength: 0, // 0-3 scale
    priorApproachEffective: null,
    scoreTrajectory: [],
    messageLengthTrajectory: [],
    repeatedConfusion: false,
    questionDiversity: 0,
    questionCount: 0,
  };

  // Score trajectory
  trajectory.scoreTrajectory = turnResults
    .filter((turn) => turn?.turnScore != null)
    .map((turn) => turn.turnScore);

  // Collect learner messages once; lengths use the original text, pattern
  // matching uses the lower-cased text.
  const rawMessages = history
    .map((entry) => entry?.learnerMessage)
    .filter((message) => typeof message === 'string' && message !== '');
  const messageLengths = rawMessages.map((message) => message.length);
  const learnerMessages = rawMessages.map((message) => message.toLowerCase());

  // Message length trajectory (proxy for engagement)
  trajectory.messageLengthTrajectory = messageLengths;

  // Engagement direction: compare the average of the first two messages with
  // the average of the last two (the windows overlap when there are only three).
  if (messageLengths.length >= 3) {
    const earlyAvg = messageLengths.slice(0, 2).reduce((a, b) => a + b, 0) / 2;
    const lateAvg = messageLengths.slice(-2).reduce((a, b) => a + b, 0) / 2;
    if (lateAvg < earlyAvg * 0.6) trajectory.engagementDirection = 'declining';
    else if (lateAvg > earlyAvg * 1.4) trajectory.engagementDirection = 'increasing';
  }

  // Score direction: the prior approach counts as effective when the latest
  // score did not drop relative to the one before it.
  if (trajectory.scoreTrajectory.length >= 2) {
    const last = trajectory.scoreTrajectory[trajectory.scoreTrajectory.length - 1];
    const prev = trajectory.scoreTrajectory[trajectory.scoreTrajectory.length - 2];
    trajectory.priorApproachEffective = last >= prev;
  }

  // Repeated confusion detection: confusion markers in BOTH of the last two messages
  if (learnerMessages.length >= 2) {
    const confusionPatterns = [
      /i('m| am) (still )?(confused|lost|not sure|unsure)/i,
      /i don'?t (understand|get|see)/i,
      /what do you mean/i,
      /can you explain/i,
      /i('m| am) not following/i,
    ];

    const confusionCounts = learnerMessages.map(
      (msg) => confusionPatterns.filter((p) => p.test(msg)).length
    );
    const lastTwoConfusion = confusionCounts.slice(-2);
    if (lastTwoConfusion.length >= 2 && lastTwoConfusion.every((c) => c > 0)) {
      trajectory.repeatedConfusion = true;
      trajectory.resistanceType = 'repeated_confusion';
      trajectory.resistanceStrength = 2;
    }
  }

  // Pushback detection (last learner message only)
  const lastMessage = learnerMessages[learnerMessages.length - 1] || '';
  const pushbackPatterns = [
    /\bbut\s+(what about|doesn'?t|isn'?t|that doesn'?t)\b/i,
    /\bi disagree\b/i,
    /\bi don'?t think\b/i,
    /\bthat'?s not (right|correct|what i)\b/i,
    /\byou('re| are) (wrong|missing|not)\b/i,
  ];
  if (pushbackPatterns.some((p) => p.test(lastMessage))) {
    // An earlier, more specific resistance type is kept.
    trajectory.resistanceType = trajectory.resistanceType || 'pushback';
    trajectory.resistanceStrength = Math.max(trajectory.resistanceStrength, 2);
  }

  // Disengagement detection: very short last message with no question
  if (messageLengths.length >= 2) {
    const lastLen = messageLengths[messageLengths.length - 1];
    if (lastLen < 30 && !lastMessage.includes('?')) {
      trajectory.resistanceType = trajectory.resistanceType || 'disengagement';
      trajectory.resistanceStrength = Math.max(trajectory.resistanceStrength, 1);
      trajectory.engagementDirection = 'declining';
    }
  }

  // Question diversity: 1 - mean word overlap between consecutive questions.
  // Only computed with at least two questions; otherwise questionDiversity
  // stays 0 and questionCount (< 2) marks it as "not measured".
  const questions = learnerMessages.filter((m) => m.includes('?'));
  trajectory.questionCount = questions.length;
  if (questions.length >= 2) {
    // Words longer than 3 characters, split on whitespace (punctuation is kept)
    const questionWordSets = questions.map(
      (q) => new Set(q.split(/\s+/).filter((w) => w.length > 3))
    );
    let totalOverlap = 0;
    for (let i = 1; i < questionWordSets.length; i++) {
      const prev = questionWordSets[i - 1];
      const curr = questionWordSets[i];
      const overlap = [...curr].filter((w) => prev.has(w)).length / Math.max(curr.size, 1);
      totalOverlap += overlap;
    }
    trajectory.questionDiversity = 1 - (totalOverlap / Math.max(questionWordSets.length - 1, 1));
  }

  // Cumulative resistance: if score declining AND engagement declining, high resistance
  if (trajectory.engagementDirection === 'declining' && trajectory.priorApproachEffective === false) {
    trajectory.resistanceStrength = 3;
    trajectory.resistanceType = trajectory.resistanceType || 'cumulative_decline';
  }

  return trajectory;
}

/**
 * Format learner trajectory assessment for context injection.
 *
 * Produces a newline-joined bullet list: engagement direction, score
 * trajectory (when at least two scores exist), detected resistance, and
 * warnings for repeated confusion and low question diversity.
 *
 * The low-diversity warning is emitted only when the diversity was actually
 * measured, i.e. `questionCount` is at least 2. Trajectory objects that carry
 * no `questionCount` field are formatted as before (the check is skipped).
 *
 * @param {object} trajectory - Object shaped like analyzeLearnerTrajectory()'s result.
 * @returns {string} Markdown bullet lines joined with '\n'.
 */
function formatLearnerTrajectory(trajectory) {
  const lines = [];

  // Engagement direction
  const engagementLabel = trajectory.engagementDirection === 'declining' ? 'DECLINING' :
    trajectory.engagementDirection === 'increasing' ? 'INCREASING' : 'STABLE';
  lines.push(`- Engagement: ${engagementLabel} (over ${trajectory.turnCount} turns)`);

  // Score trajectory
  if (trajectory.scoreTrajectory.length >= 2) {
    const scores = trajectory.scoreTrajectory.map((s) => s.toFixed(0)).join(' → ');
    lines.push(`- Score trajectory: ${scores}`);
    lines.push(`- Prior approach effective: ${trajectory.priorApproachEffective ? 'YES' : 'NO'}`);
  }

  // Resistance
  if (trajectory.resistanceType) {
    const strengthLabel = ['none', 'mild', 'moderate', 'strong'][trajectory.resistanceStrength] || 'unknown';
    lines.push(`- Resistance detected: ${trajectory.resistanceType} (${strengthLabel})`);
  }

  // Specific signals
  if (trajectory.repeatedConfusion) {
    lines.push(`- WARNING: Learner expressed confusion in consecutive turns — prior explanation did not land`);
  }

  // A default questionDiversity of 0 means "not measured" when fewer than two
  // questions were seen; do not report that as the learner being stuck.
  const diversityMeasured = trajectory.questionCount == null || trajectory.questionCount >= 2;
  if (diversityMeasured && trajectory.questionDiversity < 0.3 && trajectory.turnCount >= 3) {
    lines.push(`- WARNING: Learner questions show low diversity — they may be stuck on the same concept`);
  }

  return lines.join('\n');
}
746
+
467
747
  /**
468
748
  * Format a previous turn for inclusion in context
469
749
  */
@@ -649,26 +929,69 @@ async function generateAndEvaluateTurn(context, resolvedConfig, turnMeta, option
649
929
  log = () => {},
650
930
  scenarioId = '',
651
931
  systemPromptExtension = null,
932
+ superegoPromptExtension = null, // Dynamic disposition adjustments for superego
652
933
  learnerId = null, // For Writing Pad memory persistence
934
+ dialecticalNegotiation = false, // Phase 2: AI-powered dialectical struggle
935
+ behavioralOverrides = null, // Quantitative params from superego self-reflection
936
+ dryRun = false,
653
937
  } = options;
654
938
 
939
+ // Dry-run mode: return canned results without any API calls
940
+ if (dryRun) {
941
+ log('[dry-run] Generating mock suggestions (no API call)', 'info');
942
+ const genResult = mockGenerateResult(resolvedConfig, turnMeta);
943
+ const suggestion = genResult.suggestions?.[0];
944
+ const validation = suggestion
945
+ ? rubricEvaluator.quickValidate(suggestion, {
946
+ requiredElements: turnMeta.requiredElements,
947
+ requiredElementsAny: turnMeta.requiredElementsAny,
948
+ forbiddenElements: turnMeta.forbiddenElements,
949
+ })
950
+ : { passesRequired: false, passesForbidden: true, requiredMissing: ['No suggestions generated'] };
951
+
952
+ let rubricResult = null;
953
+ let turnScore = null;
954
+ let scoringMethod = 'skipped';
955
+ if (!skipRubricEval && suggestion) {
956
+ log('[dry-run] Generating mock judge scores (no API call)', 'info');
957
+ rubricResult = mockJudgeResult(resolvedConfig, scenarioId + Date.now());
958
+ turnScore = rubricResult.overallScore;
959
+ scoringMethod = 'rubric';
960
+ }
961
+
962
+ return { genResult, suggestion, validation, rubricResult, turnScore, scoringMethod };
963
+ }
964
+
655
965
  // Generate suggestions via tutor API with retry logic
966
+ // Note: retryWithBackoff handles thrown errors, but tutorApi.generateSuggestions()
967
+ // catches its own errors and returns { success: false }. We need to also handle
968
+ // 429 rate limit errors returned in the result (not thrown).
656
969
  const genResult = await retryWithBackoff(
657
- () => tutorApi.generateSuggestions(context, {
658
- provider: resolvedConfig.provider,
659
- model: resolvedConfig.model,
660
- egoModel: resolvedConfig.egoModel,
661
- superegoModel: resolvedConfig.superegoModel || null,
662
- profileName: resolvedConfig.profileName,
663
- hyperparameters: resolvedConfig.hyperparameters || {},
664
- trace: true,
665
- superegoStrategy,
666
- outputSize,
667
- useDialogue,
668
- maxRounds,
669
- systemPromptExtension,
670
- learnerId, // Activates Writing Pad three-layer memory
671
- }),
970
+ async () => {
971
+ const result = await tutorApi.generateSuggestions(context, {
972
+ provider: resolvedConfig.provider,
973
+ model: resolvedConfig.model,
974
+ egoModel: resolvedConfig.egoModel,
975
+ superegoModel: resolvedConfig.superegoModel || null,
976
+ profileName: resolvedConfig.profileName,
977
+ hyperparameters: resolvedConfig.hyperparameters || {},
978
+ trace: true,
979
+ superegoStrategy,
980
+ outputSize,
981
+ useDialogue,
982
+ maxRounds,
983
+ systemPromptExtension,
984
+ superegoPromptExtension, // Dynamic disposition adjustments for superego
985
+ learnerId, // Activates Writing Pad three-layer memory
986
+ dialecticalNegotiation, // Phase 2: AI-powered dialectical struggle
987
+ behavioralOverrides, // Quantitative params from superego self-reflection
988
+ });
989
+ // Re-throw 429 errors so retryWithBackoff can handle them
990
+ if (!result.success && result.error && (result.error.includes('429') || result.error.toLowerCase().includes('rate limit'))) {
991
+ throw new Error(result.error);
992
+ }
993
+ return result;
994
+ },
672
995
  { log }
673
996
  );
674
997
 
@@ -776,9 +1099,13 @@ export async function runEvaluation(options = {}) {
776
1099
  description = null,
777
1100
  verbose = false,
778
1101
  scenarioFilter = null, // Cluster filter: 'single-turn', 'multi-turn', or category names
779
- modelOverride = null, // CLI --model override (e.g. "openrouter.nemotron")
780
- egoModelOverride = null, // CLI --ego-model override (replaces only ego model)
781
- superegoModelOverride = null, // CLI --superego-model override (replaces only superego model)
1102
+ modelOverride = null, // CLI --model override (e.g. "openrouter.nemotron") — ALL agents
1103
+ egoModelOverride = null, // CLI --ego-model override (replaces only tutor ego model)
1104
+ superegoModelOverride = null, // CLI --superego-model override (replaces only tutor superego model)
1105
+ learnerModelOverride = null, // CLI --learner-model override (replaces all learner agent models)
1106
+ dryRun = false, // Use mock data instead of API calls
1107
+ transcriptMode = false, // Write play-format transcript files during multi-turn runs
1108
+ maxTokensOverride = null, // CLI --max-tokens override (replaces ego max_tokens hyperparameter)
782
1109
  } = options;
783
1110
 
784
1111
  const log = verbose ? console.log : () => {};
@@ -856,6 +1183,7 @@ export async function runEvaluation(options = {}) {
856
1183
  const effectiveModelOverride = modelOverride || yamlOverrides.modelOverride;
857
1184
  const effectiveEgoModelOverride = egoModelOverride || yamlOverrides.egoModelOverride;
858
1185
  const effectiveSuperegoModelOverride = superegoModelOverride || yamlOverrides.superegoModelOverride;
1186
+ const effectiveLearnerModelOverride = learnerModelOverride || null;
859
1187
 
860
1188
  if (effectiveModelOverride) {
861
1189
  targetConfigs = targetConfigs.map(c => ({ ...c, modelOverride: effectiveModelOverride }));
@@ -866,6 +1194,12 @@ export async function runEvaluation(options = {}) {
866
1194
  if (effectiveSuperegoModelOverride) {
867
1195
  targetConfigs = targetConfigs.map(c => ({ ...c, superegoModelOverride: effectiveSuperegoModelOverride }));
868
1196
  }
1197
+ if (effectiveLearnerModelOverride) {
1198
+ targetConfigs = targetConfigs.map(c => ({ ...c, learnerModelOverride: effectiveLearnerModelOverride }));
1199
+ }
1200
+ if (maxTokensOverride) {
1201
+ targetConfigs = targetConfigs.map(c => ({ ...c, maxTokensOverride }));
1202
+ }
869
1203
 
870
1204
  if (targetConfigs.length === 0) {
871
1205
  throw new Error('No configurations to test');
@@ -888,6 +1222,8 @@ export async function runEvaluation(options = {}) {
888
1222
  modelOverride: effectiveModelOverride || null,
889
1223
  egoModelOverride: effectiveEgoModelOverride || null,
890
1224
  superegoModelOverride: effectiveSuperegoModelOverride || null,
1225
+ learnerModelOverride: effectiveLearnerModelOverride || null,
1226
+ maxTokensOverride: maxTokensOverride || null,
891
1227
  // Store scenario IDs and profile names for accurate resume
892
1228
  scenarioIds: targetScenarios.map(s => s.id),
893
1229
  profileNames: targetConfigs.map(c => c.profileName).filter(Boolean),
@@ -1002,6 +1338,9 @@ export async function runEvaluation(options = {}) {
1002
1338
  const result = await runSingleTest(scenario, config, {
1003
1339
  skipRubricEval,
1004
1340
  verbose,
1341
+ dryRun,
1342
+ transcriptMode,
1343
+ runId: run.id,
1005
1344
  });
1006
1345
 
1007
1346
  // Store result (better-sqlite3 is synchronous, thread-safe for concurrent writes)
@@ -1071,30 +1410,38 @@ export async function runEvaluation(options = {}) {
1071
1410
  completedTests++;
1072
1411
  log(` ${formatProgress(completedTests, totalTests, runStartTime)} ${profileLabel} / ${scenario.id}: ERROR - ${error.message}`);
1073
1412
 
1074
- // Store failed result so it shows up in the database instead of silently disappearing
1075
- // Extract provider/model from nested ego config if not at top level (profile-based configs)
1076
- const failedResult = {
1077
- scenarioId: scenario.id,
1078
- scenarioName: scenario.name || scenario.id,
1079
- profileName: config.profileName,
1080
- provider: config.provider || config.ego?.provider || 'unknown',
1081
- model: config.model || config.ego?.model || 'unknown',
1082
- egoModel: config.egoModel
1083
- ? `${config.egoModel.provider}.${config.egoModel.model}`
1084
- : config.ego ? `${config.ego.provider}.${config.ego.model}` : null,
1085
- superegoModel: config.superegoModel
1086
- ? `${config.superegoModel.provider}.${config.superegoModel.model}`
1087
- : config.superego ? `${config.superego.provider}.${config.superego.model}` : null,
1088
- factors: config.factors || null,
1089
- learnerArchitecture: config.learnerArchitecture || null,
1090
- success: false,
1091
- errorMessage: error.message,
1092
- };
1093
- try {
1094
- evaluationStore.storeResult(run.id, failedResult);
1095
- results.push(failedResult);
1096
- } catch (storeErr) {
1097
- log(` [WARNING] Failed to store error result: ${storeErr.message}`);
1413
+ // Only store failed results for permanent errors (bad config, invalid scenario).
1414
+ // Skip storing for retriable/transient errors (rate limits, model unavailable, timeouts)
1415
+ // so that `resume` can retry them without needing manual cleanup.
1416
+ const errMsg = error.message || '';
1417
+ const isTransient = /429|rate limit|too many requests|503|502|timeout|ECONNREFUSED|ECONNRESET|ETIMEDOUT|terminated|unavailable|failed to generate suggestions/i.test(errMsg);
1418
+
1419
+ if (!isTransient) {
1420
+ const failedResult = {
1421
+ scenarioId: scenario.id,
1422
+ scenarioName: scenario.name || scenario.id,
1423
+ profileName: config.profileName,
1424
+ provider: config.provider || config.ego?.provider || 'unknown',
1425
+ model: config.model || config.ego?.model || 'unknown',
1426
+ egoModel: config.egoModel
1427
+ ? `${config.egoModel.provider}.${config.egoModel.model}`
1428
+ : config.ego ? `${config.ego.provider}.${config.ego.model}` : null,
1429
+ superegoModel: config.superegoModel
1430
+ ? `${config.superegoModel.provider}.${config.superegoModel.model}`
1431
+ : config.superego ? `${config.superego.provider}.${config.superego.model}` : null,
1432
+ factors: config.factors || null,
1433
+ learnerArchitecture: config.learnerArchitecture || null,
1434
+ success: false,
1435
+ errorMessage: error.message,
1436
+ };
1437
+ try {
1438
+ evaluationStore.storeResult(run.id, failedResult);
1439
+ results.push(failedResult);
1440
+ } catch (storeErr) {
1441
+ log(` [WARNING] Failed to store error result: ${storeErr.message}`);
1442
+ }
1443
+ } else {
1444
+ log(` [SKIPPED] Transient error, not storing empty row (resumable): ${errMsg.substring(0, 100)}`);
1098
1445
  }
1099
1446
 
1100
1447
  // Emit test_error event
@@ -1183,7 +1530,7 @@ export async function runEvaluation(options = {}) {
1183
1530
  * Handles both single-turn and multi-turn scenarios
1184
1531
  */
1185
1532
  async function runSingleTest(scenario, config, options = {}) {
1186
- const { skipRubricEval = false, outputSize = 'normal', verbose = false, onLog, superegoStrategy = null, judgeOverride = null } = options;
1533
+ const { skipRubricEval = false, outputSize = 'normal', verbose = false, onLog, superegoStrategy = null, judgeOverride = null, dryRun = false } = options;
1187
1534
 
1188
1535
  // Create a log function that calls both console and onLog callback
1189
1536
  const log = (message, level = 'info') => {
@@ -1214,7 +1561,7 @@ async function runSingleTest(scenario, config, options = {}) {
1214
1561
  * Run a single-turn test
1215
1562
  */
1216
1563
  async function runSingleTurnTest(scenario, config, fullScenario, options = {}) {
1217
- const { skipRubricEval = false, outputSize = 'normal', verbose = false, log = () => {}, superegoStrategy = null, judgeOverride = null } = options;
1564
+ const { skipRubricEval = false, outputSize = 'normal', verbose = false, log = () => {}, superegoStrategy = null, judgeOverride = null, dryRun = false } = options;
1218
1565
 
1219
1566
  // Resolve model aliases through eval's providers.yaml
1220
1567
  const resolvedConfig = resolveConfigModels(config);
@@ -1260,7 +1607,7 @@ async function runSingleTurnTest(scenario, config, fullScenario, options = {}) {
1260
1607
  requiredElementsAny: fullScenario.required_elements_any,
1261
1608
  forbiddenElements: fullScenario.forbidden_elements,
1262
1609
  },
1263
- { skipRubricEval, outputSize, superegoStrategy, judgeOverride, useDialogue, maxRounds, log, scenarioId: scenario.id }
1610
+ { skipRubricEval, outputSize, superegoStrategy, judgeOverride, useDialogue, maxRounds, log, scenarioId: scenario.id, dryRun }
1264
1611
  );
1265
1612
 
1266
1613
  if (!genResult.success) {
@@ -1296,7 +1643,7 @@ async function runSingleTurnTest(scenario, config, fullScenario, options = {}) {
1296
1643
  superegoModel: resolvedConfig.superegoModel
1297
1644
  ? `${resolvedConfig.superegoModel.provider}.${resolvedConfig.superegoModel.model}`
1298
1645
  : null,
1299
- hyperparameters: config.hyperparameters,
1646
+ hyperparameters: resolvedConfig.hyperparameters || config.hyperparameters,
1300
1647
  suggestions: genResult.suggestions,
1301
1648
  success: true,
1302
1649
  latencyMs: genResult.metadata?.latencyMs,
@@ -1346,7 +1693,7 @@ async function runSingleTurnTest(scenario, config, fullScenario, options = {}) {
1346
1693
  * This eliminates the separate multiTurnRunner orchestration.
1347
1694
  */
1348
1695
  async function runMultiTurnTest(scenario, config, fullScenario, options = {}) {
1349
- const { skipRubricEval = false, outputSize = 'normal', verbose = false, log = () => {}, superegoStrategy = null, judgeOverride = null } = options;
1696
+ const { skipRubricEval = false, outputSize = 'normal', verbose = false, log = () => {}, superegoStrategy = null, judgeOverride = null, dryRun = false, transcriptMode = false, runId = null } = options;
1350
1697
 
1351
1698
  log(`[evaluationRunner] Running multi-turn scenario: ${scenario.id}`);
1352
1699
 
@@ -1371,6 +1718,41 @@ async function runMultiTurnTest(scenario, config, fullScenario, options = {}) {
1371
1718
  const learnerId = `eval-learner-${dialogueId}-${scenario.id.replace(/[^a-zA-Z0-9]/g, '')}`;
1372
1719
  log(`[evaluationRunner] Generated learnerId for Writing Pad: ${learnerId}`, 'info');
1373
1720
 
1721
+ // Set up transcript file for incremental writing (tail -f friendly)
1722
+ let transcriptPath = null;
1723
+ if (transcriptMode) {
1724
+ const effectiveRunId = runId || 'live';
1725
+ const transcriptDir = path.join(TRANSCRIPTS_DIR, effectiveRunId);
1726
+ if (!fs.existsSync(transcriptDir)) fs.mkdirSync(transcriptDir, { recursive: true });
1727
+ const safeName = `${config.profileName}--${scenario.id}`.replace(/[^a-zA-Z0-9_-]/g, '_');
1728
+ transcriptPath = path.join(transcriptDir, `${safeName}.txt`);
1729
+ // Write header
1730
+ const totalTurnCount = 1 + (fullScenario.turns || []).length;
1731
+ const header = `\n${(fullScenario.name || scenario.id).toUpperCase()} (${totalTurnCount}-turn)\n${config.profileName}\n${'─'.repeat(40)}\n\n`;
1732
+ fs.writeFileSync(transcriptPath, header);
1733
+ log(`[evaluationRunner] Transcript: ${transcriptPath}`, 'info');
1734
+ }
1735
+
1736
+ // Helper: append new trace entries to transcript file and optionally console
1737
+ let lastTranscriptIdx = 0;
1738
+ function flushTranscript() {
1739
+ if (!transcriptMode || !transcriptPath) return;
1740
+ const newEntries = consolidatedTrace.slice(lastTranscriptIdx);
1741
+ if (newEntries.length === 0) return;
1742
+ lastTranscriptIdx = consolidatedTrace.length;
1743
+ const lines = [];
1744
+ for (const entry of newEntries) {
1745
+ const formatted = formatEntry(entry, { detail: 'play' });
1746
+ if (formatted) lines.push(formatted + '\n');
1747
+ // Also print compact line to console in transcript mode
1748
+ const compactLine = formatCompactLine(entry);
1749
+ if (compactLine) console.log(compactLine);
1750
+ }
1751
+ if (lines.length > 0) {
1752
+ fs.appendFileSync(transcriptPath, lines.join('\n'));
1753
+ }
1754
+ }
1755
+
1374
1756
  // Deep-clone turns to prevent mutation of shared scenario objects across profiles
1375
1757
  const turns = JSON.parse(JSON.stringify(fullScenario.turns || []));
1376
1758
  const turnResults = [];
@@ -1384,14 +1766,58 @@ async function runMultiTurnTest(scenario, config, fullScenario, options = {}) {
1384
1766
  let conversationHistory = [];
1385
1767
  let previousSuggestion = null;
1386
1768
  const consolidatedTrace = [];
1769
+ const priorSuperegoAssessments = []; // Cross-turn superego memory
1387
1770
 
1388
- const sharedTurnOptions = { skipRubricEval, outputSize, superegoStrategy, judgeOverride, useDialogue, maxRounds, log, scenarioId: scenario.id, learnerId };
1389
-
1390
- // Check if prompt rewriting is enabled for this profile
1771
+ // Check profile-level feature flags
1391
1772
  const rawProfile = evalConfigLoader.loadTutorAgents()?.profiles?.[config.profileName];
1773
+
1774
+ // Apply CLI model override to rawProfile so prompt rewriter calls use the correct model.
1775
+ // Without this, --model/--ego-model only affects tutor-core's generateSuggestions,
1776
+ // while promptRewriter functions (self-reflection, profiling, etc.) still use the YAML model.
1777
+ if (config.modelOverride || config.egoModelOverride) {
1778
+ const overrideModel = config.egoModelOverride || config.modelOverride;
1779
+ try {
1780
+ const r = evalConfigLoader.resolveModel(overrideModel);
1781
+ if (rawProfile?.ego) {
1782
+ rawProfile.ego = { ...rawProfile.ego, provider: r.provider, model: r.model };
1783
+ }
1784
+ // Also update top-level model for functions that read config.model
1785
+ if (rawProfile) rawProfile.model = r.model;
1786
+ } catch { /* leave rawProfile as-is if resolution fails */ }
1787
+ }
1788
+ if (config.modelOverride || config.superegoModelOverride) {
1789
+ const overrideModel = config.superegoModelOverride || config.modelOverride;
1790
+ try {
1791
+ const r = evalConfigLoader.resolveModel(overrideModel);
1792
+ if (rawProfile?.superego) {
1793
+ rawProfile.superego = { ...rawProfile.superego, provider: r.provider, model: r.model };
1794
+ }
1795
+ } catch { /* leave rawProfile as-is if resolution fails */ }
1796
+ }
1797
+
1798
+ const dialecticalNegotiation = rawProfile?.dialectical_negotiation ?? false;
1392
1799
  const promptRewritingEnabled = rawProfile?.prompt_rewriting?.enabled ?? false;
1393
1800
  const promptRewritingStrategy = rawProfile?.prompt_rewriting?.strategy ?? 'template';
1801
+ const superegoDispositionRewriting = rawProfile?.superego_disposition_rewriting ?? false;
1802
+ const quantitativeDispositionEnabled = rawProfile?.prompt_rewriting?.quantitative_disposition ?? false;
1803
+ const promptErosionEnabled = rawProfile?.prompt_rewriting?.prompt_erosion?.enabled ?? false;
1804
+ const intersubjectiveEnabled = rawProfile?.prompt_rewriting?.intersubjective ?? false;
1805
+ const otherEgoProfilingEnabled = rawProfile?.other_ego_profiling?.enabled ?? false;
1806
+ const otherEgoBidirectional = rawProfile?.other_ego_profiling?.bidirectional ?? false;
1807
+ const strategyPlanningEnabled = rawProfile?.other_ego_profiling?.strategy_planning ?? false;
1808
+
1809
+ const sharedTurnOptions = { skipRubricEval, outputSize, superegoStrategy, judgeOverride, useDialogue, maxRounds, log, scenarioId: scenario.id, learnerId, dialecticalNegotiation, dryRun };
1394
1810
  let sessionEvolution = null;
1811
+ let superegoEvolution = null;
1812
+ let behavioralOverrides = null; // Parsed quantitative params from superego self-reflection
1813
+ let tutorProfileOfLearner = null; // Other-ego: tutor's mental model of learner
1814
+ let learnerProfileOfTutor = null; // Other-ego: learner's mental model of tutor
1815
+ let strategyPlan = null; // Other-ego: ego's explicit strategy plan
1816
+
1817
+ // Per-dialogue rejection budget: limits total superego rejections across all turns
1818
+ // to prevent worst-case cascade (e.g., 3 rejections × 5 turns = 15 total)
1819
+ let rejectionBudget = rawProfile?.dialogue?.rejection_budget ?? null; // null = unlimited (backwards-compatible)
1820
+ let totalRejections = 0;
1395
1821
 
1396
1822
  // 4. Loop through turns (initial turn 0 + follow-up turns)
1397
1823
  const totalTurnCount = 1 + turns.length;
@@ -1401,6 +1827,19 @@ async function runMultiTurnTest(scenario, config, fullScenario, options = {}) {
1401
1827
 
1402
1828
  log(`[evaluationRunner] Turn ${turnIdx}/${totalTurnCount - 1}${isInitialTurn ? ' (initial)' : ` (${turnDef.id})`}`, 'info');
1403
1829
 
1830
+ // Update run metadata with current turn progress for `runs` command
1831
+ if (runId) {
1832
+ evaluationStore.updateRun(runId, {
1833
+ metadata: {
1834
+ turnProgress: {
1835
+ current: turnIdx + 1,
1836
+ total: totalTurnCount,
1837
+ scenarioId: scenario.id,
1838
+ }
1839
+ }
1840
+ });
1841
+ }
1842
+
1404
1843
  // Show learner action in transcript mode (for follow-up turns)
1405
1844
  if (!isInitialTurn && dialogueEngine.isTranscriptMode()) {
1406
1845
  dialogueEngine.transcript('LEARNER ACTION', formatLearnerActionForTranscript(turnDef));
@@ -1420,11 +1859,16 @@ async function runMultiTurnTest(scenario, config, fullScenario, options = {}) {
1420
1859
  learnerMessage: turnDef.action_details?.message,
1421
1860
  });
1422
1861
 
1862
+ // Build learner trajectory assessment from accumulated turn data
1863
+ const learnerTrajectory = analyzeLearnerTrajectory(turnResults, conversationHistory);
1864
+
1423
1865
  contextStr = buildMultiTurnContext({
1424
1866
  originalContext: fullScenario.learner_context,
1425
1867
  conversationHistory,
1426
1868
  currentTurn: turnDef,
1427
1869
  previousSuggestion,
1870
+ priorSuperegoAssessments,
1871
+ learnerTrajectory,
1428
1872
  });
1429
1873
  }
1430
1874
 
@@ -1455,11 +1899,46 @@ async function runMultiTurnTest(scenario, config, fullScenario, options = {}) {
1455
1899
  : (turnDef.forbidden_elements || []),
1456
1900
  };
1457
1901
 
1902
+ // Build the ego prompt extension: erosion frame + session evolution (reflections)
1903
+ let fullEgoExtension = sessionEvolution;
1904
+ if (promptErosionEnabled && turnIdx > 0) {
1905
+ const erosionFrame = promptRewriter.buildPromptErosionFrame(turnIdx, rawProfile);
1906
+ if (erosionFrame) {
1907
+ // Erosion frame goes BEFORE reflections, so the model sees authority calibration first
1908
+ fullEgoExtension = erosionFrame + (sessionEvolution ? '\n\n' + sessionEvolution : '');
1909
+ log(`[evaluationRunner] Prompt erosion frame applied for turn ${turnIdx} (rate=${rawProfile.prompt_rewriting?.prompt_erosion?.rate ?? 0.2})`, 'info');
1910
+ }
1911
+ }
1912
+
1913
+ // Append other-ego profile and strategy plan to ego extension
1914
+ // Injection order: erosion frame → self-reflection → other-ego profile → strategy plan
1915
+ if (otherEgoProfilingEnabled && tutorProfileOfLearner) {
1916
+ const profileBlock = promptRewriter.formatProfileForInjection(tutorProfileOfLearner, 'learner');
1917
+ fullEgoExtension = (fullEgoExtension ? fullEgoExtension + '\n\n' : '') + profileBlock;
1918
+ }
1919
+ if (strategyPlanningEnabled && strategyPlan) {
1920
+ fullEgoExtension = (fullEgoExtension ? fullEgoExtension + '\n\n' : '') + strategyPlan;
1921
+ }
1922
+
1923
+ // Build the superego prompt extension: erosion frame + superego evolution (reflections)
1924
+ let fullSuperegoExtension = superegoEvolution;
1925
+ if (promptErosionEnabled && turnIdx > 0 && superegoEvolution) {
1926
+ const erosionFrame = promptRewriter.buildPromptErosionFrame(turnIdx, rawProfile);
1927
+ if (erosionFrame) {
1928
+ fullSuperegoExtension = erosionFrame + '\n\n' + superegoEvolution;
1929
+ }
1930
+ }
1931
+
1458
1932
  // Call the SAME generation+evaluation code path as single-turn
1459
1933
  // Pass dialogue context so the judge can see the full exchange
1934
+ // When rejection budget is exhausted, also skip outer superego review loop (maxRounds: 0)
1935
+ const budgetExhausted = rejectionBudget !== null && totalRejections >= rejectionBudget;
1460
1936
  const turnOptions = {
1461
1937
  ...sharedTurnOptions,
1462
- ...(sessionEvolution ? { systemPromptExtension: sessionEvolution } : {}),
1938
+ ...(fullEgoExtension ? { systemPromptExtension: fullEgoExtension } : {}),
1939
+ ...(fullSuperegoExtension ? { superegoPromptExtension: fullSuperegoExtension } : {}),
1940
+ ...(behavioralOverrides ? { behavioralOverrides } : {}),
1941
+ ...(budgetExhausted ? { maxRounds: 0 } : {}),
1463
1942
  conversationHistory: conversationHistory.length > 0 ? conversationHistory : null,
1464
1943
  consolidatedTrace: consolidatedTrace.length > 0 ? consolidatedTrace : null,
1465
1944
  };
@@ -1468,7 +1947,7 @@ async function runMultiTurnTest(scenario, config, fullScenario, options = {}) {
1468
1947
 
1469
1948
  if (!genResult.success) {
1470
1949
  const turnId = isInitialTurn ? 'initial' : turnDef.id;
1471
- throw new Error(`Multi-turn scenario ${scenario.id}: Turn ${turnIdx} (${turnId}) failed to generate suggestions`);
1950
+ throw new Error(`Multi-turn scenario ${scenario.id}: Turn ${turnIdx} (${turnId}) failed to generate suggestions: ${genResult.error || 'unknown error'}`);
1472
1951
  }
1473
1952
 
1474
1953
  // Accumulate dialogue traces
@@ -1506,6 +1985,39 @@ async function runMultiTurnTest(scenario, config, fullScenario, options = {}) {
1506
1985
  }
1507
1986
  }
1508
1987
 
1988
+ // Flush transcript: ego/superego exchange for this turn
1989
+ flushTranscript();
1990
+
1991
+ // Accumulate cross-turn superego memory from this turn's trace
1992
+ if (genResult.dialogueTrace && genResult.dialogueTrace.length > 0) {
1993
+ const assessment = extractTurnSuperegoAssessment(turnIdx, genResult.dialogueTrace);
1994
+ if (assessment) {
1995
+ priorSuperegoAssessments.push(assessment);
1996
+ }
1997
+ }
1998
+
1999
+ // Track rejection budget across turns: count superego rejections in this turn's trace
2000
+ if (rejectionBudget !== null && genResult.dialogueTrace) {
2001
+ const turnRejections = genResult.dialogueTrace.filter(
2002
+ entry => entry.agent === 'superego' && entry.action === 'review' && entry.approved === false
2003
+ ).length;
2004
+ totalRejections += turnRejections;
2005
+
2006
+ if (totalRejections >= rejectionBudget) {
2007
+ // Budget exhausted: force approve-only mode for remaining turns
2008
+ behavioralOverrides = { ...(behavioralOverrides || {}), max_rejections: 0 };
2009
+ log(`[evaluationRunner] Rejection budget exhausted (${totalRejections}/${rejectionBudget}): forcing approve-only for remaining turns`, 'info');
2010
+ consolidatedTrace.push({
2011
+ agent: 'rejection_budget',
2012
+ action: 'exhausted',
2013
+ turnIndex: turnIdx,
2014
+ contextSummary: `Budget exhausted: ${totalRejections}/${rejectionBudget} rejections used`,
2015
+ detail: `Total rejections across ${turnIdx + 1} turns: ${totalRejections}. Remaining turns will auto-approve.`,
2016
+ timestamp: new Date().toISOString(),
2017
+ });
2018
+ }
2019
+ }
2020
+
1509
2021
  // Collect per-turn result
1510
2022
  turnResults.push({
1511
2023
  turnIndex: turnIdx,
@@ -1546,41 +2058,368 @@ async function runMultiTurnTest(scenario, config, fullScenario, options = {}) {
1546
2058
  // Update for next iteration
1547
2059
  previousSuggestion = suggestion;
1548
2060
 
1549
- // Synthesize prompt rewriting directives for next turn (if enabled)
1550
- if (promptRewritingEnabled && turnIdx < totalTurnCount - 1) {
1551
- if (promptRewritingStrategy === 'llm') {
1552
- // LLM-based directive synthesis using superego model
1553
- try {
1554
- sessionEvolution = await promptRewriter.synthesizeDirectivesLLM({
2061
+ // ── Between-turn processing ──────────────────────────────────────────
2062
+ // Parallelized into groups by dependency:
2063
+ // Group 1 (independent): ego self-refl, superego self-refl, tutor profile, learner profile
2064
+ // Group 2 (depends on group 1): intersubjective, quantitative parse, strategy plan
2065
+ // Group 3 (depends on group 2): learner generation
2066
+ // This collapses ~6-8 sequential LLM calls into ~3 parallel rounds.
2067
+
2068
+ if (turnIdx < totalTurnCount - 1) {
2069
+ const betweenTurnStart = Date.now();
2070
+
2071
+ // ── Group 1: Independent LLM calls in parallel ──────────────────
2072
+ const group1Promises = [];
2073
+ const group1Labels = [];
2074
+
2075
+ // Ego self-reflection / prompt rewriting
2076
+ if (promptRewritingEnabled) {
2077
+ if (promptRewritingStrategy === 'self_reflection') {
2078
+ group1Promises.push(
2079
+ promptRewriter.synthesizeEgoSelfReflection({
2080
+ turnResults,
2081
+ consolidatedTrace,
2082
+ conversationHistory,
2083
+ config: rawProfile,
2084
+ }).catch(error => {
2085
+ log(`[evaluationRunner] Ego self-reflection failed, will fall back to template: ${error.message}`, 'warn');
2086
+ return null;
2087
+ })
2088
+ );
2089
+ group1Labels.push('ego_self_reflection');
2090
+ } else if (promptRewritingStrategy === 'llm') {
2091
+ group1Promises.push(
2092
+ promptRewriter.synthesizeDirectivesLLM({
2093
+ turnResults,
2094
+ consolidatedTrace,
2095
+ conversationHistory,
2096
+ config: rawProfile,
2097
+ }).catch(error => {
2098
+ log(`[evaluationRunner] LLM rewriter failed, will fall back to template: ${error.message}`, 'warn');
2099
+ return null;
2100
+ })
2101
+ );
2102
+ group1Labels.push('llm_rewrite');
2103
+ }
2104
+ }
2105
+
2106
+ // Superego self-reflection / disposition rewriting
2107
+ if (superegoDispositionRewriting) {
2108
+ if (promptRewritingStrategy === 'self_reflection') {
2109
+ group1Promises.push(
2110
+ promptRewriter.synthesizeSupergoSelfReflection({
2111
+ turnResults,
2112
+ consolidatedTrace,
2113
+ conversationHistory,
2114
+ priorSuperegoAssessments,
2115
+ config: rawProfile,
2116
+ }).catch(error => {
2117
+ log(`[evaluationRunner] Superego self-reflection failed: ${error.message}`, 'warn');
2118
+ return null;
2119
+ })
2120
+ );
2121
+ group1Labels.push('superego_self_reflection');
2122
+ } else {
2123
+ group1Promises.push(
2124
+ promptRewriter.synthesizeSuperegoDisposition({
2125
+ turnResults,
2126
+ consolidatedTrace,
2127
+ conversationHistory,
2128
+ priorSuperegoAssessments,
2129
+ config: rawProfile,
2130
+ }).catch(error => {
2131
+ log(`[evaluationRunner] Superego disposition rewriting failed: ${error.message}`, 'warn');
2132
+ return null;
2133
+ })
2134
+ );
2135
+ group1Labels.push('superego_disposition');
2136
+ }
2137
+ }
2138
+
2139
+ // Tutor profiles learner (Theory of Mind)
2140
+ if (otherEgoProfilingEnabled) {
2141
+ group1Promises.push(
2142
+ promptRewriter.synthesizeTutorProfileOfLearner({
1555
2143
  turnResults,
1556
2144
  consolidatedTrace,
1557
2145
  conversationHistory,
2146
+ priorProfile: tutorProfileOfLearner,
1558
2147
  config: rawProfile,
1559
- });
2148
+ }).catch(error => {
2149
+ log(`[evaluationRunner] Tutor profile of learner failed: ${error.message}`, 'warn');
2150
+ return null;
2151
+ })
2152
+ );
2153
+ group1Labels.push('tutor_profile');
2154
+ }
2155
+
2156
+ // Learner profiles tutor (bidirectional Theory of Mind)
2157
+ if (otherEgoProfilingEnabled && otherEgoBidirectional) {
2158
+ group1Promises.push(
2159
+ promptRewriter.synthesizeLearnerProfileOfTutor({
2160
+ turnResults,
2161
+ consolidatedTrace,
2162
+ conversationHistory,
2163
+ priorProfile: learnerProfileOfTutor,
2164
+ config: rawProfile,
2165
+ }).catch(error => {
2166
+ log(`[evaluationRunner] Learner profile of tutor failed: ${error.message}`, 'warn');
2167
+ return null;
2168
+ })
2169
+ );
2170
+ group1Labels.push('learner_profile');
2171
+ }
2172
+
2173
+ // Fire all group 1 calls in parallel
2174
+ const group1Results = await Promise.all(group1Promises);
2175
+ const group1Map = {};
2176
+ group1Labels.forEach((label, i) => { group1Map[label] = group1Results[i]; });
2177
+
2178
+ // ── Process group 1 results ─────────────────────────────────────
2179
+
2180
+ // Ego self-reflection / prompt rewriting result
2181
+ if (promptRewritingEnabled) {
2182
+ if (promptRewritingStrategy === 'self_reflection') {
2183
+ const egoReflResult = group1Map['ego_self_reflection'];
2184
+ sessionEvolution = egoReflResult?.text ?? null;
2185
+ if (sessionEvolution) {
2186
+ log(`[evaluationRunner] Ego self-reflection generated for turn ${turnIdx + 1}`, 'info');
2187
+ consolidatedTrace.push({
2188
+ agent: 'ego_self_reflection',
2189
+ action: 'rewrite',
2190
+ turnIndex: turnIdx,
2191
+ contextSummary: `Ego self-reflection generated for turn ${turnIdx + 1}`,
2192
+ detail: sessionEvolution,
2193
+ metrics: egoReflResult?.metrics ?? null,
2194
+ timestamp: new Date().toISOString(),
2195
+ });
2196
+ } else {
2197
+ log(`[evaluationRunner] Ego self-reflection returned empty, falling back to template for turn ${turnIdx + 1}`, 'warn');
2198
+ sessionEvolution = promptRewriter.synthesizeDirectives({
2199
+ turnResults,
2200
+ consolidatedTrace,
2201
+ conversationHistory,
2202
+ });
2203
+ }
2204
+ } else if (promptRewritingStrategy === 'llm') {
2205
+ const llmResult = group1Map['llm_rewrite'];
2206
+ sessionEvolution = llmResult?.text ?? null;
1560
2207
  if (sessionEvolution) {
1561
2208
  log(`[evaluationRunner] LLM rewriter generated directives for turn ${turnIdx + 1}`, 'info');
2209
+ } else {
2210
+ log(`[evaluationRunner] LLM rewriter returned empty, falling back to template for turn ${turnIdx + 1}`, 'warn');
2211
+ sessionEvolution = promptRewriter.synthesizeDirectives({
2212
+ turnResults,
2213
+ consolidatedTrace,
2214
+ conversationHistory,
2215
+ });
1562
2216
  }
1563
- } catch (error) {
1564
- log(`[evaluationRunner] LLM rewriter failed, falling back to template: ${error.message}`, 'warn');
2217
+ } else {
2218
+ // Template-based directive synthesis (deterministic, no LLM call)
1565
2219
  sessionEvolution = promptRewriter.synthesizeDirectives({
1566
2220
  turnResults,
1567
2221
  consolidatedTrace,
1568
2222
  conversationHistory,
1569
2223
  });
1570
2224
  }
1571
- } else {
1572
- // Template-based directive synthesis (deterministic, no LLM call)
1573
- sessionEvolution = promptRewriter.synthesizeDirectives({
1574
- turnResults,
1575
- consolidatedTrace,
1576
- conversationHistory,
1577
- });
2225
+ if (sessionEvolution) {
2226
+ log(`[evaluationRunner] Prompt rewriter (${promptRewritingStrategy}) generated ${sessionEvolution.split('\n').length - 2} directives for turn ${turnIdx + 1}`, 'info');
2227
+ }
1578
2228
  }
1579
- if (sessionEvolution) {
1580
- log(`[evaluationRunner] Prompt rewriter (${promptRewritingStrategy}) generated ${sessionEvolution.split('\n').length - 2} directives for turn ${turnIdx + 1}`, 'info');
2229
+
2230
+ // Superego self-reflection / disposition result
2231
+ if (superegoDispositionRewriting) {
2232
+ if (promptRewritingStrategy === 'self_reflection') {
2233
+ const seReflResult = group1Map['superego_self_reflection'];
2234
+ superegoEvolution = seReflResult?.text ?? null;
2235
+ if (superegoEvolution) {
2236
+ log(`[evaluationRunner] Superego self-reflection generated for turn ${turnIdx + 1}`, 'info');
2237
+ consolidatedTrace.push({
2238
+ agent: 'superego_self_reflection',
2239
+ action: 'rewrite',
2240
+ turnIndex: turnIdx,
2241
+ contextSummary: `Superego self-reflection generated for turn ${turnIdx + 1}`,
2242
+ detail: superegoEvolution,
2243
+ metrics: seReflResult?.metrics ?? null,
2244
+ timestamp: new Date().toISOString(),
2245
+ });
2246
+ } else {
2247
+ // Self-reflection returned empty — fall back to LLM disposition rewriting
2248
+ log(`[evaluationRunner] Superego self-reflection returned empty, falling back to LLM disposition for turn ${turnIdx + 1}`, 'warn');
2249
+ try {
2250
+ const dispFallback = await promptRewriter.synthesizeSuperegoDisposition({
2251
+ turnResults,
2252
+ consolidatedTrace,
2253
+ conversationHistory,
2254
+ priorSuperegoAssessments,
2255
+ config: rawProfile,
2256
+ });
2257
+ superegoEvolution = dispFallback?.text ?? null;
2258
+ } catch (error) {
2259
+ log(`[evaluationRunner] Superego disposition fallback also failed: ${error.message}`, 'warn');
2260
+ }
2261
+ }
2262
+ } else {
2263
+ const dispResult = group1Map['superego_disposition'];
2264
+ superegoEvolution = dispResult?.text ?? null;
2265
+ if (superegoEvolution) {
2266
+ log(`[evaluationRunner] Superego disposition rewriter generated evolution for turn ${turnIdx + 1}`, 'info');
2267
+ consolidatedTrace.push({
2268
+ agent: 'superego_disposition',
2269
+ action: 'rewrite',
2270
+ turnIndex: turnIdx,
2271
+ contextSummary: `Disposition evolution generated for turn ${turnIdx + 1}`,
2272
+ detail: superegoEvolution,
2273
+ metrics: dispResult?.metrics ?? null,
2274
+ timestamp: new Date().toISOString(),
2275
+ });
2276
+ }
2277
+ }
1581
2278
  }
2279
+
2280
+ // Tutor profile of learner result
2281
+ if (otherEgoProfilingEnabled) {
2282
+ const tutorProfResult = group1Map['tutor_profile'];
2283
+ if (tutorProfResult?.text) {
2284
+ tutorProfileOfLearner = tutorProfResult.text;
2285
+ log(`[evaluationRunner] Tutor profile of learner generated for turn ${turnIdx + 1}`, 'info');
2286
+ consolidatedTrace.push({
2287
+ agent: 'tutor_other_ego',
2288
+ action: 'profile_learner',
2289
+ turnIndex: turnIdx,
2290
+ contextSummary: `Tutor built mental model of learner after turn ${turnIdx + 1}`,
2291
+ detail: tutorProfileOfLearner,
2292
+ metrics: tutorProfResult.metrics ?? null,
2293
+ timestamp: new Date().toISOString(),
2294
+ });
2295
+ }
2296
+ }
2297
+
2298
+ // Learner profile of tutor result
2299
+ if (otherEgoProfilingEnabled && otherEgoBidirectional) {
2300
+ const learnerProfResult = group1Map['learner_profile'];
2301
+ if (learnerProfResult?.text) {
2302
+ learnerProfileOfTutor = learnerProfResult.text;
2303
+ log(`[evaluationRunner] Learner profile of tutor generated for turn ${turnIdx + 1}`, 'info');
2304
+ consolidatedTrace.push({
2305
+ agent: 'learner_other_ego',
2306
+ action: 'profile_tutor',
2307
+ turnIndex: turnIdx,
2308
+ contextSummary: `Learner built mental model of tutor after turn ${turnIdx + 1}`,
2309
+ detail: learnerProfileOfTutor,
2310
+ metrics: learnerProfResult.metrics ?? null,
2311
+ timestamp: new Date().toISOString(),
2312
+ });
2313
+ }
2314
+ }
2315
+
2316
+ // ── Group 2: Dependent on group 1 results ──────────────────────
2317
+ const group2Promises = [];
2318
+ const group2Labels = [];
2319
+
2320
+ // Parse quantitative behavioral parameters (sync — no LLM call)
2321
+ if (quantitativeDispositionEnabled && superegoEvolution) {
2322
+ const parsed = promptRewriter.parseBehavioralParameters(superegoEvolution);
2323
+ if (parsed) {
2324
+ behavioralOverrides = parsed;
2325
+ log(`[evaluationRunner] Behavioral overrides parsed: threshold=${parsed.rejection_threshold}, max_rejections=${parsed.max_rejections}, priority=[${parsed.priority_criteria.join(',')}], deprioritized=[${parsed.deprioritized_criteria.join(',')}]`, 'info');
2326
+ consolidatedTrace.push({
2327
+ agent: 'behavioral_overrides',
2328
+ action: 'parse',
2329
+ turnIndex: turnIdx,
2330
+ contextSummary: `Quantitative behavioral params: threshold=${parsed.rejection_threshold}, max=${parsed.max_rejections}`,
2331
+ detail: JSON.stringify(parsed),
2332
+ timestamp: new Date().toISOString(),
2333
+ });
2334
+ } else {
2335
+ log(`[evaluationRunner] No behavioral parameters found in superego reflection for turn ${turnIdx + 1} (quantitative_disposition enabled but no <behavioral_parameters> block)`, 'warn');
2336
+ }
2337
+ }
2338
+
2339
+ // Intersubjective recognition (depends on ego + superego self-reflections)
2340
+ if (intersubjectiveEnabled && superegoEvolution) {
2341
+ group2Promises.push(
2342
+ promptRewriter.synthesizeEgoResponseToSuperego({
2343
+ superegoReflection: superegoEvolution,
2344
+ egoReflection: sessionEvolution,
2345
+ turnResults,
2346
+ conversationHistory,
2347
+ config: rawProfile,
2348
+ }).catch(error => {
2349
+ log(`[evaluationRunner] Intersubjective ego response failed: ${error.message}`, 'warn');
2350
+ return null;
2351
+ })
2352
+ );
2353
+ group2Labels.push('intersubjective');
2354
+ }
2355
+
2356
+ // Strategy planning (depends on tutor profile)
2357
+ if (strategyPlanningEnabled && tutorProfileOfLearner) {
2358
+ group2Promises.push(
2359
+ promptRewriter.synthesizeStrategyPlan({
2360
+ learnerProfile: tutorProfileOfLearner,
2361
+ turnResults,
2362
+ conversationHistory,
2363
+ config: rawProfile,
2364
+ }).catch(error => {
2365
+ log(`[evaluationRunner] Strategy plan failed: ${error.message}`, 'warn');
2366
+ return null;
2367
+ })
2368
+ );
2369
+ group2Labels.push('strategy');
2370
+ }
2371
+
2372
+ // Fire group 2 in parallel (intersubjective + strategy are independent of each other)
2373
+ if (group2Promises.length > 0) {
2374
+ const group2Results = await Promise.all(group2Promises);
2375
+ const group2Map = {};
2376
+ group2Labels.forEach((label, i) => { group2Map[label] = group2Results[i]; });
2377
+
2378
+ // Process intersubjective result
2379
+ if (group2Map['intersubjective']) {
2380
+ const egoResponseText = group2Map['intersubjective']?.text ?? null;
2381
+ if (egoResponseText) {
2382
+ sessionEvolution = sessionEvolution
2383
+ ? sessionEvolution + '\n\n' + egoResponseText
2384
+ : egoResponseText;
2385
+ log(`[evaluationRunner] Intersubjective ego response to superego generated for turn ${turnIdx + 1}`, 'info');
2386
+ consolidatedTrace.push({
2387
+ agent: 'ego_intersubjective',
2388
+ action: 'respond_to_critic',
2389
+ turnIndex: turnIdx,
2390
+ contextSummary: `Ego responded to superego's self-reflection for turn ${turnIdx + 1}`,
2391
+ detail: egoResponseText,
2392
+ metrics: group2Map['intersubjective']?.metrics ?? null,
2393
+ timestamp: new Date().toISOString(),
2394
+ });
2395
+ }
2396
+ }
2397
+
2398
+ // Process strategy plan result
2399
+ if (group2Map['strategy']) {
2400
+ strategyPlan = group2Map['strategy']?.text ?? null;
2401
+ if (strategyPlan) {
2402
+ log(`[evaluationRunner] Strategy plan generated for turn ${turnIdx + 1}`, 'info');
2403
+ consolidatedTrace.push({
2404
+ agent: 'ego_strategy',
2405
+ action: 'plan',
2406
+ turnIndex: turnIdx,
2407
+ contextSummary: `Ego formulated strategy plan for turn ${turnIdx + 1}`,
2408
+ detail: strategyPlan,
2409
+ metrics: group2Map['strategy']?.metrics ?? null,
2410
+ timestamp: new Date().toISOString(),
2411
+ });
2412
+ }
2413
+ }
2414
+ }
2415
+
2416
+ const betweenTurnMs = Date.now() - betweenTurnStart;
2417
+ log(`[evaluationRunner] Between-turn processing completed in ${(betweenTurnMs / 1000).toFixed(1)}s (${group1Labels.length} parallel group-1, ${group2Labels.length} parallel group-2)`, 'info');
1582
2418
  }
1583
2419
 
2420
+ // Flush transcript: reflections (self-reflection, disposition, profiling, etc.)
2421
+ flushTranscript();
2422
+
1584
2423
  // Generate LLM learner response for next turn if ego_superego architecture
1585
2424
  // Note: check includes() to handle both 'ego_superego' and 'ego_superego_recognition'
1586
2425
  if (resolvedConfig.learnerArchitecture?.includes('ego_superego') && turnIdx < totalTurnCount - 1) {
@@ -1595,7 +2434,10 @@ async function runMultiTurnTest(scenario, config, fullScenario, options = {}) {
1595
2434
  })),
1596
2435
  learnerProfile: resolvedConfig.learnerArchitecture,
1597
2436
  personaId: fullScenario.learner_persona || 'eager_novice',
1598
- modelOverride: config.modelOverride || null,
2437
+ modelOverride: config.learnerModelOverride || config.modelOverride || null,
2438
+ profileContext: (otherEgoBidirectional && learnerProfileOfTutor)
2439
+ ? promptRewriter.formatProfileForInjection(learnerProfileOfTutor, 'tutor')
2440
+ : null,
1599
2441
  });
1600
2442
 
1601
2443
  // Override scripted message with LLM-generated one
@@ -1633,10 +2475,32 @@ async function runMultiTurnTest(scenario, config, fullScenario, options = {}) {
1633
2475
  }
1634
2476
 
1635
2477
  log(`[evaluationRunner] Generated LLM learner response (ego_superego): "${learnerResponse.message.substring(0, 80)}..."`, 'info');
2478
+
2479
+ // Flush transcript: learner deliberation
2480
+ flushTranscript();
1636
2481
  }
1637
2482
  }
1638
2483
  }
1639
2484
 
2485
+ // Clear turn progress from run metadata now that all turns are complete
2486
+ if (runId) {
2487
+ evaluationStore.updateRun(runId, {
2488
+ metadata: { turnProgress: null }
2489
+ });
2490
+ }
2491
+
2492
+ // Write complete transcript file at end (for post-hoc viewing)
2493
+ if (transcriptMode && transcriptPath) {
2494
+ const fullTranscript = formatTranscript(consolidatedTrace, {
2495
+ detail: 'play',
2496
+ scenarioName: fullScenario.name || scenario.id,
2497
+ profileName: config.profileName,
2498
+ totalTurns: turnResults.length,
2499
+ });
2500
+ fs.writeFileSync(transcriptPath, fullTranscript);
2501
+ log(`[evaluationRunner] Transcript written: ${transcriptPath}`, 'info');
2502
+ }
2503
+
1640
2504
  // 5. Aggregate scores across turns
1641
2505
  const validTurnScores = turnResults.filter(t => t.turnScore !== null).map(t => t.turnScore);
1642
2506
  const overallScore = validTurnScores.length > 0
@@ -1792,7 +2656,7 @@ async function runMultiTurnTest(scenario, config, fullScenario, options = {}) {
1792
2656
  superegoModel: resolvedConfig.superegoModel
1793
2657
  ? `${resolvedConfig.superegoModel.provider}.${resolvedConfig.superegoModel.model}`
1794
2658
  : null,
1795
- hyperparameters: config.hyperparameters,
2659
+ hyperparameters: resolvedConfig.hyperparameters || config.hyperparameters,
1796
2660
  suggestions: turnResults.map(t => t.suggestion).filter(Boolean),
1797
2661
  success: true,
1798
2662
  latencyMs: totalLatencyMs,
@@ -1876,6 +2740,7 @@ export async function resumeEvaluation(options = {}) {
1876
2740
  const runsPerConfig = metadata.runsPerConfig || 1;
1877
2741
  const skipRubricEval = metadata.skipRubricEval || false;
1878
2742
  const modelOverride = metadata.modelOverride || null;
2743
+ const learnerModelOverride = metadata.learnerModelOverride || null;
1879
2744
 
1880
2745
  // 3. Get existing results for completion checking
1881
2746
  const existingResults = evaluationStore.getResults(runId);
@@ -1917,10 +2782,13 @@ export async function resumeEvaluation(options = {}) {
1917
2782
  label: name,
1918
2783
  }));
1919
2784
 
1920
- // 6. Re-apply modelOverride if present in metadata
2785
+ // 6. Re-apply model overrides if present in metadata
1921
2786
  if (modelOverride) {
1922
2787
  targetConfigs = targetConfigs.map(c => ({ ...c, modelOverride }));
1923
2788
  }
2789
+ if (learnerModelOverride) {
2790
+ targetConfigs = targetConfigs.map(c => ({ ...c, learnerModelOverride }));
2791
+ }
1924
2792
 
1925
2793
  // 6. Count successful results per (profile, scenario) combo and fill up to runsPerConfig.
1926
2794
  // Failed results are excluded so they get retried.
@@ -1971,6 +2839,7 @@ export async function resumeEvaluation(options = {}) {
1971
2839
  console.log(` Profiles: ${profileNames.join(', ')}`);
1972
2840
  console.log(` Scenarios: ${targetScenarios.length}`);
1973
2841
  if (modelOverride) console.log(` Model override: ${modelOverride}`);
2842
+ if (learnerModelOverride) console.log(` Learner model override: ${learnerModelOverride}`);
1974
2843
 
1975
2844
  // Initialize content resolver (same as runEvaluation)
1976
2845
  const contentConfig = evalConfigLoader.getContentConfig();
@@ -2128,29 +2997,36 @@ export async function resumeEvaluation(options = {}) {
2128
2997
  completedTests++;
2129
2998
  log(` ${formatProgress(completedTests, totalRemainingTests, runStartTime)} ${profileLabel} / ${scenario.id}: ERROR - ${error.message}`);
2130
2999
 
2131
- // Store failed result so it shows up in the database
2132
- const failedResult = {
2133
- scenarioId: scenario.id,
2134
- scenarioName: scenario.name || scenario.id,
2135
- profileName: config.profileName,
2136
- provider: config.provider || config.ego?.provider || 'unknown',
2137
- model: config.model || config.ego?.model || 'unknown',
2138
- egoModel: config.egoModel
2139
- ? `${config.egoModel.provider}.${config.egoModel.model}`
2140
- : config.ego ? `${config.ego.provider}.${config.ego.model}` : null,
2141
- superegoModel: config.superegoModel
2142
- ? `${config.superegoModel.provider}.${config.superegoModel.model}`
2143
- : config.superego ? `${config.superego.provider}.${config.superego.model}` : null,
2144
- factors: config.factors || null,
2145
- learnerArchitecture: config.learnerArchitecture || null,
2146
- success: false,
2147
- errorMessage: error.message,
2148
- };
2149
- try {
2150
- evaluationStore.storeResult(runId, failedResult);
2151
- results.push(failedResult);
2152
- } catch (storeErr) {
2153
- log(` [WARNING] Failed to store error result: ${storeErr.message}`);
3000
+ // Only store failed results for permanent errors; skip transient/retriable ones
3001
+ const errMsg = error.message || '';
3002
+ const isTransient = /429|rate limit|too many requests|503|502|timeout|ECONNREFUSED|ECONNRESET|ETIMEDOUT|terminated|unavailable|failed to generate suggestions/i.test(errMsg);
3003
+
3004
+ if (!isTransient) {
3005
+ const failedResult = {
3006
+ scenarioId: scenario.id,
3007
+ scenarioName: scenario.name || scenario.id,
3008
+ profileName: config.profileName,
3009
+ provider: config.provider || config.ego?.provider || 'unknown',
3010
+ model: config.model || config.ego?.model || 'unknown',
3011
+ egoModel: config.egoModel
3012
+ ? `${config.egoModel.provider}.${config.egoModel.model}`
3013
+ : config.ego ? `${config.ego.provider}.${config.ego.model}` : null,
3014
+ superegoModel: config.superegoModel
3015
+ ? `${config.superegoModel.provider}.${config.superegoModel.model}`
3016
+ : config.superego ? `${config.superego.provider}.${config.superego.model}` : null,
3017
+ factors: config.factors || null,
3018
+ learnerArchitecture: config.learnerArchitecture || null,
3019
+ success: false,
3020
+ errorMessage: error.message,
3021
+ };
3022
+ try {
3023
+ evaluationStore.storeResult(runId, failedResult);
3024
+ results.push(failedResult);
3025
+ } catch (storeErr) {
3026
+ log(` [WARNING] Failed to store error result: ${storeErr.message}`);
3027
+ }
3028
+ } else {
3029
+ log(` [SKIPPED] Transient error, not storing empty row (resumable): ${errMsg.substring(0, 100)}`);
2154
3030
  }
2155
3031
 
2156
3032
  progressLogger.testError({
@@ -2287,6 +3163,7 @@ export async function quickTest(config, options = {}) {
2287
3163
  onLog,
2288
3164
  superegoStrategy = null, // Superego intervention strategy
2289
3165
  judgeOverride = null, // Override judge model for this run
3166
+ dryRun = false,
2290
3167
  } = options;
2291
3168
 
2292
3169
  const scenarios = [evalConfigLoader.listScenarios().find(s => s.id === scenarioId)].filter(Boolean);
@@ -2294,7 +3171,7 @@ export async function quickTest(config, options = {}) {
2294
3171
  throw new Error(`Scenario not found: ${scenarioId}`);
2295
3172
  }
2296
3173
 
2297
- const result = await runSingleTest(scenarios[0], config, { verbose, skipRubricEval, outputSize, onLog, superegoStrategy, judgeOverride });
3174
+ const result = await runSingleTest(scenarios[0], config, { verbose, skipRubricEval, outputSize, onLog, superegoStrategy, judgeOverride, dryRun });
2298
3175
  return result;
2299
3176
  }
2300
3177