@machinespirits/eval 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. package/README.md +91 -9
  2. package/config/eval-settings.yaml +3 -3
  3. package/config/paper-manifest.json +486 -0
  4. package/config/providers.yaml +9 -6
  5. package/config/tutor-agents.yaml +2261 -0
  6. package/content/README.md +23 -0
  7. package/content/courses/479/course.md +53 -0
  8. package/content/courses/479/lecture-1.md +361 -0
  9. package/content/courses/479/lecture-2.md +360 -0
  10. package/content/courses/479/lecture-3.md +655 -0
  11. package/content/courses/479/lecture-4.md +530 -0
  12. package/content/courses/479/lecture-5.md +326 -0
  13. package/content/courses/479/lecture-6.md +346 -0
  14. package/content/courses/479/lecture-7.md +326 -0
  15. package/content/courses/479/lecture-8.md +273 -0
  16. package/content/courses/479/roadmap-slides.md +656 -0
  17. package/content/manifest.yaml +8 -0
  18. package/docs/research/build.sh +44 -20
  19. package/docs/research/figures/figure10.png +0 -0
  20. package/docs/research/figures/figure11.png +0 -0
  21. package/docs/research/figures/figure3.png +0 -0
  22. package/docs/research/figures/figure4.png +0 -0
  23. package/docs/research/figures/figure5.png +0 -0
  24. package/docs/research/figures/figure6.png +0 -0
  25. package/docs/research/figures/figure7.png +0 -0
  26. package/docs/research/figures/figure8.png +0 -0
  27. package/docs/research/figures/figure9.png +0 -0
  28. package/docs/research/header.tex +23 -2
  29. package/docs/research/paper-full.md +941 -285
  30. package/docs/research/paper-short.md +216 -585
  31. package/docs/research/references.bib +132 -0
  32. package/docs/research/slides-header.tex +188 -0
  33. package/docs/research/slides-pptx.md +363 -0
  34. package/docs/research/slides.md +531 -0
  35. package/docs/research/style-reference-pptx.py +199 -0
  36. package/package.json +6 -5
  37. package/scripts/analyze-eval-results.js +69 -17
  38. package/scripts/analyze-mechanism-traces.js +763 -0
  39. package/scripts/analyze-modulation-learning.js +498 -0
  40. package/scripts/analyze-prosthesis.js +144 -0
  41. package/scripts/analyze-run.js +264 -79
  42. package/scripts/assess-transcripts.js +853 -0
  43. package/scripts/browse-transcripts.js +854 -0
  44. package/scripts/check-parse-failures.js +73 -0
  45. package/scripts/code-dialectical-modulation.js +1320 -0
  46. package/scripts/download-data.sh +55 -0
  47. package/scripts/eval-cli.js +106 -18
  48. package/scripts/generate-paper-figures.js +663 -0
  49. package/scripts/generate-paper-figures.py +577 -76
  50. package/scripts/generate-paper-tables.js +299 -0
  51. package/scripts/qualitative-analysis-ai.js +3 -3
  52. package/scripts/render-sequence-diagram.js +694 -0
  53. package/scripts/test-latency.js +210 -0
  54. package/scripts/test-rate-limit.js +95 -0
  55. package/scripts/test-token-budget.js +332 -0
  56. package/scripts/validate-paper-manifest.js +670 -0
  57. package/services/__tests__/evalConfigLoader.test.js +2 -2
  58. package/services/__tests__/learnerRubricEvaluator.test.js +361 -0
  59. package/services/__tests__/learnerTutorInteractionEngine.test.js +326 -0
  60. package/services/evaluationRunner.js +975 -98
  61. package/services/evaluationStore.js +12 -4
  62. package/services/learnerTutorInteractionEngine.js +27 -2
  63. package/services/mockProvider.js +133 -0
  64. package/services/promptRewriter.js +1471 -5
  65. package/services/rubricEvaluator.js +55 -2
  66. package/services/transcriptFormatter.js +675 -0
  67. package/docs/EVALUATION-VARIABLES.md +0 -589
  68. package/docs/REPLICATION-PLAN.md +0 -577
  69. package/scripts/analyze-run.mjs +0 -282
  70. package/scripts/compare-runs.js +0 -44
  71. package/scripts/compare-suggestions.js +0 -80
  72. package/scripts/dig-into-run.js +0 -158
  73. package/scripts/show-failed-suggestions.js +0 -64
  74. package/scripts/{check-run.mjs → check-run.js} +0 -0
@@ -0,0 +1,55 @@
1
+ #!/usr/bin/env bash
2
+ #
3
+ # Download evaluation databases from the GitHub release.
4
+ # Creates symlinks in data/ pointing to ~/.machinespirits-data/.
5
+ #
6
+ # Usage: ./scripts/download-data.sh [--tag v2.3.14]
7
+ #
8
+ # Requires: gh (GitHub CLI), authenticated
9
+
10
+ set -euo pipefail
11
+
12
+ REPO="liammagee/machinespirits-eval"
13
+ TAG="${1:-v2.3.14}"
14
+ DATA_DIR="$HOME/.machinespirits-data"
15
+ LINK_DIR="$(cd "$(dirname "$0")/.." && pwd)/data"
16
+
17
+ # Strip --tag prefix if provided
18
+ TAG="${TAG#--tag }"
19
+ TAG="${TAG#--tag=}"
20
+
21
+ echo "Downloading evaluation databases from release ${TAG}..."
22
+ echo " Target: ${DATA_DIR}"
23
+ echo ""
24
+
25
+ mkdir -p "${DATA_DIR}"
26
+
27
+ DB_FILES=(evaluations.db learner-writing-pad.db tutor-writing-pad.db writing-pads.db)
28
+
29
+ for f in "${DB_FILES[@]}"; do
30
+ if [ -f "${DATA_DIR}/${f}" ]; then
31
+ echo " [skip] ${f} (already exists)"
32
+ else
33
+ echo " [download] ${f}..."
34
+ gh release download "${TAG}" --repo "${REPO}" --pattern "${f}" --dir "${DATA_DIR}"
35
+ fi
36
+ done
37
+
38
+ echo ""
39
+ echo "Creating symlinks in ${LINK_DIR}/..."
40
+
41
+ for f in "${DB_FILES[@]}"; do
42
+ target="${DATA_DIR}/${f}"
43
+ link="${LINK_DIR}/${f}"
44
+ if [ -L "${link}" ]; then
45
+ echo " [skip] ${f} (symlink exists)"
46
+ elif [ -f "${link}" ]; then
47
+ echo " [skip] ${f} (regular file exists — remove manually if you want a symlink)"
48
+ else
49
+ ln -s "${target}" "${link}"
50
+ echo " [link] ${f} -> ${target}"
51
+ fi
52
+ done
53
+
54
+ echo ""
55
+ echo "Done. Verify with: node -e \"import Database from 'better-sqlite3'; const db = new Database('data/evaluations.db'); console.log(db.prepare('SELECT COUNT(*) as n FROM evaluation_results').get())\""
@@ -44,6 +44,7 @@ import 'dotenv/config';
44
44
  * --refresh <ms> Refresh interval for 'watch' (default: 2000) or 'evaluate --follow' (default: 5000)
45
45
  * --force Actually complete stale runs (for 'cleanup'; dry-run without it)
46
46
  * --older-than <min> Staleness threshold in minutes (for 'cleanup', default: 30)
47
+ * --dry-run Use mock data instead of API calls (no API keys required)
47
48
  *
48
49
  * The default `run` uses the 2x2x2 factorial design:
49
50
  * Factor A: Recognition prompts (off / on)
@@ -68,6 +69,7 @@ import { buildLearnerEvaluationPrompt, calculateLearnerOverallScore } from '../s
68
69
  import { readProgressLog, getProgressLogPath } from '../services/progressLogger.js';
69
70
  import * as evalConfigLoader from '../services/evalConfigLoader.js';
70
71
  const { getScenario } = evalConfigLoader;
72
+ import { formatTranscript } from '../services/transcriptFormatter.js';
71
73
  import { spawn } from 'child_process';
72
74
  import readline from 'readline';
73
75
  import fs from 'fs';
@@ -767,15 +769,17 @@ async function main() {
767
769
  const scenarioId = getOption('scenario', 'new_user_first_visit');
768
770
  const profile = getOption('profile', 'budget');
769
771
  const verbose = getFlag('verbose');
772
+ const dryRun = getFlag('dry-run');
770
773
  const evalSettingsQt = evalConfigLoader.getEvalSettings();
771
- const skipRubricEval = getFlag('skip-rubric') || !evalSettingsQt.useAIJudge;
774
+ const skipRubricEval = dryRun ? false : (getFlag('skip-rubric') || !evalSettingsQt.useAIJudge);
772
775
  const config = { profileName: profile };
773
776
 
774
- console.log(`\nRunning quick test (profile: ${profile}, scenario: ${scenarioId})...\n`);
777
+ console.log(`\nRunning quick test (profile: ${profile}, scenario: ${scenarioId}${dryRun ? ', dry-run' : ''})...\n`);
775
778
  const result = await evaluationRunner.quickTest(config, {
776
779
  scenarioId,
777
780
  verbose,
778
781
  skipRubricEval,
782
+ dryRun,
779
783
  });
780
784
  console.log('\nResult:');
781
785
  console.log(JSON.stringify(result, null, 2));
@@ -784,9 +788,11 @@ async function main() {
784
788
 
785
789
  case 'run': {
786
790
  const verbose = getFlag('verbose');
791
+ const dryRun = getFlag('dry-run');
787
792
  // CLI --use-rubric forces rubric on; --skip-rubric forces off; otherwise use config default
793
+ // --dry-run always enables rubric (mock judge has no cost)
788
794
  const evalSettings = evalConfigLoader.getEvalSettings();
789
- const skipRubricEval = getFlag('use-rubric') ? false : (getFlag('skip-rubric') || !evalSettings.useAIJudge);
795
+ const skipRubricEval = dryRun ? false : (getFlag('use-rubric') ? false : (getFlag('skip-rubric') || !evalSettings.useAIJudge));
790
796
  const runsPerConfig = parseInt(getOption('runs', '1'), 10);
791
797
  const parallelism = parseInt(getOption('parallelism', '2'), 10);
792
798
  const description = getOption('description');
@@ -796,6 +802,9 @@ async function main() {
796
802
  const modelOverride = getOption('model');
797
803
  const egoModelOverride = getOption('ego-model');
798
804
  const superegoModelOverride = getOption('superego-model');
805
+ const learnerModelOverride = getOption('learner-model');
806
+ const transcriptMode = getFlag('transcript');
807
+ const maxTokensOverride = getOption('max-tokens');
799
808
 
800
809
  // --cluster and --scenario are mutually exclusive
801
810
  if (clusterOpt && scenarioOpt) {
@@ -847,6 +856,8 @@ async function main() {
847
856
  if (egoModelOverride) console.log(` Ego model override: ${egoModelOverride}`);
848
857
  if (superegoModelOverride) console.log(` Superego model override: ${superegoModelOverride}`);
849
858
  }
859
+ if (learnerModelOverride) console.log(` Learner model override: ${learnerModelOverride}`);
860
+ if (maxTokensOverride) console.log(` Max tokens override: ${maxTokensOverride}`);
850
861
  console.log('');
851
862
  }
852
863
 
@@ -860,12 +871,16 @@ async function main() {
860
871
  runsPerConfig,
861
872
  parallelism,
862
873
  skipRubricEval,
863
- description: description || (isFactorial ? '2x2x2 Factorial Evaluation' : null),
874
+ description: description || (dryRun ? 'Dry-run evaluation (mock data)' : (isFactorial ? '2x2x2 Factorial Evaluation' : null)),
864
875
  verbose,
865
876
  scenarioFilter: clusterOpt || null,
866
877
  modelOverride: modelOverride || null,
867
878
  egoModelOverride: egoModelOverride || null,
868
879
  superegoModelOverride: superegoModelOverride || null,
880
+ learnerModelOverride: learnerModelOverride || null,
881
+ dryRun,
882
+ transcriptMode,
883
+ maxTokensOverride: maxTokensOverride ? parseInt(maxTokensOverride, 10) : null,
869
884
  });
870
885
  // Extract unique model aliases used across all configs (ego + superego)
871
886
  const extractAlias = (raw) => {
@@ -884,6 +899,63 @@ async function main() {
884
899
  if (modelAliases.length > 0) {
885
900
  console.log(`Models: ${modelAliases.join(', ')}`);
886
901
  }
902
+
903
+ // Token / cost / latency summary report
904
+ if (result.runId) {
905
+ const runResults = evaluationStore.getResults(result.runId);
906
+ if (runResults.length > 0) {
907
+ console.log('\n' + '='.repeat(80));
908
+ console.log(' TOKEN & COST SUMMARY');
909
+ console.log('='.repeat(80));
910
+
911
+ // Per-result breakdown
912
+ const header = ' # | Scenario | In Tok | Out Tok | API | Rounds | Latency | Cost';
913
+ const divider = ' ' + '-'.repeat(header.length - 2);
914
+ console.log(header);
915
+ console.log(divider);
916
+
917
+ let totalIn = 0, totalOut = 0, totalApi = 0, totalRounds = 0, totalLatency = 0, totalCost = 0;
918
+
919
+ runResults.forEach((r, i) => {
920
+ const inTok = r.input_tokens || r.inputTokens || 0;
921
+ const outTok = r.output_tokens || r.outputTokens || 0;
922
+ const apiCalls = r.api_calls || r.apiCalls || 0;
923
+ const rounds = r.dialogue_rounds || r.dialogueRounds || 0;
924
+ const latMs = r.latency_ms || r.latencyMs || 0;
925
+ const cost = r.cost || 0;
926
+
927
+ totalIn += inTok;
928
+ totalOut += outTok;
929
+ totalApi += apiCalls;
930
+ totalRounds += rounds;
931
+ totalLatency += latMs;
932
+ totalCost += cost;
933
+
934
+ const scenLabel = (r.scenario_id || r.scenarioId || '').substring(0, 32).padEnd(32);
935
+ const latStr = latMs >= 1000 ? `${(latMs / 1000).toFixed(1)}s` : `${latMs}ms`;
936
+ const costStr = cost > 0 ? `$${cost.toFixed(4)}` : '-';
937
+ console.log(` ${String(i + 1).padStart(2)} | ${scenLabel} | ${String(inTok).padStart(7)} | ${String(outTok).padStart(7)} | ${String(apiCalls).padStart(4)} | ${String(rounds).padStart(6)} | ${latStr.padStart(9)} | ${costStr}`);
938
+ });
939
+
940
+ console.log(divider);
941
+ const totalLatStr = totalLatency >= 1000 ? `${(totalLatency / 1000).toFixed(1)}s` : `${totalLatency}ms`;
942
+ const totalCostStr = totalCost > 0 ? `$${totalCost.toFixed(4)}` : '-';
943
+ console.log(` ${'TOTAL'.padStart(2)} | ${''.padEnd(32)} | ${String(totalIn).padStart(7)} | ${String(totalOut).padStart(7)} | ${String(totalApi).padStart(4)} | ${String(totalRounds).padStart(6)} | ${totalLatStr.padStart(9)} | ${totalCostStr}`);
944
+
945
+ // Per-token cost efficiency
946
+ const totalTok = totalIn + totalOut;
947
+ if (totalTok > 0) {
948
+ const avgLatPerCall = totalApi > 0 ? (totalLatency / totalApi / 1000).toFixed(2) : '-';
949
+ console.log(`\n Tokens: ${totalTok.toLocaleString()} total (${totalIn.toLocaleString()} in + ${totalOut.toLocaleString()} out)`);
950
+ console.log(` Avg latency/API call: ${avgLatPerCall}s | Results: ${runResults.length} | API calls: ${totalApi}`);
951
+ if (totalCost > 0) {
952
+ console.log(` Cost/1K tokens: $${(totalCost / totalTok * 1000).toFixed(4)}`);
953
+ }
954
+ }
955
+ console.log('='.repeat(80));
956
+ }
957
+ }
958
+
887
959
  console.log(JSON.stringify(result, null, 2));
888
960
 
889
961
  // Factorial post-analysis: print cell means and ANOVA for each score type
@@ -964,6 +1036,11 @@ async function main() {
964
1036
  } else if (run.completedResults > 0) {
965
1037
  progress = `${run.completedResults} done`;
966
1038
  }
1039
+ // Show per-turn progress for running multi-turn tests
1040
+ const turnProgress = run.metadata?.turnProgress;
1041
+ if (run.status === 'running' && turnProgress) {
1042
+ progress += ` T${turnProgress.current}/${turnProgress.total}`;
1043
+ }
967
1044
  const avg = run.avgScore != null ? run.avgScore.toFixed(1) : '--';
968
1045
  // Duration formatting
969
1046
  let duration = '--';
@@ -1223,11 +1300,18 @@ async function main() {
1223
1300
  case 'transcript': {
1224
1301
  const runId = args.find(a => !a.startsWith('--') && a !== 'transcript');
1225
1302
  if (!runId) {
1226
- console.error('Usage: eval-cli.js transcript <runId> [--scenario <id>]');
1303
+ console.error('Usage: eval-cli.js transcript <runId> [--scenario <id>] [--detail play|compact|messages-only|full|bilateral]');
1227
1304
  process.exit(1);
1228
1305
  }
1229
1306
 
1230
1307
  const scenarioFilter = getOption('scenario');
1308
+ // Determine detail level: --compact and --messages-only are shortcuts, --detail is explicit
1309
+ let detailLevel = getOption('detail') || 'play';
1310
+ if (getFlag('compact')) detailLevel = 'compact';
1311
+ if (getFlag('messages-only')) detailLevel = 'messages-only';
1312
+ if (getFlag('full')) detailLevel = 'full';
1313
+ if (getFlag('bilateral')) detailLevel = 'bilateral';
1314
+
1231
1315
  const results = evaluationStore.getResults(runId, {
1232
1316
  scenarioId: scenarioFilter || null,
1233
1317
  });
@@ -1237,7 +1321,7 @@ async function main() {
1237
1321
  break;
1238
1322
  }
1239
1323
 
1240
- console.log(`\nTranscripts for run: ${runId} (${results.length} results)\n`);
1324
+ console.log(`\nTranscripts for run: ${runId} (${results.length} results, detail: ${detailLevel})\n`);
1241
1325
 
1242
1326
  for (const result of results) {
1243
1327
  console.log('='.repeat(80));
@@ -1246,10 +1330,9 @@ async function main() {
1246
1330
  console.log(`Score: ${result.overallScore != null ? result.overallScore.toFixed(1) : '--'} | Success: ${result.success}`);
1247
1331
  console.log('-'.repeat(80));
1248
1332
 
1249
- // Try dialogue log file first
1333
+ // Try dialogue log file first (rich trace with metadata)
1250
1334
  let printed = false;
1251
1335
  if (result.dialogueId) {
1252
- // Search for the dialogue file (may include date prefix in filename)
1253
1336
  const files = fs.existsSync(LOGS_DIR)
1254
1337
  ? fs.readdirSync(LOGS_DIR).filter(f => f.includes(result.dialogueId))
1255
1338
  : [];
@@ -1258,24 +1341,29 @@ async function main() {
1258
1341
  try {
1259
1342
  const dialogue = JSON.parse(fs.readFileSync(path.join(LOGS_DIR, files[0]), 'utf-8'));
1260
1343
  const trace = dialogue.dialogueTrace || [];
1261
- for (const entry of trace) {
1262
- console.log(formatTraceEntry(entry));
1263
- console.log('');
1344
+ if (trace.length > 0) {
1345
+ const formatted = formatTranscript(trace, {
1346
+ detail: detailLevel,
1347
+ scenarioName: result.scenarioName || result.scenarioId,
1348
+ profileName: result.profileName,
1349
+ totalTurns: dialogue.totalTurns || 0,
1350
+ });
1351
+ console.log(formatted);
1352
+ printed = true;
1264
1353
  }
1265
- if (trace.length > 0) printed = true;
1266
1354
  } catch (e) {
1267
- // Fall through to suggestions
1355
+ // Fall through to legacy format
1268
1356
  }
1269
1357
  }
1270
1358
  }
1271
1359
 
1272
- // Fall back to suggestions / raw response from DB
1360
+ // Fall back to legacy format (suggestions / raw response from DB)
1273
1361
  if (!printed) {
1274
1362
  if (result.suggestions?.length > 0) {
1275
1363
  console.log('Suggestions:');
1276
1364
  for (const s of result.suggestions) {
1277
1365
  const text = typeof s === 'string' ? s : (s.text || s.content || JSON.stringify(s));
1278
- console.log(` ${text}`);
1366
+ console.log(` \u2022 ${text}`);
1279
1367
  }
1280
1368
  console.log('');
1281
1369
  }
@@ -1831,7 +1919,7 @@ async function main() {
1831
1919
  requiredMissing: parsed.validation?.required_missing || [],
1832
1920
  forbiddenFound: parsed.validation?.forbidden_found || [],
1833
1921
  summary: parsed.summary,
1834
- judgeModel: modelOverride ? `claude-code/${modelOverride}` : 'claude-code/opus',
1922
+ judgeModel: modelOverride ? `claude-code/${modelOverride}` : 'claude-opus-4.6',
1835
1923
  };
1836
1924
 
1837
1925
  evaluationStore.updateResultScores(result.id, evaluation);
@@ -2023,7 +2111,7 @@ async function main() {
2023
2111
  recognitionScore,
2024
2112
  scores: normalizedScores,
2025
2113
  summary: parsed.summary,
2026
- judgeModel: modelOverride ? `claude-code/${modelOverride}` : 'claude-code/opus',
2114
+ judgeModel: modelOverride ? `claude-code/${modelOverride}` : 'claude-opus-4.6',
2027
2115
  };
2028
2116
 
2029
2117
  // Save to dialogue log
@@ -2578,7 +2666,7 @@ async function main() {
2578
2666
  evaluationStore.updateResultLearnerScores(result.id, {
2579
2667
  scores: turnScores,
2580
2668
  overallScore: dialogueLearnerScore,
2581
- judgeModel: modelOverride ? `claude-code/${modelOverride}` : 'claude-code/opus',
2669
+ judgeModel: modelOverride ? `claude-code/${modelOverride}` : 'claude-opus-4.6',
2582
2670
  });
2583
2671
 
2584
2672
  allScores.push(dialogueLearnerScore);