@machinespirits/eval 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +91 -9
- package/config/eval-settings.yaml +3 -3
- package/config/paper-manifest.json +486 -0
- package/config/providers.yaml +9 -6
- package/config/tutor-agents.yaml +2261 -0
- package/content/README.md +23 -0
- package/content/courses/479/course.md +53 -0
- package/content/courses/479/lecture-1.md +361 -0
- package/content/courses/479/lecture-2.md +360 -0
- package/content/courses/479/lecture-3.md +655 -0
- package/content/courses/479/lecture-4.md +530 -0
- package/content/courses/479/lecture-5.md +326 -0
- package/content/courses/479/lecture-6.md +346 -0
- package/content/courses/479/lecture-7.md +326 -0
- package/content/courses/479/lecture-8.md +273 -0
- package/content/courses/479/roadmap-slides.md +656 -0
- package/content/manifest.yaml +8 -0
- package/docs/research/build.sh +44 -20
- package/docs/research/figures/figure10.png +0 -0
- package/docs/research/figures/figure11.png +0 -0
- package/docs/research/figures/figure3.png +0 -0
- package/docs/research/figures/figure4.png +0 -0
- package/docs/research/figures/figure5.png +0 -0
- package/docs/research/figures/figure6.png +0 -0
- package/docs/research/figures/figure7.png +0 -0
- package/docs/research/figures/figure8.png +0 -0
- package/docs/research/figures/figure9.png +0 -0
- package/docs/research/header.tex +23 -2
- package/docs/research/paper-full.md +941 -285
- package/docs/research/paper-short.md +216 -585
- package/docs/research/references.bib +132 -0
- package/docs/research/slides-header.tex +188 -0
- package/docs/research/slides-pptx.md +363 -0
- package/docs/research/slides.md +531 -0
- package/docs/research/style-reference-pptx.py +199 -0
- package/package.json +6 -5
- package/scripts/analyze-eval-results.js +69 -17
- package/scripts/analyze-mechanism-traces.js +763 -0
- package/scripts/analyze-modulation-learning.js +498 -0
- package/scripts/analyze-prosthesis.js +144 -0
- package/scripts/analyze-run.js +264 -79
- package/scripts/assess-transcripts.js +853 -0
- package/scripts/browse-transcripts.js +854 -0
- package/scripts/check-parse-failures.js +73 -0
- package/scripts/code-dialectical-modulation.js +1320 -0
- package/scripts/download-data.sh +55 -0
- package/scripts/eval-cli.js +106 -18
- package/scripts/generate-paper-figures.js +663 -0
- package/scripts/generate-paper-figures.py +577 -76
- package/scripts/generate-paper-tables.js +299 -0
- package/scripts/qualitative-analysis-ai.js +3 -3
- package/scripts/render-sequence-diagram.js +694 -0
- package/scripts/test-latency.js +210 -0
- package/scripts/test-rate-limit.js +95 -0
- package/scripts/test-token-budget.js +332 -0
- package/scripts/validate-paper-manifest.js +670 -0
- package/services/__tests__/evalConfigLoader.test.js +2 -2
- package/services/__tests__/learnerRubricEvaluator.test.js +361 -0
- package/services/__tests__/learnerTutorInteractionEngine.test.js +326 -0
- package/services/evaluationRunner.js +975 -98
- package/services/evaluationStore.js +12 -4
- package/services/learnerTutorInteractionEngine.js +27 -2
- package/services/mockProvider.js +133 -0
- package/services/promptRewriter.js +1471 -5
- package/services/rubricEvaluator.js +55 -2
- package/services/transcriptFormatter.js +675 -0
- package/docs/EVALUATION-VARIABLES.md +0 -589
- package/docs/REPLICATION-PLAN.md +0 -577
- package/scripts/analyze-run.mjs +0 -282
- package/scripts/compare-runs.js +0 -44
- package/scripts/compare-suggestions.js +0 -80
- package/scripts/dig-into-run.js +0 -158
- package/scripts/show-failed-suggestions.js +0 -64
- /package/scripts/{check-run.mjs → check-run.js} +0 -0
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
import fs from 'fs';
|
|
2
|
+
import Database from 'better-sqlite3';
|
|
3
|
+
/**
 * Superego parse-failure report for one evaluation run.
 *
 * Usage: node <this script> [runId]
 *
 * Reads the scored dialogues of the run from the `evaluation_results` table,
 * opens each dialogue's JSON trace under `logs/tutor-dialogues/`, and tallies
 * every trace entry whose `agent` is 'superego' as one of:
 *   - parse failure: `feedback` contains 'Unable to parse'
 *   - approved:      `approved` is truthy
 *   - rejected:      everything else
 * It prints overall totals, a per-cell table, and the parse-failure rate next
 * to the mean `overall_score` for each cell.
 *
 * Dialogue files that are missing or unreadable are skipped (best effort) and
 * reported on stderr, so stdout keeps the same report layout.
 */

const DB_PATH = 'data/evaluations.db';
const DIALOGUE_DIR = 'logs/tutor-dialogues';
const DEFAULT_RUN_ID = 'eval-2026-02-17-25aaae85';
const PARSE_FAILURE_MARKER = 'Unable to parse';

/**
 * Map a profile name onto its report cell.
 * Names containing '66' or '67' get their own cell; anything else (including
 * a NULL profile name, which the query does not exclude) falls into '68_adv'.
 *
 * @param {string|null|undefined} profileName
 * @returns {string} '66_desc', '67_presc' or '68_adv'
 */
function cellForProfile(profileName) {
  const name = profileName == null ? '' : String(profileName);
  if (name.includes('66')) return '66_desc';
  if (name.includes('67')) return '67_presc';
  return '68_adv';
}

/**
 * Format part/whole as a percentage with one decimal.
 * Returns 'n/a' when there is nothing to divide by, instead of 'NaN%'.
 *
 * @param {number} part
 * @param {number} whole
 * @returns {string}
 */
function formatPercent(part, whole) {
  if (!(whole > 0)) return 'n/a';
  return (part / whole * 100).toFixed(1) + '%';
}

/**
 * Classify a single superego trace entry.
 *
 * @param {object} trace
 * @returns {'parseFail'|'approved'|'rejected'} counter name to increment
 */
function classifyReview(trace) {
  // Only string feedback can carry the marker; a non-string value must not
  // throw halfway through a dialogue and leave the counters inconsistent.
  if (typeof trace.feedback === 'string' && trace.feedback.includes(PARSE_FAILURE_MARKER)) {
    return 'parseFail';
  }
  return trace.approved ? 'approved' : 'rejected';
}

/**
 * Fetch the scored dialogue rows of a run.
 * All rows are loaded eagerly, so the connection is closed before returning,
 * including when the query throws.
 *
 * @param {string} id run identifier
 * @returns {Array<{dialogue_id: string, profile_name: string, overall_score: number}>}
 */
function loadRows(id) {
  // fileMustExist: fail with a clear error rather than creating an empty
  // database file and then tripping over the missing table.
  const db = new Database(DB_PATH, { fileMustExist: true });
  try {
    return db.prepare(
      'SELECT dialogue_id, profile_name, overall_score FROM evaluation_results WHERE run_id = ? AND dialogue_id IS NOT NULL AND overall_score IS NOT NULL'
    ).all(id);
  } finally {
    db.close();
  }
}

/**
 * Read one dialogue log and return its superego trace entries.
 *
 * @param {string} dialogueId
 * @returns {object[]} possibly empty list of superego entries
 * @throws if the file cannot be read or is not valid JSON
 */
function loadSuperegoTraces(dialogueId) {
  const raw = fs.readFileSync(`${DIALOGUE_DIR}/${dialogueId}.json`, 'utf8');
  const parsed = JSON.parse(raw);
  const trace = parsed && Array.isArray(parsed.dialogueTrace) ? parsed.dialogueTrace : [];
  return trace.filter(entry => entry && entry.agent === 'superego');
}

const runId = process.argv[2] || DEFAULT_RUN_ID;
const rows = loadRows(runId);

let totalReviews = 0;
let parseFailures = 0;
let approved = 0;
let rejected = 0;
let missingFiles = 0;
let unreadableFiles = 0;
const byCell = {};

for (const row of rows) {
  const cell = cellForProfile(row.profile_name);
  if (!byCell[cell]) {
    byCell[cell] = { total: 0, parseFail: 0, approved: 0, rejected: 0, dialogues: 0, scoreSum: 0 };
  }
  const stats = byCell[cell];
  // Every scored dialogue counts towards the cell and its mean score, even
  // when its trace file turns out to be unavailable.
  stats.dialogues++;
  stats.scoreSum += row.overall_score;

  let traces;
  try {
    traces = loadSuperegoTraces(row.dialogue_id);
  } catch (err) {
    if (err && err.code === 'ENOENT') {
      missingFiles++;
    } else {
      // Corrupt JSON, permission problems, etc. are not "missing" files:
      // say so instead of hiding them.
      unreadableFiles++;
      const reason = err && err.message ? err.message : String(err);
      console.error(`Skipping dialogue ${row.dialogue_id}: ${reason}`);
    }
    continue;
  }

  for (const trace of traces) {
    // Classify first, then bump the counters, so the totals always add up.
    const outcome = classifyReview(trace);
    totalReviews++;
    stats.total++;
    stats[outcome]++;
    if (outcome === 'parseFail') {
      parseFailures++;
    } else if (outcome === 'approved') {
      approved++;
    } else {
      rejected++;
    }
  }
}

if (missingFiles > 0 || unreadableFiles > 0) {
  console.error(
    `Warning: ${missingFiles} dialogue file(s) missing, ${unreadableFiles} unreadable; their reviews are not counted.`
  );
}

console.log('=== Superego Parse Failures: ' + runId + ' ===\n');
console.log('Total superego reviews:', totalReviews);
console.log('Parse failures (auto-approve):', parseFailures, '(' + formatPercent(parseFailures, totalReviews) + ')');
console.log('Genuine approvals:', approved, '(' + formatPercent(approved, totalReviews) + ')');
console.log('Rejections:', rejected, '(' + formatPercent(rejected, totalReviews) + ')');

// Plain string sort of the cell names; both sections below share the order.
const cellNames = Object.keys(byCell).sort();

console.log('\n--- By cell ---\n');
console.log('Cell'.padEnd(12) + '| Dialogues | Reviews | Parse Fail | Approved | Rejected | Fail Rate');
console.log('-'.repeat(80));
for (const cell of cellNames) {
  const d = byCell[cell];
  console.log(
    cell.padEnd(12) + '| ' +
    String(d.dialogues).padEnd(10) + '| ' +
    String(d.total).padEnd(8) + '| ' +
    String(d.parseFail).padEnd(11) + '| ' +
    String(d.approved).padEnd(9) + '| ' +
    String(d.rejected).padEnd(9) + '| ' +
    formatPercent(d.parseFail, d.total)
  );
}

// Correlation: parse failure rate vs score
console.log('\n--- Parse failure rate vs mean score ---\n');
for (const cell of cellNames) {
  const d = byCell[cell];
  // A cell only exists once it has at least one dialogue, so this never
  // divides by zero.
  const meanScore = d.scoreSum / d.dialogues;
  console.log(cell + ': fail_rate=' + formatPercent(d.parseFail, d.total) + ', mean_score=' + meanScore.toFixed(1));
}
|