@machinespirits/eval 0.1.2 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +161 -0
- package/config/eval-settings.yaml +18 -0
- package/config/evaluation-rubric-learner.yaml +277 -0
- package/config/evaluation-rubric.yaml +613 -0
- package/config/interaction-eval-scenarios.yaml +93 -50
- package/config/learner-agents.yaml +124 -193
- package/config/machinespirits-eval.code-workspace +11 -0
- package/config/providers.yaml +60 -0
- package/config/suggestion-scenarios.yaml +1399 -0
- package/config/tutor-agents.yaml +716 -0
- package/docs/EVALUATION-VARIABLES.md +589 -0
- package/docs/REPLICATION-PLAN.md +577 -0
- package/index.js +15 -6
- package/package.json +16 -22
- package/routes/evalRoutes.js +88 -36
- package/scripts/analyze-judge-reliability.js +401 -0
- package/scripts/analyze-run.js +97 -0
- package/scripts/analyze-run.mjs +282 -0
- package/scripts/analyze-validation-failures.js +141 -0
- package/scripts/check-run.mjs +17 -0
- package/scripts/code-impasse-strategies.js +1132 -0
- package/scripts/compare-runs.js +44 -0
- package/scripts/compare-suggestions.js +80 -0
- package/scripts/compare-transformation.js +116 -0
- package/scripts/dig-into-run.js +158 -0
- package/scripts/eval-cli.js +2626 -0
- package/scripts/generate-paper-figures.py +452 -0
- package/scripts/qualitative-analysis-ai.js +1313 -0
- package/scripts/qualitative-analysis.js +688 -0
- package/scripts/seed-db.js +87 -0
- package/scripts/show-failed-suggestions.js +64 -0
- package/scripts/validate-content.js +192 -0
- package/server.js +3 -2
- package/services/__tests__/evalConfigLoader.test.js +338 -0
- package/services/anovaStats.js +499 -0
- package/services/contentResolver.js +407 -0
- package/services/dialogueTraceAnalyzer.js +454 -0
- package/services/evalConfigLoader.js +625 -0
- package/services/evaluationRunner.js +2171 -270
- package/services/evaluationStore.js +564 -29
- package/services/learnerConfigLoader.js +75 -5
- package/services/learnerRubricEvaluator.js +284 -0
- package/services/learnerTutorInteractionEngine.js +375 -0
- package/services/processUtils.js +18 -0
- package/services/progressLogger.js +98 -0
- package/services/promptRecommendationService.js +31 -26
- package/services/promptRewriter.js +427 -0
- package/services/rubricEvaluator.js +543 -70
- package/services/streamingReporter.js +104 -0
- package/services/turnComparisonAnalyzer.js +494 -0
- package/components/MobileEvalDashboard.tsx +0 -267
- package/components/comparison/DeltaAnalysisTable.tsx +0 -137
- package/components/comparison/ProfileComparisonCard.tsx +0 -176
- package/components/comparison/RecognitionABMode.tsx +0 -385
- package/components/comparison/RecognitionMetricsPanel.tsx +0 -135
- package/components/comparison/WinnerIndicator.tsx +0 -64
- package/components/comparison/index.ts +0 -5
- package/components/mobile/BottomSheet.tsx +0 -233
- package/components/mobile/DimensionBreakdown.tsx +0 -210
- package/components/mobile/DocsView.tsx +0 -363
- package/components/mobile/LogsView.tsx +0 -481
- package/components/mobile/PsychodynamicQuadrant.tsx +0 -261
- package/components/mobile/QuickTestView.tsx +0 -1098
- package/components/mobile/RecognitionTypeChart.tsx +0 -124
- package/components/mobile/RecognitionView.tsx +0 -809
- package/components/mobile/RunDetailView.tsx +0 -261
- package/components/mobile/RunHistoryView.tsx +0 -367
- package/components/mobile/ScoreRadial.tsx +0 -211
- package/components/mobile/StreamingLogPanel.tsx +0 -230
- package/components/mobile/SynthesisStrategyChart.tsx +0 -140
- package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +0 -52
- package/docs/research/ABLATION-MODEL-SELECTION.md +0 -53
- package/docs/research/ADVANCED-EVAL-ANALYSIS.md +0 -60
- package/docs/research/ANOVA-RESULTS-2026-01-14.md +0 -257
- package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +0 -586
- package/docs/research/COST-ANALYSIS.md +0 -56
- package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +0 -340
- package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +0 -291
- package/docs/research/EVAL-SYSTEM-ANALYSIS.md +0 -306
- package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +0 -301
- package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +0 -1988
- package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +0 -282
- package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +0 -147
- package/docs/research/PAPER-EXTENSION-DYADIC.md +0 -204
- package/docs/research/PAPER-UNIFIED.md +0 -659
- package/docs/research/PAPER-UNIFIED.pdf +0 -0
- package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +0 -356
- package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +0 -419
- package/docs/research/apa.csl +0 -2133
- package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +0 -1637
- package/docs/research/archive/paper-multiagent-tutor.tex +0 -978
- package/docs/research/paper-draft/full-paper.md +0 -136
- package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
- package/docs/research/paper-draft/references.bib +0 -515
- package/docs/research/transcript-baseline.md +0 -139
- package/docs/research/transcript-recognition-multiagent.md +0 -187
- package/hooks/useEvalData.ts +0 -625
- package/server-init.js +0 -45
- package/services/benchmarkService.js +0 -1892
- package/types.ts +0 -165
- package/utils/haptics.ts +0 -45
|
@@ -0,0 +1,401 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* Inter-Judge Reliability Analysis
|
|
4
|
+
*
|
|
5
|
+
* Calculates agreement metrics between AI judges that scored the SAME responses.
|
|
6
|
+
*
|
|
7
|
+
* IMPORTANT: This requires paired data where identical responses were scored by
|
|
8
|
+
* multiple judges. Generate this by rejudging an existing run:
|
|
9
|
+
*
|
|
10
|
+
* node scripts/eval-cli.js rejudge <runId> --judge openrouter/anthropic/claude-sonnet-4.5
|
|
11
|
+
* node scripts/eval-cli.js rejudge <runId> --judge openrouter/moonshotai/kimi-k2.5
|
|
12
|
+
*
|
|
13
|
+
* The script matches responses by a 32-bit string hash of their `suggestions` content to find
|
|
14
|
+
* cases where the exact same tutor output was scored by different judges.
|
|
15
|
+
*
|
|
16
|
+
* Reports:
|
|
17
|
+
* - Pearson correlation (linear agreement)
|
|
18
|
+
* - Spearman rank correlation (ordinal agreement)
|
|
19
|
+
* - Mean absolute difference (calibration)
|
|
20
|
+
* - Per-dimension agreement
|
|
21
|
+
*
|
|
22
|
+
* Usage:
|
|
23
|
+
* node scripts/analyze-judge-reliability.js # All data
|
|
24
|
+
* node scripts/analyze-judge-reliability.js --run <runId> # Specific run
|
|
25
|
+
* node scripts/analyze-judge-reliability.js --verbose # Show disagreements
|
|
26
|
+
*/
|
|
27
|
+
|
|
28
|
+
import Database from 'better-sqlite3';
|
|
29
|
+
import path from 'path';
|
|
30
|
+
import { fileURLToPath } from 'url';
|
|
31
|
+
|
|
32
|
+
// ES modules have no built-in __filename/__dirname, so derive them from
// import.meta.url and locate the database relative to this script.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
const DB_PATH = path.join(__dirname, '..', 'data', 'evaluations.db');

// Parse CLI args
const args = process.argv.slice(2);

/**
 * Returns the value that follows `--<name>` on the command line.
 *
 * @param {string} name - Option name without the leading dashes.
 * @returns {string|null} The option value, or null when the option is absent
 *   or has no value.
 */
const getOption = (name) => {
  const idx = args.indexOf(`--${name}`);
  if (idx === -1) return null;

  const value = args[idx + 1];
  // A following token that is itself a flag (e.g. `--run --verbose`) means the
  // value was omitted; do not mistake the flag for the value.
  if (!value || value.startsWith('--')) return null;
  return value;
};

/**
 * Reports whether the boolean flag `--<name>` was passed.
 *
 * @param {string} name - Flag name without the leading dashes.
 * @returns {boolean}
 */
const hasFlag = (name) => args.includes(`--${name}`);

const runIdFilter = getOption('run');
const verbose = hasFlag('verbose');
|
|
46
|
+
|
|
47
|
+
// Statistics helpers
|
|
48
|
+
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param {number[]} arr - Values to average.
 * @returns {number} The average, or 0 when the list is empty.
 */
function mean(arr) {
  const count = arr.length;
  if (!count) return 0;

  let total = 0;
  arr.forEach((value) => {
    total += value;
  });
  return total / count;
}
|
|
52
|
+
|
|
53
|
+
/**
 * Sample standard deviation (sum of squared deviations divided by n - 1).
 *
 * @param {number[]} arr - Values to measure.
 * @returns {number} The standard deviation, or 0 when fewer than two values
 *   are supplied.
 */
function std(arr) {
  const count = arr.length;
  if (count < 2) return 0;

  const center = mean(arr);
  let squaredDeviations = 0;
  arr.forEach((value) => {
    squaredDeviations += (value - center) ** 2;
  });
  return Math.sqrt(squaredDeviations / (count - 1));
}
|
|
59
|
+
|
|
60
|
+
/**
 * Pearson product-moment correlation between two equally long series.
 *
 * @param {number[]} x - First series.
 * @param {number[]} y - Second series, paired with `x` by index.
 * @returns {number|null} The correlation coefficient, or null when the
 *   lengths differ, fewer than three pairs are given, or either series has
 *   zero standard deviation.
 */
function pearsonCorrelation(x, y) {
  const n = x.length;
  if (n !== y.length || n < 3) return null;

  const meanX = mean(x);
  const meanY = mean(y);
  const sdX = std(x);
  const sdY = std(y);
  // A constant series has no variance, so the coefficient is undefined.
  if (sdX === 0 || sdY === 0) return null;

  let crossProduct = 0;
  for (const [i, xi] of x.entries()) {
    crossProduct += (xi - meanX) * (y[i] - meanY);
  }
  return crossProduct / ((n - 1) * sdX * sdY);
}
|
|
74
|
+
|
|
75
|
+
/**
 * Spearman rank correlation between two equally long series.
 *
 * Values are converted to ranks and the Pearson correlation of the ranks is
 * returned. Tied values share the average of the ranks they occupy, so the
 * result does not depend on the input order of equal scores, and a constant
 * series yields null (no rank variance) instead of a spurious coefficient.
 *
 * @param {number[]} x - First series.
 * @param {number[]} y - Second series, paired with `x` by index.
 * @returns {number|null} The rank correlation, or null when the lengths
 *   differ, fewer than three pairs are given, or it is undefined.
 */
function spearmanCorrelation(x, y) {
  if (x.length !== y.length || x.length < 3) return null;

  // Convert to 1-based ranks, averaging the ranks of tied values
  const rankify = (values) => {
    const order = values.map((v, i) => ({ v, i })).sort((a, b) => a.v - b.v);
    const ranks = new Array(values.length);

    let start = 0;
    while (start < order.length) {
      // Extend [start, end] over the run of values equal to order[start].v
      let end = start;
      while (end + 1 < order.length && order[end + 1].v === order[start].v) {
        end++;
      }

      // Sorted positions start..end hold ranks start+1..end+1; share their mean
      const sharedRank = (start + end) / 2 + 1;
      for (let k = start; k <= end; k++) {
        ranks[order[k].i] = sharedRank;
      }
      start = end + 1;
    }
    return ranks;
  };

  return pearsonCorrelation(rankify(x), rankify(y));
}
|
|
92
|
+
|
|
93
|
+
/**
 * Mean of the absolute element-wise differences between two series.
 *
 * @param {number[]} x - First series.
 * @param {number[]} y - Second series, paired with `x` by index.
 * @returns {number|null} The mean absolute difference, or null when the
 *   series are empty or differ in length.
 */
function meanAbsoluteDifference(x, y) {
  const n = x.length;
  if (n === 0 || n !== y.length) return null;

  let total = 0;
  for (const [i, xi] of x.entries()) {
    total += Math.abs(xi - y[i]);
  }
  return total / n;
}
|
|
101
|
+
|
|
102
|
+
/**
 * Cronbach's alpha for a set of raters scoring the same subjects:
 *
 *   alpha = (k / (k - 1)) * (1 - sum(item variances) / variance(totals))
 *
 * where k is the number of raters and the totals are the per-subject sums
 * across raters. Variances are sample variances (divided by n - 1).
 *
 * @param {number[][]} items - One inner array of scores per rater; all inner
 *   arrays must have the same length (one score per subject).
 * @returns {number|null} Alpha, or null when there are fewer than two raters
 *   or subjects, the raters' arrays differ in length, or the totals have no
 *   variance.
 */
function cronbachAlpha(items) {
  if (items.length < 2 || items[0].length < 2) return null;

  const k = items.length;
  const n = items[0].length;

  // Ragged input cannot be paired subject-by-subject; the arithmetic below
  // would silently yield NaN or a misleading value.
  if (items.some(item => item.length !== n)) return null;

  // Sample variance of n values
  const sampleVariance = (values) => {
    const m = mean(values);
    return values.reduce((s, v) => s + (v - m) ** 2, 0) / (n - 1);
  };

  // Total score per subject, summed across raters
  const totals = [];
  for (let i = 0; i < n; i++) {
    totals.push(items.reduce((s, item) => s + item[i], 0));
  }

  const totalVariance = sampleVariance(totals);
  if (totalVariance === 0) return null;

  const sumItemVariances = items
    .map(sampleVariance)
    .reduce((s, v) => s + v, 0);

  return (k / (k - 1)) * (1 - sumItemVariances / totalVariance);
}
|
|
131
|
+
|
|
132
|
+
// Simple hash for grouping identical responses
|
|
133
|
+
/**
 * Rolling 31-multiplier string hash over UTF-16 code units, wrapped to a
 * signed 32-bit integer at every step.
 *
 * Not collision-free: distinct strings can share a hash (for example "Aa"
 * and "BB").
 *
 * @param {string} str - Text to hash.
 * @returns {string|null} The hash in base 16 (with a leading "-" when
 *   negative), or null for an empty or missing input.
 */
function simpleHash(str) {
  if (!str) return null;

  let acc = 0;
  let pos = 0;
  while (pos < str.length) {
    // acc * 31 stays far below 2^53, so `| 0` wraps it exactly to int32
    acc = (acc * 31 + str.charCodeAt(pos)) | 0;
    pos += 1;
  }
  return acc.toString(16);
}
|
|
143
|
+
|
|
144
|
+
// Main analysis
|
|
145
|
+
/**
 * Runs the inter-judge reliability analysis and prints the report to stdout.
 *
 * Opens the SQLite database at DB_PATH read-only, loads every scored row of
 * `evaluation_results` (restricted to `runIdFilter` when it is set), groups
 * rows whose `suggestions` content hashes to the same value, and for every
 * pair of judges that scored the same response prints Pearson r, Spearman ρ,
 * mean absolute difference and per-dimension correlations, followed by a
 * summary pooled over all judge pairs.
 *
 * @returns {void}
 */
function analyzeJudgeReliability() {
  const db = new Database(DB_PATH, { readonly: true });

  // try/finally releases the handle on early return and on any thrown error
  try {
    console.log('Inter-Judge Reliability Analysis');
    console.log('='.repeat(60));
    console.log('');

    // Find all judge models
    const judges = db.prepare(`
      SELECT DISTINCT judge_model
      FROM evaluation_results
      WHERE judge_model IS NOT NULL
    `).all().map(r => r.judge_model);

    console.log(`Judges found: ${judges.join(', ')}`);
    console.log('');

    const responseGroups = groupByResponseContent(loadJudgments(db, runIdFilter));
    const { judgePairs, responsesWithMultipleJudges } = buildJudgePairs(responseGroups);

    // A response scored by two or more judges always yields at least one
    // judge pair, so this single check covers "nothing to compare".
    if (responsesWithMultipleJudges === 0) {
      printNoPairedDataHelp();
      return;
    }

    console.log(`Responses with multiple judges: ${responsesWithMultipleJudges}`);
    console.log('');
    console.log(`Found ${judgePairs.size} judge pair combinations`);
    console.log('');

    // Analyze each pair, pooling scores for the overall summary
    const overallScores1 = [];
    const overallScores2 = [];
    const allDisagreements = [];

    for (const pairs of judgePairs.values()) {
      for (const p of pairs) {
        overallScores1.push(p.score1);
        overallScores2.push(p.score2);
      }
      for (const d of reportJudgePair(pairs, verbose)) {
        allDisagreements.push(d);
      }
    }

    printOverallSummary(overallScores1, overallScores2, allDisagreements);
  } finally {
    db.close();
  }
}

/** Formats a nullable statistic with a fixed number of decimals, or 'N/A'. */
function formatStat(value, digits) {
  return value !== null ? value.toFixed(digits) : 'N/A';
}

/**
 * Loads all rows that have a judge, an overall score and a response body.
 * The optional run id is passed as a bound parameter rather than being
 * spliced into the SQL text.
 */
function loadJudgments(db, runId) {
  const conditions = [
    'judge_model IS NOT NULL',
    'overall_score IS NOT NULL',
    'suggestions IS NOT NULL',
  ];
  const params = [];
  if (runId) {
    conditions.push('run_id = ?');
    params.push(runId);
  }

  return db.prepare(`
    SELECT
      run_id,
      scenario_id,
      profile_name,
      judge_model,
      overall_score,
      score_relevance,
      score_specificity,
      score_pedagogical,
      score_personalization,
      score_actionability,
      score_tone,
      suggestions
    FROM evaluation_results
    WHERE ${conditions.join(' AND ')}
    ORDER BY suggestions, judge_model
  `).all(...params);
}

/**
 * Groups rows by the hash of their `suggestions` content (not by
 * scenario/profile), so only judgments of the same response are compared.
 */
function groupByResponseContent(rows) {
  const responseGroups = new Map();
  for (const r of rows) {
    const contentHash = simpleHash(r.suggestions);
    if (!contentHash) continue;

    if (!responseGroups.has(contentHash)) {
      responseGroups.set(contentHash, []);
    }
    responseGroups.get(contentHash).push(r);
  }
  return responseGroups;
}

/**
 * Builds, for every unordered pair of judges, the list of responses both of
 * them scored. Also counts the responses scored by more than one judge.
 */
function buildJudgePairs(responseGroups) {
  const judgePairs = new Map(); // "judgeA|judgeB" -> [{score1, score2, ...}]
  let responsesWithMultipleJudges = 0;

  for (const group of responseGroups.values()) {
    // One row per judge; if a judge scored the response more than once, the
    // last row read wins.
    const rowByJudge = new Map(group.map(r => [r.judge_model, r]));
    if (rowByJudge.size < 2) continue;
    responsesWithMultipleJudges++;

    // Sorting makes the pair key independent of row order
    const judgeList = Array.from(rowByJudge.keys()).sort();

    for (let i = 0; i < judgeList.length; i++) {
      for (let j = i + 1; j < judgeList.length; j++) {
        const pairKey = `${judgeList[i]}|${judgeList[j]}`;
        if (!judgePairs.has(pairKey)) {
          judgePairs.set(pairKey, []);
        }

        const s1 = rowByJudge.get(judgeList[i]);
        const s2 = rowByJudge.get(judgeList[j]);

        judgePairs.get(pairKey).push({
          judge1: judgeList[i],
          judge2: judgeList[j],
          score1: s1.overall_score,
          score2: s2.overall_score,
          diff: Math.abs(s1.overall_score - s2.overall_score),
          dimensions: {
            relevance: [s1.score_relevance, s2.score_relevance],
            specificity: [s1.score_specificity, s2.score_specificity],
            pedagogical: [s1.score_pedagogical, s2.score_pedagogical],
            personalization: [s1.score_personalization, s2.score_personalization],
            actionability: [s1.score_actionability, s2.score_actionability],
            tone: [s1.score_tone, s2.score_tone]
          },
          scenario: s1.scenario_id,
          profile: s1.profile_name
        });
      }
    }
  }

  return { judgePairs, responsesWithMultipleJudges };
}

/** Prints instructions for generating paired data with the rejudge command. */
function printNoPairedDataHelp() {
  console.log('⚠️ No paired judgments found!');
  console.log('');
  console.log('To analyze inter-judge reliability, you need the SAME response');
  console.log('scored by multiple judges. Generate this data by rejudging a run:');
  console.log('');
  console.log(' # First, pick a completed run:');
  console.log(' node scripts/eval-cli.js list');
  console.log('');
  console.log(' # Then rejudge with different models:');
  console.log(' node scripts/eval-cli.js rejudge <runId> --judge openrouter/anthropic/claude-sonnet-4.5');
  console.log(' node scripts/eval-cli.js rejudge <runId> --judge openrouter/moonshotai/kimi-k2.5');
  console.log('');
  console.log(' # Then run this analysis again');
  console.log('');
}

/**
 * Prints the agreement statistics for one judge pair and returns the
 * responses on which the two judges differ by more than 20 points.
 */
function reportJudgePair(pairs, showDetails) {
  // Judge names come from the records themselves rather than from splitting
  // the map key, so a '|' inside a model name cannot corrupt them.
  const { judge1, judge2 } = pairs[0];
  const n = pairs.length;

  const scores1 = pairs.map(p => p.score1);
  const scores2 = pairs.map(p => p.score2);

  const pearson = pearsonCorrelation(scores1, scores2);
  const spearman = spearmanCorrelation(scores1, scores2);
  const mad = meanAbsoluteDifference(scores1, scores2);

  console.log(`\n${judge1.split('/').pop()} vs ${judge2.split('/').pop()}`);
  console.log('-'.repeat(50));
  console.log(` Paired responses: ${n}`);
  console.log(` Pearson r: ${formatStat(pearson, 3)}`);
  console.log(` Spearman ρ: ${formatStat(spearman, 3)}`);
  console.log(` Mean Abs Diff: ${formatStat(mad, 2)} pts`);
  console.log(` Mean scores: ${mean(scores1).toFixed(1)} vs ${mean(scores2).toFixed(1)}`);

  // Per-dimension analysis
  const dimensions = ['relevance', 'specificity', 'pedagogical', 'personalization', 'actionability', 'tone'];
  console.log('\n Per-dimension correlations:');

  for (const dim of dimensions) {
    // Drop a response when EITHER judge lacks this dimension; filtering the
    // two sides separately would shift scores out of alignment.
    const complete = pairs
      .map(p => p.dimensions[dim])
      .filter(([a, b]) => a != null && b != null);

    if (complete.length >= 3) {
      const r = pearsonCorrelation(
        complete.map(([a]) => a),
        complete.map(([, b]) => b)
      );
      console.log(` ${dim.padEnd(16)} r = ${formatStat(r, 3)}`);
    }
  }

  // Identify major disagreements (diff > 20)
  const bigDisagreements = pairs.filter(p => p.diff > 20);
  if (bigDisagreements.length > 0) {
    console.log(`\n Major disagreements (diff > 20): ${bigDisagreements.length}`);

    if (showDetails) {
      for (const d of bigDisagreements.slice(0, 5)) {
        console.log(` ${d.scenario} / ${d.profile}: ${d.score1} vs ${d.score2} (Δ${d.diff.toFixed(0)})`);
      }
    }
  }

  return bigDisagreements;
}

/** Prints the statistics pooled over all judge pairs plus an interpretation. */
function printOverallSummary(overallScores1, overallScores2, allDisagreements) {
  console.log('\n' + '='.repeat(60));
  console.log('OVERALL RELIABILITY SUMMARY');
  console.log('='.repeat(60));

  const totalPairs = overallScores1.length;
  const overallPearson = pearsonCorrelation(overallScores1, overallScores2);
  const overallSpearman = spearmanCorrelation(overallScores1, overallScores2);
  const overallMAD = meanAbsoluteDifference(overallScores1, overallScores2);

  console.log(`\nTotal paired judgments: ${totalPairs}`);
  console.log(`Overall Pearson r: ${formatStat(overallPearson, 3)}`);
  console.log(`Overall Spearman ρ: ${formatStat(overallSpearman, 3)}`);
  console.log(`Overall Mean Abs Diff: ${formatStat(overallMAD, 2)} pts`);

  // Interpretation
  console.log('\nInterpretation:');
  if (overallPearson !== null) {
    if (overallPearson >= 0.8) {
      console.log(' ✓ Excellent agreement (r ≥ 0.8)');
    } else if (overallPearson >= 0.6) {
      console.log(' ○ Good agreement (0.6 ≤ r < 0.8)');
    } else if (overallPearson >= 0.4) {
      console.log(' △ Moderate agreement (0.4 ≤ r < 0.6)');
    } else {
      console.log(' ✗ Poor agreement (r < 0.4)');
    }
  }

  if (overallMAD !== null) {
    console.log(` Average score difference: ${overallMAD.toFixed(1)} points on 100-point scale`);
  }

  if (allDisagreements.length > 0) {
    console.log(` ${allDisagreements.length} major disagreements (>20 pts) found`);
  }

  console.log('');
}
|
|
394
|
+
|
|
395
|
+
// Run
|
|
396
|
+
// Script entry point: run the analysis; on any thrown error print its message
// to stderr and exit with status 1.
try {
  analyzeJudgeReliability();
} catch (failure) {
  const reason = failure.message;
  console.error('Error:', reason);
  process.exit(1);
}
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import fs from 'fs';
|
|
3
|
+
|
|
4
|
+
// Summarizes one evaluation run from its JSONL progress log: average score
// per profile, main-effect deltas for the three factors, and average score
// per scenario.

// Run to analyze: first CLI argument, otherwise a hard-coded default run id.
const runId = process.argv[2] || 'eval-2026-02-03-c8d32121';
const logPath = `./logs/eval-progress/${runId}.jsonl`;

if (!fs.existsSync(logPath)) {
  console.error('Log file not found:', logPath);
  process.exit(1);
}

/** Arithmetic mean; every caller passes a non-empty array. */
const average = (values) => values.reduce((s, v) => s + v, 0) / values.length;

const lines = fs.readFileSync(logPath, 'utf8').split('\n').filter(l => l.trim());

// Parse one JSON document per line. A malformed entry (for example a
// truncated final line) is reported and skipped instead of aborting the
// whole analysis.
const events = [];
lines.forEach((line, index) => {
  try {
    events.push(JSON.parse(line));
  } catch (err) {
    console.error(`Skipping malformed log entry ${index + 1}: ${err.message}`);
  }
});

// Filter to successful test_complete events
const successful = events.filter(e =>
  e !== null &&
  typeof e === 'object' &&
  e.eventType === 'test_complete' &&
  e.success === true &&
  e.overallScore != null
);

console.log('Run:', runId);
console.log('Total successful results:', successful.length);
console.log('');

// Group by profile
const byProfile = {};
for (const r of successful) {
  const profile = r.profileName;
  if (!byProfile[profile]) byProfile[profile] = [];
  byProfile[profile].push(r.overallScore);
}

// Compute each average once, then list profiles from best to worst
const profileSummaries = Object.entries(byProfile)
  .map(([profile, scores]) => ({ profile, avg: average(scores), n: scores.length }))
  .sort((a, b) => b.avg - a.avg);

console.log('By Profile (avg score):');
for (const { profile, avg, n } of profileSummaries) {
  console.log(` ${profile}: ${avg.toFixed(1)} (n=${n})`);
}

// Factor analysis
const factors = {
  'Factor A (recognition)': { on: [], off: [] },
  'Factor B (tutor arch)': { multi: [], single: [] },
  'Factor C (learner arch)': { psycho: [], unified: [] }
};

for (const r of successful) {
  const profile = r.profileName;
  const score = r.overallScore;

  // Factor levels are inferred from substrings of the profile name; a result
  // without a string profile name cannot be classified, so leave it out
  // rather than crash on `.includes`.
  if (typeof profile !== 'string') continue;

  // Factor A: Recognition (cells 5-8 = on, cells 1-4 = off)
  if (profile.includes('recog')) factors['Factor A (recognition)'].on.push(score);
  else factors['Factor A (recognition)'].off.push(score);

  // Factor B: Tutor arch (cells 3,4,7,8 = multi, cells 1,2,5,6 = single)
  if (profile.includes('multi')) factors['Factor B (tutor arch)'].multi.push(score);
  else factors['Factor B (tutor arch)'].single.push(score);

  // Factor C: Learner arch (cells 2,4,6,8 = psycho, cells 1,3,5,7 = unified)
  if (profile.includes('psycho')) factors['Factor C (learner arch)'].psycho.push(score);
  else factors['Factor C (learner arch)'].unified.push(score);
}

console.log('');
console.log('Factor Analysis:');
for (const [factor, levels] of Object.entries(factors)) {
  const [level1, level2] = Object.keys(levels);
  const n1 = levels[level1].length;
  const n2 = levels[level2].length;
  // A main effect needs data on both levels
  if (n1 === 0 || n2 === 0) continue;

  const avg1 = average(levels[level1]);
  const avg2 = average(levels[level2]);
  const delta = avg1 - avg2;
  console.log(` ${factor}:`);
  console.log(` ${level1}: ${avg1.toFixed(1)} (n=${n1})`);
  console.log(` ${level2}: ${avg2.toFixed(1)} (n=${n2})`);
  console.log(` Delta: ${delta > 0 ? '+' : ''}${delta.toFixed(1)}`);
}

// Group by scenario
console.log('');
console.log('By Scenario:');
const byScenario = {};
for (const r of successful) {
  const scenario = r.scenarioId;
  if (!byScenario[scenario]) byScenario[scenario] = [];
  byScenario[scenario].push({ profile: r.profileName, score: r.overallScore });
}
for (const [scenario, data] of Object.entries(byScenario)) {
  const avg = average(data.map(d => d.score));
  console.log(` ${scenario}: avg=${avg.toFixed(1)} (n=${data.length})`);
}
|