@machinespirits/eval 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +91 -9
- package/config/eval-settings.yaml +3 -3
- package/config/paper-manifest.json +486 -0
- package/config/providers.yaml +9 -6
- package/config/tutor-agents.yaml +2261 -0
- package/content/README.md +23 -0
- package/content/courses/479/course.md +53 -0
- package/content/courses/479/lecture-1.md +361 -0
- package/content/courses/479/lecture-2.md +360 -0
- package/content/courses/479/lecture-3.md +655 -0
- package/content/courses/479/lecture-4.md +530 -0
- package/content/courses/479/lecture-5.md +326 -0
- package/content/courses/479/lecture-6.md +346 -0
- package/content/courses/479/lecture-7.md +326 -0
- package/content/courses/479/lecture-8.md +273 -0
- package/content/courses/479/roadmap-slides.md +656 -0
- package/content/manifest.yaml +8 -0
- package/docs/research/build.sh +44 -20
- package/docs/research/figures/figure10.png +0 -0
- package/docs/research/figures/figure11.png +0 -0
- package/docs/research/figures/figure3.png +0 -0
- package/docs/research/figures/figure4.png +0 -0
- package/docs/research/figures/figure5.png +0 -0
- package/docs/research/figures/figure6.png +0 -0
- package/docs/research/figures/figure7.png +0 -0
- package/docs/research/figures/figure8.png +0 -0
- package/docs/research/figures/figure9.png +0 -0
- package/docs/research/header.tex +23 -2
- package/docs/research/paper-full.md +941 -285
- package/docs/research/paper-short.md +216 -585
- package/docs/research/references.bib +132 -0
- package/docs/research/slides-header.tex +188 -0
- package/docs/research/slides-pptx.md +363 -0
- package/docs/research/slides.md +531 -0
- package/docs/research/style-reference-pptx.py +199 -0
- package/package.json +6 -5
- package/scripts/analyze-eval-results.js +69 -17
- package/scripts/analyze-mechanism-traces.js +763 -0
- package/scripts/analyze-modulation-learning.js +498 -0
- package/scripts/analyze-prosthesis.js +144 -0
- package/scripts/analyze-run.js +264 -79
- package/scripts/assess-transcripts.js +853 -0
- package/scripts/browse-transcripts.js +854 -0
- package/scripts/check-parse-failures.js +73 -0
- package/scripts/code-dialectical-modulation.js +1320 -0
- package/scripts/download-data.sh +55 -0
- package/scripts/eval-cli.js +106 -18
- package/scripts/generate-paper-figures.js +663 -0
- package/scripts/generate-paper-figures.py +577 -76
- package/scripts/generate-paper-tables.js +299 -0
- package/scripts/qualitative-analysis-ai.js +3 -3
- package/scripts/render-sequence-diagram.js +694 -0
- package/scripts/test-latency.js +210 -0
- package/scripts/test-rate-limit.js +95 -0
- package/scripts/test-token-budget.js +332 -0
- package/scripts/validate-paper-manifest.js +670 -0
- package/services/__tests__/evalConfigLoader.test.js +2 -2
- package/services/__tests__/learnerRubricEvaluator.test.js +361 -0
- package/services/__tests__/learnerTutorInteractionEngine.test.js +326 -0
- package/services/evaluationRunner.js +975 -98
- package/services/evaluationStore.js +12 -4
- package/services/learnerTutorInteractionEngine.js +27 -2
- package/services/mockProvider.js +133 -0
- package/services/promptRewriter.js +1471 -5
- package/services/rubricEvaluator.js +55 -2
- package/services/transcriptFormatter.js +675 -0
- package/docs/EVALUATION-VARIABLES.md +0 -589
- package/docs/REPLICATION-PLAN.md +0 -577
- package/scripts/analyze-run.mjs +0 -282
- package/scripts/compare-runs.js +0 -44
- package/scripts/compare-suggestions.js +0 -80
- package/scripts/dig-into-run.js +0 -158
- package/scripts/show-failed-suggestions.js +0 -64
- /package/scripts/{check-run.mjs → check-run.js} +0 -0
|
@@ -0,0 +1,1320 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Dialectical Modulation Coding Script
|
|
5
|
+
*
|
|
6
|
+
* Extracts structural and LLM-coded modulation metrics from multi-turn
|
|
7
|
+
* dialectical superego dialogues. Compares across conditions (base vs
|
|
8
|
+
* recognition) × persona type (suspicious, adversary, advocate).
|
|
9
|
+
*
|
|
10
|
+
* Tier 1 (structural): Parsed directly from dialogueTrace JSON — no LLM calls.
|
|
11
|
+
* Tier 2 (LLM-coded): 4 semantic prompts per dialogue for stance reversal,
|
|
12
|
+
* cross-turn memory, hallucination correction, phase transition detection.
|
|
13
|
+
*
|
|
14
|
+
* Usage:
|
|
15
|
+
* node scripts/code-dialectical-modulation.js [options]
|
|
16
|
+
*
|
|
17
|
+
* Options:
|
|
18
|
+
* --run-id <id> Run ID (default: eval-2026-02-11-a54235ea)
|
|
19
|
+
* --model <model> Model for LLM coding (default: claude-code)
|
|
20
|
+
* --structural-only Skip LLM coding, emit only structural metrics
|
|
21
|
+
* --help Show help
|
|
22
|
+
*/
|
|
23
|
+
|
|
24
|
+
import 'dotenv/config';
|
|
25
|
+
import Database from 'better-sqlite3';
|
|
26
|
+
import fs from 'fs';
|
|
27
|
+
import path from 'path';
|
|
28
|
+
import { spawn } from 'child_process';
|
|
29
|
+
|
|
30
|
+
// ── Constants ────────────────────────────────────────────────────────────
|
|
31
|
+
|
|
32
|
+
// Run ID used when --run-id is not given (matches the default in the usage header).
const DEFAULT_RUN_ID = 'eval-2026-02-11-a54235ea';

// Short model aliases -> model identifiers.
// 'claude-code' maps to itself; callModel() routes it to the local CLI, while
// every other alias is looked up here by callOpenRouter().
const MODEL_MAP = {
  'claude-code': 'claude-code',
  haiku: 'anthropic/claude-haiku-4.5',
  sonnet: 'anthropic/claude-sonnet-4.5',
  gpt: 'openai/gpt-5.2',
};
|
|
40
|
+
|
|
41
|
+
// ── Statistical Helpers ──────────────────────────────────────────────────
|
|
42
|
+
|
|
43
|
+
/**
 * Arithmetic mean of a numeric array.
 *
 * @param {number[]} arr - Values to average.
 * @returns {number} The mean, or 0 when the array is empty.
 */
function mean(arr) {
  const count = arr.length;
  if (count === 0) return 0;

  let total = 0;
  arr.forEach(value => {
    total += value;
  });
  return total / count;
}
|
|
47
|
+
|
|
48
|
+
/**
 * Sample standard deviation (n - 1 denominator).
 *
 * @param {number[]} arr - Values to summarise.
 * @returns {number} The standard deviation, or 0 when fewer than two values are given.
 */
function std(arr) {
  const count = arr.length;
  if (count < 2) return 0;

  const center = mean(arr);
  let squaredDeviations = 0;
  arr.forEach(value => {
    squaredDeviations += (value - center) ** 2;
  });
  return Math.sqrt(squaredDeviations / (count - 1));
}
|
|
53
|
+
|
|
54
|
+
/**
 * Cohen's d effect size using the pooled sample standard deviation.
 *
 * @param {number[]} group1 - First sample.
 * @param {number[]} group2 - Second sample.
 * @returns {number} (mean1 - mean2) / pooled SD; 0 when either sample is empty
 *   or the pooled SD is not a positive number.
 */
function cohensD(group1, group2) {
  const n1 = group1.length;
  const n2 = group2.length;
  if (n1 === 0 || n2 === 0) return 0;

  const meanDiff = mean(group1) - mean(group2);
  const var1 = std(group1) ** 2;
  const var2 = std(group2) ** 2;
  const pooledSd = Math.sqrt(((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2));

  // A NaN pooled SD (e.g. two single-element samples give 0/0) also falls through to 0.
  if (pooledSd > 0) return meanDiff / pooledSd;
  return 0;
}
|
|
64
|
+
|
|
65
|
+
/**
 * Welch's unequal-variance t-test with Welch–Satterthwaite degrees of freedom.
 *
 * @param {number[]} group1 - First sample.
 * @param {number[]} group2 - Second sample.
 * @returns {{t: number, df: number, p: number}} Test statistic, degrees of
 *   freedom and two-tailed p-value. Returns t = 0, p = 1 when either sample has
 *   fewer than two values or the standard error is zero.
 */
function welchTTest(group1, group2) {
  const n1 = group1.length;
  const n2 = group2.length;
  if (n1 < 2 || n2 < 2) return { t: 0, df: 0, p: 1 };

  // Per-group variance of the mean (s^2 / n).
  const varOfMean1 = std(group1) ** 2 / n1;
  const varOfMean2 = std(group2) ** 2 / n2;

  const se = Math.sqrt(varOfMean1 + varOfMean2);
  if (se === 0) return { t: 0, df: n1 + n2 - 2, p: 1 };

  const t = (mean(group1) - mean(group2)) / se;

  const num = (varOfMean1 + varOfMean2) ** 2;
  const den = varOfMean1 ** 2 / (n1 - 1) + varOfMean2 ** 2 / (n2 - 1);
  let df = n1 + n2 - 2;
  if (den > 0) df = num / den;

  return { t, df, p: tTestPValue(Math.abs(t), df) };
}
|
|
79
|
+
|
|
80
|
+
/**
 * Approximate two-tailed p-value for a t statistic.
 *
 * @param {number} t - The t statistic (its sign is irrelevant).
 * @param {number} df - Degrees of freedom.
 * @returns {number} 1 when df <= 0; a normal approximation when df > 30;
 *   otherwise the regularized incomplete beta I_x(df/2, 1/2) with
 *   x = df / (df + t^2).
 */
function tTestPValue(t, df) {
  if (df <= 0) return 1;

  return df > 30
    ? 2 * (1 - normalCDF(Math.abs(t)))
    : regularizedBeta(df / (df + t * t), df / 2, 0.5);
}
|
|
90
|
+
|
|
91
|
+
/**
 * Regularized incomplete beta function I_x(a, b).
 *
 * Evaluated with the standard continued-fraction expansion (modified Lentz's
 * method). The fraction only converges quickly for x < (a + 1) / (a + b + 2),
 * so above that threshold the symmetry I_x(a, b) = 1 - I_{1-x}(b, a) is used.
 *
 * @param {number} x - Upper integration limit, expected in [0, 1].
 * @param {number} a - First shape parameter (> 0).
 * @param {number} b - Second shape parameter (> 0).
 * @returns {number} I_x(a, b); 0 for x <= 0 and 1 for x >= 1.
 */
function regularizedBeta(x, a, b) {
  if (x <= 0) return 0;
  if (x >= 1) return 1;

  // x^a (1-x)^b / B(a, b); this prefactor is the same for both symmetry branches.
  const lnBeta = lnGamma(a) + lnGamma(b) - lnGamma(a + b);
  const front = Math.exp(a * Math.log(x) + b * Math.log(1 - x) - lnBeta);

  // Pick the branch on which the continued fraction converges rapidly.
  const direct = x < (a + 1) / (a + b + 2);
  const cx = direct ? x : 1 - x;
  const ca = direct ? a : b;
  const cb = direct ? b : a;

  const TINY = 1e-30;
  // Lentz's method must never divide by (near) zero.
  const guard = v => (Math.abs(v) < TINY ? TINY : v);

  const qab = ca + cb;
  const qap = ca + 1;
  const qam = ca - 1;

  // The first coefficient is folded into the initial state; the loop then
  // applies the even and odd coefficients for m = 1, 2, ...
  let c = 1;
  let d = 1 / guard(1 - (qab * cx) / qap);
  let h = d;
  for (let m = 1; m <= 200; m++) {
    const m2 = 2 * m;

    const even = (m * (cb - m) * cx) / ((qam + m2) * (ca + m2));
    d = 1 / guard(1 + even * d);
    c = guard(1 + even / c);
    h *= d * c;

    const odd = (-(ca + m) * (qab + m) * cx) / ((ca + m2) * (qap + m2));
    d = 1 / guard(1 + odd * d);
    c = guard(1 + odd / c);
    const delta = d * c;
    h *= delta;

    if (Math.abs(delta - 1) < 1e-10) break;
  }

  const value = (front * h) / ca;
  return direct ? value : 1 - value;
}

/**
 * Natural log of the gamma function via the Lanczos approximation
 * (six-term series with g = 5).
 *
 * @param {number} z - Argument, expected to be > 0.
 * @returns {number} ln(Gamma(z)).
 */
function lnGamma(z) {
  const coefficients = [
    76.18009172947146, -86.50532032941677, 24.01409824083091,
    -1.231739572450155, 0.001208650973866179, -0.000005395239384953,
  ];

  let tmp = z + 5.5;
  tmp -= (z + 0.5) * Math.log(tmp);

  let ser = 1.000000000190015;
  let y = z;
  for (const coefficient of coefficients) {
    y += 1;
    ser += coefficient / y;
  }

  // 2.5066282746310005 is sqrt(2 * pi).
  return -tmp + Math.log((2.5066282746310005 * ser) / z);
}
|
|
133
|
+
|
|
134
|
+
/**
 * Standard normal cumulative distribution function.
 *
 * Uses a five-term polynomial approximation of erf evaluated at |x| / sqrt(2)
 * and mirrors the result for negative x.
 *
 * @param {number} x - z-score.
 * @returns {number} Approximate P(Z <= x).
 */
function normalCDF(x) {
  // Polynomial coefficients, highest order first, for Horner evaluation.
  const coefficients = [1.061405429, -1.453152027, 1.421413741, -0.284496736, 0.254829592];

  const scaled = Math.abs(x) / Math.SQRT2;
  const t = 1 / (1 + 0.3275911 * scaled);

  let poly = coefficients[0];
  for (let i = 1; i < coefficients.length; i++) {
    poly = poly * t + coefficients[i];
  }

  const erfOfScaled = 1 - poly * t * Math.exp(-scaled * scaled);
  return x < 0 ? 0.5 * (1 - erfOfScaled) : 0.5 * (1 + erfOfScaled);
}
|
|
143
|
+
|
|
144
|
+
/**
 * Pearson chi-square test of independence on a contingency table.
 *
 * @param {number[][]} observed - Rows of observed counts; the column count is
 *   taken from the first row.
 * @returns {{chi2: number, df: number, p: number, cramersV: number}} Statistic,
 *   degrees of freedom, p-value and Cramér's V. An all-zero table yields
 *   chi2 = 0, df = 0, p = 1, cramersV = 0.
 */
function chiSquareTest(observed) {
  const add = (acc, value) => acc + value;

  const nRows = observed.length;
  const nCols = observed[0].length;

  const rowTotals = observed.map(row => row.reduce(add, 0));
  const colTotals = [];
  for (let col = 0; col < nCols; col++) {
    colTotals.push(observed.reduce((acc, row) => acc + row[col], 0));
  }

  const grand = rowTotals.reduce(add, 0);
  if (grand === 0) return { chi2: 0, df: 0, p: 1, cramersV: 0 };

  let chi2 = 0;
  for (let row = 0; row < nRows; row++) {
    for (let col = 0; col < nCols; col++) {
      const expected = (rowTotals[row] * colTotals[col]) / grand;
      // Cells with zero expected count contribute nothing (avoids dividing by zero).
      if (expected > 0) {
        chi2 += (observed[row][col] - expected) ** 2 / expected;
      }
    }
  }

  const df = (nRows - 1) * (nCols - 1);
  const smallerDim = Math.min(nRows, nCols);
  const cramersV = grand > 0 && smallerDim > 1
    ? Math.sqrt(chi2 / (grand * (smallerDim - 1)))
    : 0;

  return { chi2, df, p: chi2PValue(chi2, df), cramersV };
}
|
|
170
|
+
|
|
171
|
+
/**
 * Upper-tail p-value for a chi-square statistic.
 *
 * @param {number} x - Chi-square statistic.
 * @param {number} df - Degrees of freedom.
 * @returns {number} 1 when df <= 0 or x <= 0; a normal-CDF form for df = 1;
 *   exp(-x/2) for df = 2; a cube-root normal approximation for df > 2; and 1
 *   for any other df.
 */
function chi2PValue(x, df) {
  if (df <= 0 || x <= 0) return 1;

  if (df === 1) return 2 * (1 - normalCDF(Math.sqrt(x)));
  if (df === 2) return Math.exp(-x / 2);

  if (df > 2) {
    // Cube-root transform of x/df compared against a standard normal.
    const scale = 2 / (9 * df);
    const z = (Math.pow(x / df, 1 / 3) - (1 - scale)) / Math.sqrt(scale);
    return 1 - normalCDF(z);
  }

  return 1;
}
|
|
182
|
+
|
|
183
|
+
/**
 * Pearson correlation coefficient with a two-tailed p-value.
 *
 * @param {number[]} xs - First variable.
 * @param {number[]} ys - Second variable, paired with xs by index.
 * @returns {{r: number, p: number}} Correlation and p-value. Returns r = 0,
 *   p = 1 when fewer than three pairs are given or either variable is constant.
 */
function pearsonR(xs, ys) {
  if (xs.length < 3) return { r: 0, p: 1 };

  const n = xs.length;
  const mx = mean(xs);
  const my = mean(ys);

  let num = 0;
  let dx2 = 0;
  let dy2 = 0;
  for (let i = 0; i < n; i++) {
    const dx = xs[i] - mx;
    const dy = ys[i] - my;
    num += dx * dy;
    dx2 += dx * dx;
    dy2 += dy * dy;
  }

  const denom = Math.sqrt(dx2 * dy2);
  if (denom === 0) return { r: 0, p: 1 };

  // Rounding can push |r| marginally above 1 for perfectly collinear data,
  // which would make 1 - r^2 negative and turn t and p into NaN. Clamp it.
  const r = Math.max(-1, Math.min(1, num / denom));

  // For |r| = 1 this is +/-Infinity.
  const t = r * Math.sqrt((n - 2) / (1 - r * r));
  const p = tTestPValue(Math.abs(t), n - 2);
  return { r, p };
}
|
|
201
|
+
|
|
202
|
+
// ── Model Calls ──────────────────────────────────────────────────────────
|
|
203
|
+
|
|
204
|
+
/**
 * Sends a prompt to the selected coding model.
 *
 * @param {string} prompt - Prompt text.
 * @param {string} modelKey - 'claude-code' for the local CLI; any other key is
 *   passed on to callOpenRouter().
 * @returns {Promise<string>} The model's raw text response.
 */
async function callModel(prompt, modelKey) {
  switch (modelKey) {
    case 'claude-code':
      return callClaudeCode(prompt);
    default:
      return callOpenRouter(prompt, modelKey);
  }
}
|
|
208
|
+
|
|
209
|
+
/**
 * Runs the `claude` CLI in print mode, feeding the prompt on stdin.
 *
 * @param {string} prompt - Prompt text written to the child's stdin.
 * @returns {Promise<string>} Trimmed stdout of the CLI.
 * @throws {Error} If the process cannot be spawned, the prompt cannot be
 *   written, or the CLI exits with a non-zero code (message is stderr, else
 *   stdout, else a generic exit-code message).
 */
async function callClaudeCode(prompt) {
  const stdout = await new Promise((resolve, reject) => {
    // Run the CLI without ANTHROPIC_API_KEY in its environment.
    // NOTE(review): presumably so the CLI uses its own login instead of the API key — confirm.
    const env = { ...process.env };
    delete env.ANTHROPIC_API_KEY;

    const child = spawn('claude', ['-p', '-', '--output-format', 'text'], {
      stdio: ['pipe', 'pipe', 'pipe'],
      env,
    });

    let out = '';
    let err = '';
    child.stdout.on('data', d => { out += d; });
    child.stderr.on('data', d => { err += d; });

    child.on('error', e => reject(new Error(`Failed to spawn claude: ${e.message}`)));

    // Without a listener, a failed write (e.g. EPIPE when the child exits early
    // or never started) is an unhandled stream 'error' and crashes the process.
    // A partially delivered prompt cannot give a trustworthy answer, so reject.
    child.stdin.on('error', e => reject(new Error(`Failed to write prompt to claude: ${e.message}`)));

    child.on('close', code => {
      if (code !== 0) reject(new Error(err || out || `claude exited with code ${code}`));
      else resolve(out);
    });

    child.stdin.end(prompt);
  });
  return stdout.trim();
}
|
|
231
|
+
|
|
232
|
+
/**
 * Sends a single-message chat completion request to OpenRouter.
 *
 * @param {string} prompt - User message content.
 * @param {string} modelKey - Alias that must be an own key of MODEL_MAP.
 * @returns {Promise<string>} The first choice's message content.
 * @throws {Error} If OPENROUTER_API_KEY is unset, the alias is unknown, the
 *   HTTP status is not OK, the response has no content, or the request
 *   (including reading the body) exceeds the 120 s timeout and is aborted.
 */
async function callOpenRouter(prompt, modelKey) {
  const apiKey = process.env.OPENROUTER_API_KEY;
  if (!apiKey) throw new Error('OPENROUTER_API_KEY not set');

  // Own-property check so inherited names such as "constructor" are rejected.
  const model = Object.hasOwn(MODEL_MAP, modelKey) ? MODEL_MAP[modelKey] : undefined;
  if (!model) throw new Error(`Unknown model: ${modelKey}`);

  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), 120000);
  try {
    const res = await fetch('https://openrouter.ai/api/v1/chat/completions', {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'Authorization': `Bearer ${apiKey}`,
      },
      body: JSON.stringify({
        model,
        max_tokens: 2000,
        temperature: 0.1,
        include_reasoning: false,
        response_format: { type: 'json_object' },
        messages: [{ role: 'user', content: prompt }],
      }),
      signal: controller.signal,
    });

    if (!res.ok) {
      const body = await res.text();
      throw new Error(`OpenRouter ${res.status}: ${body.slice(0, 200)}`);
    }

    const data = await res.json();
    const content = data.choices?.[0]?.message?.content;
    if (!content) throw new Error('No content in response');
    return content;
  } finally {
    // Cleared only after the body is consumed, so the timeout also covers a
    // stalled download, and the timer is released on every exit path.
    clearTimeout(timeout);
  }
}
|
|
271
|
+
|
|
272
|
+
/**
 * Parses a model response as JSON, tolerating common wrappers.
 *
 * Attempts, in order: the content as-is; the body of the first Markdown code
 * fence; the span from the first "{" to the last "}" (for JSON surrounded by
 * prose).
 *
 * @param {string} content - Raw model output.
 * @returns {*} The parsed JSON value.
 * @throws {Error} "Failed to parse JSON: …" (first 300 characters of the
 *   content) when no attempt yields valid JSON.
 */
function parseJsonResponse(content) {
  try {
    return JSON.parse(content);
  } catch {
    // Not bare JSON; try to recover an embedded payload below.
  }

  const text = String(content);
  const candidates = [];

  const fenced = text.match(/```(?:json)?\s*([\s\S]*?)```/);
  if (fenced) candidates.push(fenced[1].trim());

  const firstBrace = text.indexOf('{');
  const lastBrace = text.lastIndexOf('}');
  if (firstBrace !== -1 && lastBrace > firstBrace) {
    candidates.push(text.slice(firstBrace, lastBrace + 1));
  }

  for (const candidate of candidates) {
    try {
      return JSON.parse(candidate);
    } catch {
      // Try the next candidate; the final error below reports the failure.
    }
  }

  throw new Error(`Failed to parse JSON: ${text.slice(0, 300)}`);
}
|
|
281
|
+
|
|
282
|
+
// ── Data Loading ─────────────────────────────────────────────────────────
|
|
283
|
+
|
|
284
|
+
/**
 * Loads the successful dialogue rows of one evaluation run.
 *
 * @param {object} db - Database handle exposing prepare(sql).all(...params).
 * @param {string} runId - Value bound to the run_id filter.
 * @returns {object[]} Rows with a non-null dialogue_id, ordered by profile_name then id.
 */
function loadRows(db, runId) {
  const sql = `
    SELECT id, dialogue_id, scenario_id, profile_name, overall_score,
           dialogue_rounds, suggestions
    FROM evaluation_results
    WHERE run_id = ? AND success = 1 AND dialogue_id IS NOT NULL
    ORDER BY profile_name, id
  `;
  const statement = db.prepare(sql);
  return statement.all(runId);
}
|
|
293
|
+
|
|
294
|
+
/**
 * Derives the experimental condition and persona from a profile name by
 * substring matching.
 *
 * @param {string} profileName - Profile name from an evaluation row.
 * @returns {{condition: string, persona: string}} condition is 'recognition'
 *   when the name contains "recog", otherwise 'base'; persona is the first of
 *   'suspicious', 'adversary', 'advocate' found in the name, else 'unknown'.
 */
function parseCondition(profileName) {
  const condition = profileName.includes('recog') ? 'recognition' : 'base';

  // Order matters: the first persona name contained in the profile wins.
  const knownPersonas = ['suspicious', 'adversary', 'advocate'];
  const persona = knownPersonas.find(name => profileName.includes(name)) ?? 'unknown';

  return { condition, persona };
}
|
|
305
|
+
|
|
306
|
+
/**
 * Reads a dialogue log from logs/tutor-dialogues/<dialogueId>.json under the
 * current working directory.
 *
 * @param {string} dialogueId - Dialogue identifier used as the file stem.
 * @returns {object|null} Parsed JSON, or null when the file does not exist.
 */
function loadDialogueLog(dialogueId) {
  const fileName = `${dialogueId}.json`;
  const logPath = path.join(process.cwd(), 'logs', 'tutor-dialogues', fileName);

  if (fs.existsSync(logPath)) {
    const raw = fs.readFileSync(logPath, 'utf8');
    return JSON.parse(raw);
  }
  return null;
}
|
|
311
|
+
|
|
312
|
+
// ── Tier 1: Structural Metrics ───────────────────────────────────────────
|
|
313
|
+
|
|
314
|
+
/**
 * Splits a dialogueTrace into turns, closing a turn at each `final_output` entry.
 *
 * Entries seen before a `final_output` belong to that turn; the `final_output`
 * entry itself is kept separately. When the trace contains no `final_output`
 * at all, the whole trace becomes a single turn 0. Entries that follow the last
 * `final_output` are not assigned to any turn.
 *
 * @param {object[]} dialogueTrace - Trace entries in order.
 * @returns {object[]} One record per turn with turnIndex, superegoEntries,
 *   egoEntries, learnerAction (first `turn_action` entry), contextInput (first
 *   `context_input` entry), allEntries and finalOutput.
 */
function segmentTraceByTurn(dialogueTrace) {
  const rawTurns = [];
  let pending = [];

  for (const entry of dialogueTrace) {
    if (entry.action !== 'final_output') {
      pending.push(entry);
      continue;
    }
    // Prefer the recorded index; otherwise number turns in order of appearance.
    const turnIndex = entry.turnIndex ?? rawTurns.length;
    rawTurns.push({ turnIndex, entries: pending, finalOutput: entry });
    pending = [];
  }

  const sawNoMarkers = rawTurns.length === 0;
  if (sawNoMarkers && pending.length > 0) {
    rawTurns.push({ turnIndex: 0, entries: pending, finalOutput: null });
  }

  return rawTurns.map(({ turnIndex, entries, finalOutput }) => ({
    turnIndex,
    superegoEntries: entries.filter(e => e.agent === 'superego'),
    egoEntries: entries.filter(e => e.agent === 'ego'),
    learnerAction: entries.find(e => e.action === 'turn_action'),
    contextInput: entries.find(e => e.action === 'context_input'),
    allEntries: entries,
    finalOutput,
  }));
}
|
|
362
|
+
|
|
363
|
+
/**
 * Computes Tier 1 structural metrics from segmented turns (no LLM calls).
 *
 * @param {object} dialogueLog - Parsed dialogue log; only
 *   transformationAnalysis.markerAnalysis is read from it.
 * @param {object[]} turns - Turn records as returned by segmentTraceByTurn().
 * @returns {{totalTurns: number, perTurn: object[], aggregate: object}}
 *   Per-turn counts (rejections, approvals, rounds, confidences, intervention
 *   types, suggestion shifts, learner action, feedback lengths) and their
 *   aggregates across the dialogue.
 */
function extractStructuralMetrics(dialogueLog, turns) {
  // ── Per-turn metrics ──
  const perTurn = turns.map(turn => {
    const reviews = turn.superegoEntries;

    const confidences = reviews
      .map(review => review.confidence)
      .filter(value => value != null);

    // Count of superego entries per intervention type within this turn.
    const interventionTypes = {};
    reviews.forEach(review => {
      const label = review.interventionType || 'unknown';
      interventionTypes[label] = (interventionTypes[label] || 0) + 1;
    });

    // One "actionType:actionTarget" key per ego entry that carries a first
    // suggestion; changes between consecutive keys are counted as shifts.
    const suggestionKeys = turn.egoEntries
      .filter(e => e.suggestions && e.suggestions[0])
      .map(e => `${e.suggestions[0].actionType}:${e.suggestions[0].actionTarget}`);

    return {
      turnIndex: turn.turnIndex,
      negationDepth: reviews.filter(review => review.approved === false).length,
      approvalCount: reviews.filter(review => review.approved === true).length,
      // Number of superego entries recorded for the turn.
      roundsToConverge: reviews.length,
      confidences,
      meanConfidence: confidences.length > 0 ? mean(confidences) : null,
      interventionTypes,
      suggestionTypeShifts: countShifts(suggestionKeys),
      learnerAction: turn.learnerAction?.detail || null,
      superegoFeedbackLengths: reviews
        .map(review => (review.feedback || '').length)
        .filter(length => length > 0),
    };
  });

  // ── Aggregates across turns ──
  const negationDepths = perTurn.map(t => t.negationDepth);
  const roundsPerTurn = perTurn.map(t => t.roundsToConverge);
  const allConfidences = perTurn.flatMap(t => t.confidences);
  const allFeedbackLengths = perTurn.flatMap(t => t.superegoFeedbackLengths);

  // Confidence trajectory compares the last turn with the first; it needs two turns.
  const firstTurnConf = perTurn[0]?.meanConfidence;
  let lastTurnConf = null;
  if (perTurn.length > 1) {
    lastTurnConf = perTurn[perTurn.length - 1]?.meanConfidence;
  }
  const hasBothConfidences = firstTurnConf != null && lastTurnConf != null;

  const totalInterventions = {};
  perTurn.forEach(t => {
    Object.entries(t.interventionTypes).forEach(([type, count]) => {
      totalInterventions[type] = (totalInterventions[type] || 0) + count;
    });
  });

  // Negative values mean the last turn needed fewer rounds than the first.
  let convergenceTrajectory = 0;
  if (roundsPerTurn.length > 1) {
    convergenceTrajectory = roundsPerTurn[roundsPerTurn.length - 1] - roundsPerTurn[0];
  }

  const aggregate = {
    meanNegationDepth: mean(negationDepths),
    totalNegations: negationDepths.reduce((acc, depth) => acc + depth, 0),
    meanRoundsToConverge: mean(roundsPerTurn),
    sdRoundsToConverge: std(roundsPerTurn),
    convergenceTrajectory,
    meanConfidence: allConfidences.length > 0 ? mean(allConfidences) : null,
    confidenceTrajectory: hasBothConfidences ? lastTurnConf - firstTurnConf : null,
    totalInterventions,
    meanFeedbackLength: allFeedbackLengths.length > 0 ? mean(allFeedbackLengths) : 0,
    learnerActionSequence: perTurn.map(t => t.learnerAction).filter(Boolean),
  };

  // Carry over the log's marker analysis, when present, as the incorporation rate.
  const markerAnalysis = dialogueLog.transformationAnalysis?.markerAnalysis;
  if (markerAnalysis) {
    aggregate.incorporationRate = markerAnalysis;
  }

  return { totalTurns: turns.length, perTurn, aggregate };
}
|
|
471
|
+
|
|
472
|
+
/**
 * Counts positions where an element differs (strict !==) from its predecessor.
 *
 * @param {Array} sequence - Ordered values to compare pairwise.
 * @returns {number} Number of adjacent pairs that differ; 0 for fewer than two items.
 */
function countShifts(sequence) {
  let changes = 0;
  for (let idx = sequence.length - 1; idx >= 1; idx--) {
    if (sequence[idx - 1] !== sequence[idx]) changes += 1;
  }
  return changes;
}
|
|
479
|
+
|
|
480
|
+
// ── Tier 2: LLM-Coded Metrics ───────────────────────────────────────────
|
|
481
|
+
|
|
482
|
+
/**
 * Builds the LLM prompt that asks whether the superego's stance reversed
 * between consecutive turns.
 *
 * @param {object[]} turns - Turn records as returned by segmentTraceByTurn().
 * @returns {string|null} The prompt, or null when no two consecutive turns
 *   both carry superego feedback.
 */
function buildStanceReversalPrompt(turns) {
  // All non-empty superego feedback of one turn, one item per line.
  const feedbackOf = turn => turn.superegoEntries
    .map(entry => entry.feedback)
    .filter(Boolean)
    .join('\n');

  const pairs = [];
  turns.slice(1).forEach((later, offset) => {
    const earlier = turns[offset];
    const feedbackA = feedbackOf(earlier);
    const feedbackB = feedbackOf(later);
    if (!feedbackA || !feedbackB) return;

    pairs.push({
      turnA: earlier.turnIndex,
      turnB: later.turnIndex,
      // Each side is capped at 600 characters to bound prompt size.
      feedbackA: feedbackA.slice(0, 600),
      feedbackB: feedbackB.slice(0, 600),
    });
  });

  if (pairs.length === 0) return null;

  const pairsText = pairs
    .map((pair, position) => [
      `### Pair ${position + 1} (Turn ${pair.turnA} → Turn ${pair.turnB})`,
      `**Turn ${pair.turnA} superego feedback:**`,
      pair.feedbackA,
      '',
      `**Turn ${pair.turnB} superego feedback:**`,
      pair.feedbackB,
    ].join('\n'))
    .join('\n\n');

  return `You are analyzing ego-superego dialogue traces from an AI tutoring system. The superego reviews and critiques the ego's suggestions across multiple turns of a tutoring conversation.

## Task: Detect Stance Reversals

For each consecutive pair of superego feedback, determine whether the superego's evaluative stance REVERSED between turns — that is, whether it contradicted or substantially changed its position on what matters in the tutor's response.

A stance reversal is NOT just giving different feedback about different content. It means the superego's priorities, values, or evaluative criteria shifted (e.g., first prioritizing emotional validation, then deprioritizing it; first rejecting a pedagogical approach, then endorsing a similar one).

${pairsText}

## Output Format

Return a JSON object:
{
  "pairs": [
    {
      "turnA": <number>,
      "turnB": <number>,
      "reversed": true|false,
      "directionA": "brief description of superego's stance in turn A (max 15 words)",
      "directionB": "brief description of superego's stance in turn B (max 15 words)",
      "reversal_type": "priority_shift|criteria_change|contradiction|none"
    }
  ],
  "total_reversals": <number>
}

Return ONLY the JSON object.`;
}
|
|
539
|
+
|
|
540
|
+
/**
 * Builds the LLM prompt that asks whether each turn's superego feedback
 * refers back to earlier turns.
 *
 * @param {object[]} turns - Turn records as returned by segmentTraceByTurn().
 * @returns {string|null} The prompt, or null when no turn has both superego
 *   feedback of its own and at least one preceding turn.
 */
function buildCrossTurnMemoryPrompt(turns) {
  const feedbackItems = turn => turn.superegoEntries
    .map(entry => entry.feedback)
    .filter(Boolean);

  // One short summary line per turn (feedback joined with "; ", capped at 200
  // characters); a turn's "prior turns" are the summaries of all turns before it.
  const summaries = turns.map(turn => {
    const condensed = feedbackItems(turn).join('; ').slice(0, 200);
    return `Turn ${turn.turnIndex}: ${condensed || '(no feedback)'}`;
  });

  const turnData = [];
  turns.forEach((turn, position) => {
    const feedback = feedbackItems(turn).join('\n');
    const priorSummaries = summaries.slice(0, position);
    if (!feedback || priorSummaries.length === 0) return;

    turnData.push({
      turnIndex: turn.turnIndex,
      feedback: feedback.slice(0, 600),
      priorSummaries,
    });
  });

  if (turnData.length === 0) return null;

  const turnText = turnData
    .map(item => [
      `### Turn ${item.turnIndex}`,
      '**Prior turns:**',
      item.priorSummaries.join('\n'),
      '',
      '**Current superego feedback:**',
      item.feedback,
    ].join('\n'))
    .join('\n\n');

  return `You are analyzing ego-superego dialogue traces from an AI tutoring system. The superego reviews the ego's suggestions across multiple external turns.

## Task: Detect Cross-Turn Memory References

For each turn's superego feedback, determine whether it explicitly or implicitly references feedback, decisions, or content from prior turns. This measures whether the superego maintains coherent memory across the dialogue.

Types of references:
- **explicit_reference**: Directly mentions a prior turn's decision or content
- **implicit_callback**: Uses phrasing or criteria that echo prior feedback
- **escalation**: Builds on prior criticism (e.g., "still not addressing...")
- **reversal_acknowledgment**: Notes a change from prior approach

${turnText}

## Output Format

Return a JSON object:
{
  "turns": [
    {
      "turnIndex": <number>,
      "references_prior": true|false,
      "reference_types": ["explicit_reference"|"implicit_callback"|"escalation"|"reversal_acknowledgment"],
      "evidence": "brief quote or description (max 20 words)"
    }
  ],
  "total_references": <number>,
  "memory_rate": <0-1 fraction of turns with cross-turn references>
}

Return ONLY the JSON object.`;
}
|
|
604
|
+
|
|
605
|
+
/**
 * Build the LLM prompt that asks whether each superego rejection corrects a
 * "hallucination" by the ego.
 *
 * A rejection is any superego entry with `approved === false` and a truthy
 * `feedback`. The entry directly before it in `turn.allEntries` is treated as
 * the rejected ego entry; its first suggestion's message is quoted.
 *
 * @param {Array<object>} turns - Turn objects exposing `turnIndex`,
 *   `contextInput`, `superegoEntries` and `allEntries`.
 * @returns {string|null} The prompt text, or null when no rejection exists.
 */
function buildHallucinationCorrectionPrompt(turns) {
  const found = [];

  for (const turn of turns) {
    const contextExcerpt = turn.contextInput?.rawContext?.slice(0, 400) || '(no context)';

    for (const entry of turn.superegoEntries) {
      // Only explicit rejections that carry feedback are of interest.
      if (entry.approved !== false || !entry.feedback) continue;

      // The rejected ego entry is taken to be the one right before the superego entry.
      const position = turn.allEntries.indexOf(entry);
      const preceding = position >= 1 ? turn.allEntries[position - 1] : null;
      const suggestionExcerpt = preceding?.suggestions?.[0]?.message?.slice(0, 400) || '(no suggestion)';

      found.push({
        turnIndex: turn.turnIndex,
        round: entry.round,
        egoSuggestion: suggestionExcerpt,
        superegoFeedback: entry.feedback.slice(0, 400),
        context: contextExcerpt,
      });
    }
  }

  if (found.length === 0) {
    return null;
  }

  // Only the first 6 rejections are included, to keep the prompt manageable.
  const rejectText = found
    .slice(0, 6)
    .map(({ turnIndex, round, context, egoSuggestion, superegoFeedback }, idx) =>
      [
        `### Rejection ${idx + 1} (Turn ${turnIndex}, Round ${round})`,
        `**Context:** ${context}`,
        `**Ego suggestion:** ${egoSuggestion}`,
        `**Superego rejection:** ${superegoFeedback}`,
      ].join('\n'),
    )
    .join('\n\n');

  return `You are analyzing ego-superego dialogue traces from an AI tutoring system. The superego sometimes rejects the ego's suggestions.

## Task: Detect Hallucination Corrections

For each rejection, determine whether the superego is correcting a "hallucination" — a case where the ego fabricated, misrepresented, or ignored factual information from the learner context. Types of hallucination:
- **context_fabrication**: Ego claims learner said/did something not in the context
- **context_omission**: Ego ignores explicit learner signals present in context
- **metric_misuse**: Ego references metrics (struggle signals, sessions) inaccurately
- **repetition_blindness**: Ego repeats a suggestion that already failed in a prior turn

Not all rejections are hallucination corrections — the superego may reject for tone, pedagogy, or framing reasons without detecting hallucination. Code only genuine factual corrections.

${rejectText}

## Output Format

Return a JSON object:
{
"rejections": [
{
"turnIndex": <number>,
"round": <number>,
"hallucination_detected": true|false,
"types": ["context_fabrication"|"context_omission"|"metric_misuse"|"repetition_blindness"],
"description": "brief description (max 20 words)"
}
],
"total_hallucinations": <number>,
"hallucination_rate": <0-1 fraction of rejections containing hallucination>
}

Return ONLY the JSON object.`;
}
|
|
669
|
+
|
|
670
|
+
/**
 * Build the LLM prompt that asks for qualitative "phase transitions" across
 * the turns of a dialogue.
 *
 * Each turn is summarized as a learner message, a learner action label and
 * the concatenated (truncated) superego feedback for that turn.
 *
 * @param {Array<object>} turns - Turn objects exposing `turnIndex`,
 *   `learnerAction`, `contextInput` and `superegoEntries`.
 * @returns {string|null} The prompt text, or null when fewer than two turns
 *   are available.
 */
function buildPhaseTransitionPrompt(turns) {
  const summarizeTurn = (turn) => {
    // Learner message: context summary first, then a raw-context excerpt, then a placeholder.
    let message = turn.learnerAction?.contextSummary;
    if (!message) message = turn.contextInput?.rawContext?.slice(0, 200);
    if (!message) message = '(initial turn)';

    // Superego stance: every non-empty feedback, each capped at 200 characters.
    const excerpts = [];
    for (const entry of turn.superegoEntries) {
      if (entry.feedback) excerpts.push(entry.feedback.slice(0, 200));
    }
    const stance = excerpts.join(' | ');

    const actionLabel = turn.learnerAction?.detail || 'initial';

    return {
      turnIndex: turn.turnIndex,
      learnerMessage: typeof message === 'string' ? message.slice(0, 300) : '(no message)',
      learnerAction: actionLabel,
      superegoStance: stance.slice(0, 400) || '(no superego feedback)',
    };
  };

  const summaries = [];
  for (const turn of turns) {
    summaries.push(summarizeTurn(turn));
  }

  // A transition needs at least two turns to compare.
  if (summaries.length < 2) {
    return null;
  }

  const seqText = summaries
    .map(({ turnIndex, learnerAction, learnerMessage, superegoStance }) =>
      [
        `### Turn ${turnIndex}`,
        `**Learner** [${learnerAction}]: ${learnerMessage}`,
        `**Superego stance:** ${superegoStance}`,
      ].join('\n'),
    )
    .join('\n\n');

  return `You are analyzing multi-turn ego-superego tutoring dialogues. The learner interacts with a tutor over multiple turns, and the superego reviews each of the tutor's responses.

## Task: Detect Phase Transitions

A phase transition occurs when the dialogue qualitatively shifts — the learner's engagement mode changes, the superego's evaluative priorities pivot, or the ego-superego dynamic fundamentally reorganizes. Types:
- **learner_mode_shift**: Learner moves from confusion to engagement, resistance to curiosity, etc.
- **superego_priority_pivot**: Superego shifts primary concern (e.g., from tone to content accuracy)
- **negotiation_pattern_change**: Ego-superego dynamic changes (e.g., from adversarial to cooperative)
- **pedagogical_escalation**: Tutor approach fundamentally changes strategy (review → practice, explain → scaffold)

## Dialogue Sequence

${seqText}

## Output Format

Return a JSON object:
{
"transitions": [
{
"between_turns": [<turnA>, <turnB>],
"shift_type": "learner_mode_shift|superego_priority_pivot|negotiation_pattern_change|pedagogical_escalation",
"description": "brief description (max 20 words)",
"superego_adapts": true|false
}
],
"total_transitions": <number>,
"transition_density": <transitions per inter-turn gap>
}

Return ONLY the JSON object.`;
}
|
|
729
|
+
|
|
730
|
+
/**
 * Run the four LLM-coded analyses for one dialogue, one after another.
 *
 * For each metric the matching prompt builder is called. A falsy prompt marks
 * the metric as skipped; otherwise the prompt is sent through `callModel` and
 * the reply is parsed with `parseJsonResponse`. A failure in either of those
 * two steps is recorded on the metric and in `errors` instead of being thrown.
 *
 * @param {Array<object>} turns - Segmented dialogue turns passed to each builder.
 * @param {string} modelKey - Model identifier forwarded to `callModel`.
 * @returns {Promise<object>} Object keyed by metric name plus an `errors` array.
 */
async function extractLLMCodedMetrics(turns, modelKey) {
  const llmMetrics = {
    stanceReversal: null,
    crossTurnMemory: null,
    hallucinationCorrection: null,
    phaseTransition: null,
    errors: [],
  };

  const buildersByMetric = [
    ['stanceReversal', buildStanceReversalPrompt],
    ['crossTurnMemory', buildCrossTurnMemoryPrompt],
    ['hallucinationCorrection', buildHallucinationCorrectionPrompt],
    ['phaseTransition', buildPhaseTransitionPrompt],
  ];

  for (const [metricName, buildPrompt] of buildersByMetric) {
    const promptText = buildPrompt(turns);

    if (promptText) {
      try {
        const rawReply = await callModel(promptText, modelKey);
        llmMetrics[metricName] = parseJsonResponse(rawReply);
      } catch (failure) {
        // Keep going: one failed metric must not discard the others.
        llmMetrics.errors.push({ metric: metricName, error: failure.message });
        llmMetrics[metricName] = { error: failure.message };
      }
    } else {
      llmMetrics[metricName] = { skipped: true, reason: 'insufficient data' };
    }
  }

  return llmMetrics;
}
|
|
764
|
+
|
|
765
|
+
// ── Profile Assembly ─────────────────────────────────────────────────────
|
|
766
|
+
|
|
767
|
+
/**
 * Assemble the per-dialogue modulation profile from a database row and the
 * metrics extracted for it.
 *
 * @param {object} row - Evaluation row; reads `id`, `dialogue_id`,
 *   `scenario_id`, `profile_name`, `overall_score` and `dialogue_rounds`.
 * @param {object} dialogueLog - Accepted for signature compatibility; not read here.
 * @param {object} structural - Structural metrics for the dialogue.
 * @param {object|null} [llmCoded] - LLM-coded metrics; stored as null when falsy.
 * @returns {object} Flat profile record.
 */
function buildModulationProfile(row, dialogueLog, structural, llmCoded) {
  const parsed = parseCondition(row.profile_name);

  const profile = {
    id: row.id,
    dialogueId: row.dialogue_id,
    scenarioId: row.scenario_id,
    profileName: row.profile_name,
    condition: parsed.condition,
    persona: parsed.persona,
    overallScore: row.overall_score,
    dialogueRounds: row.dialogue_rounds,
    structural,
    llmCoded: llmCoded ? llmCoded : null,
  };

  return profile;
}
|
|
782
|
+
|
|
783
|
+
// ── Aggregate Analysis ───────────────────────────────────────────────────
|
|
784
|
+
|
|
785
|
+
/**
 * Aggregate per-dialogue modulation profiles into condition-level statistics.
 *
 * Produces:
 *  - groupings by condition, persona and condition×persona cell;
 *  - base-vs-recognition comparisons (n/mean/sd, Cohen's d, Welch t) for each
 *    structural metric, with a per-persona breakdown;
 *  - pooled intervention-type and learner-action counts, plus a chi-square
 *    test on intervention types when more than one type occurs;
 *  - the same comparison for the four LLM-coded metrics;
 *  - correlations of selected structural metrics with the overall score.
 *
 * Relies on the file-level helpers `mean`, `std`, `cohensD`, `welchTTest`,
 * `chiSquareTest` and `pearsonR`.
 *
 * @param {Array<object>} profiles - Records shaped like the output of
 *   `buildModulationProfile`; `condition` must be 'base' or 'recognition'.
 * @returns {object} Analysis object with keys `n`, `byCondition`, `byPersona`,
 *   `byCell`, `structural`, `llmCoded` and `correlations`.
 */
function analyzeAggregateResults(profiles) {
  const analysis = {
    n: profiles.length,
    byCondition: { base: [], recognition: [] },
    byPersona: {},
    byCell: {},
    structural: {},
    llmCoded: {},
    correlations: {},
  };

  // Extract a metric from a group of profiles, dropping null/undefined values.
  const collect = (group, extract) => group.map((p) => extract(p)).filter((v) => v != null);
  // Descriptive statistics for one sample.
  const summarize = (vals) => ({ n: vals.length, mean: mean(vals), sd: std(vals) });
  // Full base-vs-recognition comparison block.
  const compare = (baseVals, recogVals) => ({
    base: summarize(baseVals),
    recognition: summarize(recogVals),
    d: cohensD(recogVals, baseVals),
    welch: welchTTest(recogVals, baseVals),
  });

  // Group profiles
  for (const p of profiles) {
    analysis.byCondition[p.condition].push(p);
    if (!analysis.byPersona[p.persona]) analysis.byPersona[p.persona] = [];
    analysis.byPersona[p.persona].push(p);
    const cellKey = `${p.condition}_${p.persona}`;
    if (!analysis.byCell[cellKey]) analysis.byCell[cellKey] = [];
    analysis.byCell[cellKey].push(p);
  }

  // ── Structural Metric Comparisons ──────────────────────────────────

  const structuralMetrics = [
    { key: 'meanNegationDepth', label: 'Mean Negation Depth', extract: (p) => p.structural.aggregate.meanNegationDepth },
    { key: 'totalNegations', label: 'Total Negations', extract: (p) => p.structural.aggregate.totalNegations },
    { key: 'meanRoundsToConverge', label: 'Mean Rounds to Converge', extract: (p) => p.structural.aggregate.meanRoundsToConverge },
    { key: 'convergenceTrajectory', label: 'Convergence Trajectory', extract: (p) => p.structural.aggregate.convergenceTrajectory },
    { key: 'meanConfidence', label: 'Mean Superego Confidence', extract: (p) => p.structural.aggregate.meanConfidence },
    { key: 'confidenceTrajectory', label: 'Confidence Trajectory', extract: (p) => p.structural.aggregate.confidenceTrajectory },
    { key: 'meanFeedbackLength', label: 'Mean Feedback Length', extract: (p) => p.structural.aggregate.meanFeedbackLength },
  ];

  for (const metric of structuralMetrics) {
    const baseVals = collect(analysis.byCondition.base, metric.extract);
    const recogVals = collect(analysis.byCondition.recognition, metric.extract);

    analysis.structural[metric.key] = {
      label: metric.label,
      ...compare(baseVals, recogVals),
    };

    // Per-persona breakdown (no Welch test at this level)
    const byPersona = {};
    for (const [persona, pProfiles] of Object.entries(analysis.byPersona)) {
      const baseP = collect(pProfiles.filter((p) => p.condition === 'base'), metric.extract);
      const recogP = collect(pProfiles.filter((p) => p.condition === 'recognition'), metric.extract);
      byPersona[persona] = {
        base: summarize(baseP),
        recognition: summarize(recogP),
        d: cohensD(recogP, baseP),
      };
    }
    analysis.structural[metric.key].byPersona = byPersona;
  }

  // ── Intervention Type Distribution (Categorical) ────────────────────

  const interventionCounts = { base: {}, recognition: {} };
  for (const p of profiles) {
    const itd = p.structural.aggregate.totalInterventions;
    for (const [type, count] of Object.entries(itd)) {
      interventionCounts[p.condition][type] = (interventionCounts[p.condition][type] || 0) + count;
    }
  }
  analysis.structural.interventionDistribution = interventionCounts;

  // Chi-square on intervention types (needs at least two categories)
  const allIntTypes = [...new Set([
    ...Object.keys(interventionCounts.base),
    ...Object.keys(interventionCounts.recognition),
  ])];
  if (allIntTypes.length > 1) {
    const observed = allIntTypes.map((t) => [
      interventionCounts.base[t] || 0,
      interventionCounts.recognition[t] || 0,
    ]);
    analysis.structural.interventionChiSquare = {
      ...chiSquareTest(observed),
      types: allIntTypes,
    };
  }

  // ── Learner Action Trajectory (Categorical) ─────────────────────────

  const learnerActionCounts = { base: {}, recognition: {} };
  for (const p of profiles) {
    for (const action of p.structural.aggregate.learnerActionSequence) {
      const normalized = action.replace(/^Learner:\s*/, '');
      learnerActionCounts[p.condition][normalized] =
        (learnerActionCounts[p.condition][normalized] || 0) + 1;
    }
  }
  analysis.structural.learnerActionDistribution = learnerActionCounts;

  // ── LLM-Coded Metric Aggregations ──────────────────────────────────

  const llmMetrics = [
    { key: 'stanceReversal', extract: (p) => p.llmCoded?.stanceReversal?.total_reversals },
    { key: 'crossTurnMemory', extract: (p) => p.llmCoded?.crossTurnMemory?.memory_rate },
    { key: 'hallucinationCorrection', extract: (p) => p.llmCoded?.hallucinationCorrection?.hallucination_rate },
    { key: 'phaseTransition', extract: (p) => p.llmCoded?.phaseTransition?.transition_density },
  ];

  // The values come from parsed LLM output, so keep only finite numbers;
  // skipped/errored metrics and malformed fields simply drop out.
  const collectNumeric = (group, extract) =>
    group.map((p) => extract(p)).filter((v) => typeof v === 'number' && Number.isFinite(v));

  const llmSamples = llmMetrics.map(({ key, extract }) => ({
    key,
    baseVals: collectNumeric(analysis.byCondition.base, extract),
    recogVals: collectNumeric(analysis.byCondition.recognition, extract),
  }));

  // Aggregate whenever ANY profile yielded a usable value. Checking only the
  // first profile would drop the whole tier if that one dialogue was skipped.
  const hasLlmData = llmSamples.some((s) => s.baseVals.length + s.recogVals.length > 0);
  if (hasLlmData) {
    for (const { key, baseVals, recogVals } of llmSamples) {
      analysis.llmCoded[key] = compare(baseVals, recogVals);
    }
  }

  // ── Correlations: modulation metrics vs overall_score ──────────────

  const correlationMetrics = [
    { key: 'negationDepth_score', extract: (p) => p.structural.aggregate.meanNegationDepth },
    { key: 'convergenceSpeed_score', extract: (p) => p.structural.aggregate.meanRoundsToConverge },
    { key: 'feedbackLength_score', extract: (p) => p.structural.aggregate.meanFeedbackLength },
  ];

  const scored = profiles.filter((p) => p.overallScore != null);
  if (scored.length >= 5) {
    for (const { key, extract } of correlationMetrics) {
      // Pairwise-complete: a profile contributes only if both values exist.
      const pairs = scored
        .map((p) => [extract(p), p.overallScore])
        .filter(([x]) => x != null);
      if (pairs.length < 5) continue;
      analysis.correlations[key] = pearsonR(
        pairs.map(([x]) => x),
        pairs.map(([, y]) => y),
      );
    }
  }

  return analysis;
}
|
|
967
|
+
|
|
968
|
+
// ── Report Generation ────────────────────────────────────────────────────
|
|
969
|
+
|
|
970
|
+
/**
 * Render the aggregate analysis as a Markdown report.
 *
 * Sections: structural metric comparison, per-persona breakdown, intervention
 * and learner-action distributions, correlations with overall score, optional
 * LLM-coded metrics, per-cell summary, and the dialogues with the most and
 * fewest total negations.
 *
 * Statistics that are missing or not finite are printed as `N/A` rather than
 * `NaN`/`undefined`. Relies on the file-level helper `mean`.
 *
 * @param {Array<object>} profiles - Per-dialogue modulation profiles.
 * @param {object} analysis - Result of `analyzeAggregateResults`.
 * @param {object} opts - Reads `runId`, `model` and `structuralOnly`.
 * @returns {string} Markdown document.
 */
function generateReport(profiles, analysis, opts) {
  // Fixed-point formatting that degrades to 'N/A' for null/undefined/NaN/±Infinity.
  const fmt = (value, digits) =>
    (typeof value === 'number' && Number.isFinite(value) ? value.toFixed(digits) : 'N/A');
  // APA-style p-value: '<.001' below the threshold, otherwise three decimals.
  const fmtP = (p) => (p < 0.001 ? '<.001' : fmt(p, 3));
  // Mean over the non-null values only; 'N/A' when nothing is left.
  const meanOrNA = (values, digits) => {
    const present = values.filter((v) => v != null);
    return present.length > 0 ? fmt(mean(present), digits) : 'N/A';
  };
  // Entries of analysis.structural that are not base-vs-recognition metric rows.
  const NON_METRIC_KEYS = new Set([
    'interventionDistribution',
    'interventionChiSquare',
    'learnerActionDistribution',
  ]);

  const timestamp = new Date().toISOString();
  const baseN = analysis.byCondition.base.length;
  const recogN = analysis.byCondition.recognition.length;

  let md = `# Dialectical Modulation Coding Analysis

**Generated:** ${timestamp}
**Run ID:** ${opts.runId}
**N:** ${analysis.n} dialogues (base=${baseN}, recognition=${recogN})
**Personas:** ${Object.keys(analysis.byPersona).join(', ')}
**Scenarios:** ${[...new Set(profiles.map((p) => p.scenarioId))].join(', ')}
**Model:** ${opts.model}
**Mode:** ${opts.structuralOnly ? 'structural only' : 'full (structural + LLM-coded)'}

## 1. Structural Metrics (Tier 1)

### 1.1 Condition Comparison: Base vs Recognition

| Metric | Base (N=${baseN}) | Recog (N=${recogN}) | Cohen's d | Welch t | p |
|--------|-----------|-------------|-----------|---------|---|
`;

  for (const [key, data] of Object.entries(analysis.structural)) {
    if (NON_METRIC_KEYS.has(key)) continue;
    const bStr = `${fmt(data.base.mean, 2)} (${fmt(data.base.sd, 2)})`;
    const rStr = `${fmt(data.recognition.mean, 2)} (${fmt(data.recognition.sd, 2)})`;
    md += `| ${data.label} | ${bStr} | ${rStr} | ${fmt(data.d, 2)} | ${fmt(data.welch.t, 2)} | ${fmtP(data.welch.p)} |\n`;
  }

  // Per-persona breakdown
  md += `\n### 1.2 Per-Persona Breakdown\n`;
  for (const [persona, pProfiles] of Object.entries(analysis.byPersona)) {
    const pBase = pProfiles.filter((p) => p.condition === 'base').length;
    const pRecog = pProfiles.filter((p) => p.condition === 'recognition').length;
    md += `\n#### ${persona} (base=${pBase}, recog=${pRecog})\n\n`;
    md += `| Metric | Base | Recog | d |\n|--------|------|-------|---|\n`;

    for (const data of Object.values(analysis.structural)) {
      // Only metric rows carry a byPersona breakdown.
      if (!data.byPersona || !data.byPersona[persona]) continue;
      const bp = data.byPersona[persona];
      md += `| ${data.label} | ${fmt(bp.base.mean, 2)} | ${fmt(bp.recognition.mean, 2)} | ${fmt(bp.d, 2)} |\n`;
    }
  }

  // Rows of a two-column count table over the union of keys in both conditions.
  const countRows = (distribution) => {
    const baseCounts = distribution?.base || {};
    const recogCounts = distribution?.recognition || {};
    const labels = [...new Set([...Object.keys(baseCounts), ...Object.keys(recogCounts)])];
    return labels
      .map((label) => `| ${label} | ${baseCounts[label] || 0} | ${recogCounts[label] || 0} |\n`)
      .join('');
  };

  // Intervention distribution
  md += `\n### 1.3 Intervention Type Distribution\n\n`;
  md += `| Type | Base | Recognition |\n|------|------|-------------|\n`;
  md += countRows(analysis.structural.interventionDistribution);
  if (analysis.structural.interventionChiSquare) {
    const cs = analysis.structural.interventionChiSquare;
    const pStr = cs.p < 0.001 ? 'p < .001' : `p = ${fmt(cs.p, 3)}`;
    md += `\n**Chi-square:** χ²(${cs.df}) = ${fmt(cs.chi2, 2)}, ${pStr}, V = ${fmt(cs.cramersV, 3)}\n`;
  }

  // Learner action distribution
  md += `\n### 1.4 Learner Action Distribution\n\n`;
  md += `| Action | Base | Recognition |\n|--------|------|-------------|\n`;
  md += countRows(analysis.structural.learnerActionDistribution);

  // Correlations
  md += `\n### 1.5 Correlations with Overall Score\n\n`;
  md += `| Modulation Metric | r | p |\n|-------------------|---|---|\n`;
  for (const [key, corr] of Object.entries(analysis.correlations)) {
    md += `| ${key.replace(/_/g, ' ')} | ${fmt(corr.r, 3)} | ${fmtP(corr.p)} |\n`;
  }

  // LLM-coded metrics
  if (!opts.structuralOnly && Object.keys(analysis.llmCoded).length > 0) {
    md += `\n## 2. LLM-Coded Metrics (Tier 2)\n\n`;
    md += `| Metric | Base | Recog | Cohen's d | Welch t | p |\n`;
    md += `|--------|------|-------|-----------|---------|---|\n`;

    const llmLabels = {
      stanceReversal: 'Stance Reversals (count)',
      crossTurnMemory: 'Cross-Turn Memory Rate',
      hallucinationCorrection: 'Hallucination Rate',
      phaseTransition: 'Phase Transition Density',
    };

    for (const [key, data] of Object.entries(analysis.llmCoded)) {
      if (!data.base) continue;
      const bStr = `${fmt(data.base.mean, 3)} (${fmt(data.base.sd, 3)})`;
      const rStr = `${fmt(data.recognition.mean, 3)} (${fmt(data.recognition.sd, 3)})`;
      md += `| ${llmLabels[key] || key} | ${bStr} | ${rStr} | ${fmt(data.d, 2)} | ${fmt(data.welch.t, 2)} | ${fmtP(data.welch.p)} |\n`;
    }
  }

  // Per-cell summary table
  md += `\n## 3. Per-Cell Summary\n\n`;
  md += `| Cell | N | Mean Score | Mean Neg Depth | Mean Rounds | Mean Confidence |\n`;
  md += `|------|---|------------|----------------|-------------|------------------|\n`;
  for (const [cellKey, cellProfiles] of Object.entries(analysis.byCell)) {
    const scoreStr = meanOrNA(cellProfiles.map((p) => p.overallScore), 1);
    const negStr = meanOrNA(cellProfiles.map((p) => p.structural.aggregate.meanNegationDepth), 2);
    const roundsStr = meanOrNA(cellProfiles.map((p) => p.structural.aggregate.meanRoundsToConverge), 2);
    const confStr = meanOrNA(cellProfiles.map((p) => p.structural.aggregate.meanConfidence), 3);
    md += `| ${cellKey} | ${cellProfiles.length} | ${scoreStr} | ${negStr} | ${roundsStr} | ${confStr} |\n`;
  }

  // Exemplar dialogues
  md += `\n## 4. Exemplar Dialogues\n\n`;
  // Ranked by total negations, descending (copy first: sort mutates).
  const sorted = [...profiles].sort((a, b) =>
    b.structural.aggregate.totalNegations - a.structural.aggregate.totalNegations
  );
  if (sorted.length > 0) {
    const high = sorted[0];
    const low = sorted[sorted.length - 1];
    md += `**Highest negation depth** (${high.structural.aggregate.totalNegations} total negations):\n`;
    md += `- ID: ${high.id}, ${high.condition}/${high.persona}, score=${fmt(high.overallScore, 1)}\n`;
    md += `- Turns: ${high.structural.totalTurns}, convergence trajectory: ${high.structural.aggregate.convergenceTrajectory}\n\n`;
    md += `**Lowest negation depth** (${low.structural.aggregate.totalNegations} total negations):\n`;
    md += `- ID: ${low.id}, ${low.condition}/${low.persona}, score=${fmt(low.overallScore, 1)}\n`;
    md += `- Turns: ${low.structural.totalTurns}, convergence trajectory: ${low.structural.aggregate.convergenceTrajectory}\n`;
  }

  return md;
}
|
|
1108
|
+
|
|
1109
|
+
// ── CLI ──────────────────────────────────────────────────────────────────
|
|
1110
|
+
|
|
1111
|
+
/**
 * Parse command-line options from `process.argv`.
 *
 * Recognized options: `--model <model>`, `--run-id <id>`, `--structural-only`
 * and `--help` (prints usage and exits with code 0). An option that requires a
 * value but has none (or is followed by another `--flag`) prints an error and
 * exits with code 1 instead of leaving the option `undefined`. Unrecognized
 * arguments are reported with a warning and otherwise ignored.
 *
 * @returns {{model: string, runId: string, structuralOnly: boolean}} Parsed
 *   options; defaults are 'claude-code', DEFAULT_RUN_ID and false.
 */
function parseArgs() {
  const args = process.argv.slice(2);
  const opts = {
    model: 'claude-code',
    runId: DEFAULT_RUN_ID,
    structuralOnly: false,
  };

  // Fetch the value for `flag` at `index`, aborting when it is absent.
  const requireValue = (flag, index) => {
    const value = args[index];
    if (value === undefined || value.startsWith('--')) {
      console.error(`Missing value for ${flag}`);
      process.exit(1);
    }
    return value;
  };

  for (let i = 0; i < args.length; i++) {
    switch (args[i]) {
      case '--model':
        opts.model = requireValue('--model', ++i);
        break;
      case '--run-id':
        opts.runId = requireValue('--run-id', ++i);
        break;
      case '--structural-only':
        opts.structuralOnly = true;
        break;
      case '--help':
        console.log(`Usage: node scripts/code-dialectical-modulation.js [options]

Options:
--model <model> Model for LLM coding (default: claude-code)
claude-code — Claude Code CLI (subscription, free)
haiku — OpenRouter Haiku
sonnet — OpenRouter Sonnet
--run-id <id> Run ID (default: ${DEFAULT_RUN_ID})
--structural-only Skip LLM coding, emit only structural metrics
--help Show this help`);
        process.exit(0);
        break;
      default:
        // A misspelled flag would otherwise silently change what the run does.
        console.warn(`Ignoring unknown option: ${args[i]}`);
    }
  }
  return opts;
}
|
|
1139
|
+
|
|
1140
|
+
// ── Main ─────────────────────────────────────────────────────────────────
|
|
1141
|
+
|
|
1142
|
+
async function main() {
|
|
1143
|
+
const opts = parseArgs();
|
|
1144
|
+
|
|
1145
|
+
const dbPath = path.join(process.cwd(), 'data', 'evaluations.db');
|
|
1146
|
+
if (!fs.existsSync(dbPath)) {
|
|
1147
|
+
console.error('Database not found:', dbPath);
|
|
1148
|
+
process.exit(1);
|
|
1149
|
+
}
|
|
1150
|
+
|
|
1151
|
+
const db = new Database(dbPath, { readonly: true });
|
|
1152
|
+
|
|
1153
|
+
console.log('='.repeat(70));
|
|
1154
|
+
console.log('DIALECTICAL MODULATION CODING');
|
|
1155
|
+
console.log('='.repeat(70));
|
|
1156
|
+
console.log(`Model: ${opts.model} | Run ID: ${opts.runId} | Mode: ${opts.structuralOnly ? 'structural only' : 'full'}`);
|
|
1157
|
+
|
|
1158
|
+
// Load rows
|
|
1159
|
+
const rows = loadRows(db, opts.runId);
|
|
1160
|
+
console.log(`\nLoaded ${rows.length} rows with dialogue IDs`);
|
|
1161
|
+
|
|
1162
|
+
if (rows.length === 0) {
|
|
1163
|
+
console.error('No rows found.');
|
|
1164
|
+
db.close();
|
|
1165
|
+
return;
|
|
1166
|
+
}
|
|
1167
|
+
|
|
1168
|
+
// Summary
|
|
1169
|
+
const condCounts = { base: 0, recognition: 0 };
|
|
1170
|
+
const personaCounts = {};
|
|
1171
|
+
for (const row of rows) {
|
|
1172
|
+
const { condition, persona } = parseCondition(row.profile_name);
|
|
1173
|
+
condCounts[condition]++;
|
|
1174
|
+
personaCounts[persona] = (personaCounts[persona] || 0) + 1;
|
|
1175
|
+
}
|
|
1176
|
+
console.log(` Base: ${condCounts.base}, Recognition: ${condCounts.recognition}`);
|
|
1177
|
+
console.log(` Personas: ${Object.entries(personaCounts).map(([k, v]) => `${k}=${v}`).join(', ')}`);
|
|
1178
|
+
|
|
1179
|
+
// Ensure exports directory
|
|
1180
|
+
const exportsDir = path.join(process.cwd(), 'exports');
|
|
1181
|
+
if (!fs.existsSync(exportsDir)) {
|
|
1182
|
+
fs.mkdirSync(exportsDir, { recursive: true });
|
|
1183
|
+
}
|
|
1184
|
+
|
|
1185
|
+
const timestamp = new Date().toISOString().replace(/[:.]/g, '-').slice(0, 19);
|
|
1186
|
+
|
|
1187
|
+
// Process each dialogue
|
|
1188
|
+
const profiles = [];
|
|
1189
|
+
let loadErrors = 0;
|
|
1190
|
+
let llmErrors = 0;
|
|
1191
|
+
const startTime = Date.now();
|
|
1192
|
+
|
|
1193
|
+
for (let i = 0; i < rows.length; i++) {
|
|
1194
|
+
const row = rows[i];
|
|
1195
|
+
const progress = `[${i + 1}/${rows.length}]`;
|
|
1196
|
+
|
|
1197
|
+
// Load dialogue log
|
|
1198
|
+
const dialogueLog = loadDialogueLog(row.dialogue_id);
|
|
1199
|
+
if (!dialogueLog || !dialogueLog.dialogueTrace) {
|
|
1200
|
+
process.stdout.write(` ${progress} SKIP ${row.dialogue_id}: no dialogue log\n`);
|
|
1201
|
+
loadErrors++;
|
|
1202
|
+
continue;
|
|
1203
|
+
}
|
|
1204
|
+
|
|
1205
|
+
// Segment trace by turn
|
|
1206
|
+
const turns = segmentTraceByTurn(dialogueLog.dialogueTrace);
|
|
1207
|
+
if (turns.length === 0) {
|
|
1208
|
+
process.stdout.write(` ${progress} SKIP ${row.dialogue_id}: no turns found\n`);
|
|
1209
|
+
loadErrors++;
|
|
1210
|
+
continue;
|
|
1211
|
+
}
|
|
1212
|
+
|
|
1213
|
+
// Extract structural metrics
|
|
1214
|
+
const structural = extractStructuralMetrics(dialogueLog, turns);
|
|
1215
|
+
|
|
1216
|
+
// Extract LLM-coded metrics (if not structural-only)
|
|
1217
|
+
let llmCoded = null;
|
|
1218
|
+
if (!opts.structuralOnly) {
|
|
1219
|
+
process.stdout.write(` ${progress} ${row.dialogue_id} (${turns.length} turns) — LLM coding...`);
|
|
1220
|
+
llmCoded = await extractLLMCodedMetrics(turns, opts.model);
|
|
1221
|
+
if (llmCoded.errors.length > 0) {
|
|
1222
|
+
llmErrors += llmCoded.errors.length;
|
|
1223
|
+
console.log(` ${llmCoded.errors.length} errors`);
|
|
1224
|
+
} else {
|
|
1225
|
+
console.log(' done');
|
|
1226
|
+
}
|
|
1227
|
+
} else {
|
|
1228
|
+
process.stdout.write(` ${progress} ${row.dialogue_id} (${turns.length} turns) — structural\n`);
|
|
1229
|
+
}
|
|
1230
|
+
|
|
1231
|
+
const profile = buildModulationProfile(row, dialogueLog, structural, llmCoded);
|
|
1232
|
+
profiles.push(profile);
|
|
1233
|
+
}
|
|
1234
|
+
|
|
1235
|
+
const elapsed = ((Date.now() - startTime) / 1000).toFixed(1);
|
|
1236
|
+
console.log(`\nProcessing complete: ${profiles.length} profiles, ${loadErrors} load errors, ${llmErrors} LLM errors, ${elapsed}s`);
|
|
1237
|
+
|
|
1238
|
+
if (profiles.length === 0) {
|
|
1239
|
+
console.error('No profiles generated.');
|
|
1240
|
+
db.close();
|
|
1241
|
+
return;
|
|
1242
|
+
}
|
|
1243
|
+
|
|
1244
|
+
// Analyze
|
|
1245
|
+
const analysis = analyzeAggregateResults(profiles);
|
|
1246
|
+
|
|
1247
|
+
// Write outputs
|
|
1248
|
+
const jsonPath = path.join(exportsDir, `dialectical-modulation-${timestamp}.json`);
|
|
1249
|
+
fs.writeFileSync(jsonPath, JSON.stringify({
|
|
1250
|
+
generated: new Date().toISOString(),
|
|
1251
|
+
model: opts.model,
|
|
1252
|
+
runId: opts.runId,
|
|
1253
|
+
mode: opts.structuralOnly ? 'structural' : 'full',
|
|
1254
|
+
n: profiles.length,
|
|
1255
|
+
loadErrors,
|
|
1256
|
+
llmErrors,
|
|
1257
|
+
profiles,
|
|
1258
|
+
analysis,
|
|
1259
|
+
}, null, 2));
|
|
1260
|
+
console.log(`\nJSON: ${jsonPath}`);
|
|
1261
|
+
|
|
1262
|
+
const mdReport = generateReport(profiles, analysis, opts);
|
|
1263
|
+
const mdPath = path.join(exportsDir, `dialectical-modulation-${timestamp}.md`);
|
|
1264
|
+
fs.writeFileSync(mdPath, mdReport);
|
|
1265
|
+
console.log(`Markdown: ${mdPath}`);
|
|
1266
|
+
|
|
1267
|
+
// Print summary
|
|
1268
|
+
console.log('\n' + '─'.repeat(70));
|
|
1269
|
+
console.log('STRUCTURAL METRICS SUMMARY: Base vs Recognition');
|
|
1270
|
+
console.log('─'.repeat(70));
|
|
1271
|
+
console.log(`${'Metric'.padEnd(30)} ${'Base'.padEnd(14)} ${'Recog'.padEnd(14)} ${'d'.padEnd(8)} p`);
|
|
1272
|
+
console.log('─'.repeat(70));
|
|
1273
|
+
|
|
1274
|
+
for (const [key, data] of Object.entries(analysis.structural)) {
|
|
1275
|
+
if (key === 'interventionDistribution' || key === 'interventionChiSquare' || key === 'learnerActionDistribution') continue;
|
|
1276
|
+
const bStr = data.base.mean.toFixed(2);
|
|
1277
|
+
const rStr = data.recognition.mean.toFixed(2);
|
|
1278
|
+
const pStr = data.welch.p < 0.001 ? '<.001' : data.welch.p.toFixed(3);
|
|
1279
|
+
console.log(` ${data.label.padEnd(28)} ${bStr.padEnd(14)} ${rStr.padEnd(14)} ${data.d.toFixed(2).padEnd(8)} ${pStr}`);
|
|
1280
|
+
}
|
|
1281
|
+
|
|
1282
|
+
// Correlations
|
|
1283
|
+
if (Object.keys(analysis.correlations).length > 0) {
|
|
1284
|
+
console.log('\n' + '─'.repeat(70));
|
|
1285
|
+
console.log('CORRELATIONS WITH OVERALL SCORE');
|
|
1286
|
+
console.log('─'.repeat(70));
|
|
1287
|
+
for (const [key, corr] of Object.entries(analysis.correlations)) {
|
|
1288
|
+
const pStr = corr.p < 0.001 ? '<.001' : corr.p.toFixed(3);
|
|
1289
|
+
console.log(` ${key.replace(/_/g, ' ').padEnd(35)} r = ${corr.r.toFixed(3).padEnd(8)} p = ${pStr}`);
|
|
1290
|
+
}
|
|
1291
|
+
}
|
|
1292
|
+
|
|
1293
|
+
// LLM summary
|
|
1294
|
+
if (!opts.structuralOnly && Object.keys(analysis.llmCoded).length > 0) {
|
|
1295
|
+
console.log('\n' + '─'.repeat(70));
|
|
1296
|
+
console.log('LLM-CODED METRICS SUMMARY: Base vs Recognition');
|
|
1297
|
+
console.log('─'.repeat(70));
|
|
1298
|
+
|
|
1299
|
+
const llmLabels = {
|
|
1300
|
+
stanceReversal: 'Stance Reversals',
|
|
1301
|
+
crossTurnMemory: 'Cross-Turn Memory',
|
|
1302
|
+
hallucinationCorrection: 'Hallucination Rate',
|
|
1303
|
+
phaseTransition: 'Phase Transitions',
|
|
1304
|
+
};
|
|
1305
|
+
|
|
1306
|
+
for (const [key, data] of Object.entries(analysis.llmCoded)) {
|
|
1307
|
+
if (!data.base) continue;
|
|
1308
|
+
const pStr = data.welch.p < 0.001 ? '<.001' : data.welch.p.toFixed(3);
|
|
1309
|
+
console.log(` ${(llmLabels[key] || key).padEnd(28)} base=${data.base.mean.toFixed(3).padEnd(8)} recog=${data.recognition.mean.toFixed(3).padEnd(8)} d=${data.d.toFixed(2).padEnd(8)} p=${pStr}`);
|
|
1310
|
+
}
|
|
1311
|
+
}
|
|
1312
|
+
|
|
1313
|
+
db.close();
|
|
1314
|
+
console.log('\nDone.');
|
|
1315
|
+
}
|
|
1316
|
+
|
|
1317
|
+
// Script entry point: start main() and, if the returned promise rejects,
// print the failure to stderr and terminate the process with exit status 1.
main().catch(function onFatal(failure) {
  console.error('Fatal error:', failure);
  process.exit(1);
});
|