@machinespirits/eval 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. package/README.md +91 -9
  2. package/config/eval-settings.yaml +3 -3
  3. package/config/paper-manifest.json +486 -0
  4. package/config/providers.yaml +9 -6
  5. package/config/tutor-agents.yaml +2261 -0
  6. package/content/README.md +23 -0
  7. package/content/courses/479/course.md +53 -0
  8. package/content/courses/479/lecture-1.md +361 -0
  9. package/content/courses/479/lecture-2.md +360 -0
  10. package/content/courses/479/lecture-3.md +655 -0
  11. package/content/courses/479/lecture-4.md +530 -0
  12. package/content/courses/479/lecture-5.md +326 -0
  13. package/content/courses/479/lecture-6.md +346 -0
  14. package/content/courses/479/lecture-7.md +326 -0
  15. package/content/courses/479/lecture-8.md +273 -0
  16. package/content/courses/479/roadmap-slides.md +656 -0
  17. package/content/manifest.yaml +8 -0
  18. package/docs/research/build.sh +44 -20
  19. package/docs/research/figures/figure10.png +0 -0
  20. package/docs/research/figures/figure11.png +0 -0
  21. package/docs/research/figures/figure3.png +0 -0
  22. package/docs/research/figures/figure4.png +0 -0
  23. package/docs/research/figures/figure5.png +0 -0
  24. package/docs/research/figures/figure6.png +0 -0
  25. package/docs/research/figures/figure7.png +0 -0
  26. package/docs/research/figures/figure8.png +0 -0
  27. package/docs/research/figures/figure9.png +0 -0
  28. package/docs/research/header.tex +23 -2
  29. package/docs/research/paper-full.md +941 -285
  30. package/docs/research/paper-short.md +216 -585
  31. package/docs/research/references.bib +132 -0
  32. package/docs/research/slides-header.tex +188 -0
  33. package/docs/research/slides-pptx.md +363 -0
  34. package/docs/research/slides.md +531 -0
  35. package/docs/research/style-reference-pptx.py +199 -0
  36. package/package.json +6 -5
  37. package/scripts/analyze-eval-results.js +69 -17
  38. package/scripts/analyze-mechanism-traces.js +763 -0
  39. package/scripts/analyze-modulation-learning.js +498 -0
  40. package/scripts/analyze-prosthesis.js +144 -0
  41. package/scripts/analyze-run.js +264 -79
  42. package/scripts/assess-transcripts.js +853 -0
  43. package/scripts/browse-transcripts.js +854 -0
  44. package/scripts/check-parse-failures.js +73 -0
  45. package/scripts/code-dialectical-modulation.js +1320 -0
  46. package/scripts/download-data.sh +55 -0
  47. package/scripts/eval-cli.js +106 -18
  48. package/scripts/generate-paper-figures.js +663 -0
  49. package/scripts/generate-paper-figures.py +577 -76
  50. package/scripts/generate-paper-tables.js +299 -0
  51. package/scripts/qualitative-analysis-ai.js +3 -3
  52. package/scripts/render-sequence-diagram.js +694 -0
  53. package/scripts/test-latency.js +210 -0
  54. package/scripts/test-rate-limit.js +95 -0
  55. package/scripts/test-token-budget.js +332 -0
  56. package/scripts/validate-paper-manifest.js +670 -0
  57. package/services/__tests__/evalConfigLoader.test.js +2 -2
  58. package/services/__tests__/learnerRubricEvaluator.test.js +361 -0
  59. package/services/__tests__/learnerTutorInteractionEngine.test.js +326 -0
  60. package/services/evaluationRunner.js +975 -98
  61. package/services/evaluationStore.js +12 -4
  62. package/services/learnerTutorInteractionEngine.js +27 -2
  63. package/services/mockProvider.js +133 -0
  64. package/services/promptRewriter.js +1471 -5
  65. package/services/rubricEvaluator.js +55 -2
  66. package/services/transcriptFormatter.js +675 -0
  67. package/docs/EVALUATION-VARIABLES.md +0 -589
  68. package/docs/REPLICATION-PLAN.md +0 -577
  69. package/scripts/analyze-run.mjs +0 -282
  70. package/scripts/compare-runs.js +0 -44
  71. package/scripts/compare-suggestions.js +0 -80
  72. package/scripts/dig-into-run.js +0 -158
  73. package/scripts/show-failed-suggestions.js +0 -64
  74. /package/scripts/{check-run.mjs → check-run.js} +0 -0
@@ -0,0 +1,1320 @@
1
+ #!/usr/bin/env node
2
+
3
+ /**
4
+ * Dialectical Modulation Coding Script
5
+ *
6
+ * Extracts structural and LLM-coded modulation metrics from multi-turn
7
+ * dialectical superego dialogues. Compares across conditions (base vs
8
+ * recognition) × persona type (suspicious, adversary, advocate).
9
+ *
10
+ * Tier 1 (structural): Parsed directly from dialogueTrace JSON — no LLM calls.
11
+ * Tier 2 (LLM-coded): 4 semantic prompts per dialogue for stance reversal,
12
+ * cross-turn memory, hallucination correction, phase transition detection.
13
+ *
14
+ * Usage:
15
+ * node scripts/code-dialectical-modulation.js [options]
16
+ *
17
+ * Options:
18
+ * --run-id <id> Run ID (default: eval-2026-02-11-a54235ea)
19
+ * --model <model> Model for LLM coding (default: claude-code)
20
+ * --structural-only Skip LLM coding, emit only structural metrics
21
+ * --help Show help
22
+ */
23
+
24
+ import 'dotenv/config';
25
+ import Database from 'better-sqlite3';
26
+ import fs from 'fs';
27
+ import path from 'path';
28
+ import { spawn } from 'child_process';
29
+
30
+ // ── Constants ────────────────────────────────────────────────────────────
31
+
32
/** Run analysed when no --run-id option is supplied. */
const DEFAULT_RUN_ID = 'eval-2026-02-11-a54235ea';

/**
 * Short model aliases mapped to the identifier sent as the request `model`.
 * The 'claude-code' alias is dispatched to the local CLI by callModel and is
 * never sent over HTTP.
 */
const MODEL_MAP = {
  'claude-code': 'claude-code',
  'haiku': 'anthropic/claude-haiku-4.5',
  'sonnet': 'anthropic/claude-sonnet-4.5',
  'gpt': 'openai/gpt-5.2',
};
40
+
41
+ // ── Statistical Helpers ──────────────────────────────────────────────────
42
+
43
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param {number[]} arr - Values to average.
 * @returns {number} The mean, or 0 when the list is empty.
 */
function mean(arr) {
  const count = arr.length;
  if (!count) return 0;
  let total = 0;
  arr.forEach((value) => {
    total += value;
  });
  return total / count;
}
47
+
48
/**
 * Sample standard deviation (n - 1 denominator).
 *
 * @param {number[]} arr - Observations.
 * @returns {number} The standard deviation, or 0 with fewer than two values.
 */
function std(arr) {
  const count = arr.length;
  if (count < 2) return 0;
  const center = mean(arr);
  let sumOfSquares = 0;
  arr.forEach((value) => {
    sumOfSquares += (value - center) ** 2;
  });
  return Math.sqrt(sumOfSquares / (count - 1));
}
53
+
54
/**
 * Cohen's d effect size using the pooled sample standard deviation.
 *
 * @param {number[]} group1 - First sample.
 * @param {number[]} group2 - Second sample.
 * @returns {number} (mean1 - mean2) / pooled SD; 0 when either sample is empty
 *   or the pooled SD is not a positive number.
 */
function cohensD(group1, group2) {
  const n1 = group1.length;
  const n2 = group2.length;
  if (!n1 || !n2) return 0;

  const m1 = mean(group1);
  const m2 = mean(group2);
  const variance1 = std(group1) ** 2;
  const variance2 = std(group2) ** 2;
  const pooledSd = Math.sqrt(
    ((n1 - 1) * variance1 + (n2 - 1) * variance2) / (n1 + n2 - 2)
  );

  // A NaN pooled SD (e.g. one observation per group) also lands here.
  if (pooledSd > 0) return (m1 - m2) / pooledSd;
  return 0;
}
64
+
65
/**
 * Welch's unequal-variance t-test.
 *
 * @param {number[]} group1 - First sample.
 * @param {number[]} group2 - Second sample.
 * @returns {{t: number, df: number, p: number}} t statistic, degrees of
 *   freedom (Welch–Satterthwaite) and two-tailed p-value. Returns
 *   `{ t: 0, df: 0, p: 1 }` when either sample has fewer than two values, and
 *   `t: 0, p: 1` when the standard error is exactly zero.
 */
function welchTTest(group1, group2) {
  const n1 = group1.length;
  const n2 = group2.length;
  if (n1 < 2 || n2 < 2) return { t: 0, df: 0, p: 1 };

  const m1 = mean(group1);
  const m2 = mean(group2);
  const v1 = std(group1) ** 2;
  const v2 = std(group2) ** 2;
  const fallbackDf = n1 + n2 - 2;

  const se = Math.sqrt(v1 / n1 + v2 / n2);
  if (se === 0) return { t: 0, df: fallbackDf, p: 1 };

  const t = (m1 - m2) / se;
  const dfNumerator = (v1 / n1 + v2 / n2) ** 2;
  const dfDenominator = (v1 / n1) ** 2 / (n1 - 1) + (v2 / n2) ** 2 / (n2 - 1);
  const df = dfDenominator > 0 ? dfNumerator / dfDenominator : fallbackDf;

  return { t, df, p: tTestPValue(Math.abs(t), df) };
}
79
+
80
/**
 * Approximate two-tailed p-value for a t statistic.
 *
 * @param {number} t - t statistic (only its magnitude matters).
 * @param {number} df - Degrees of freedom.
 * @returns {number} 1 when df <= 0; a normal approximation when df > 30;
 *   otherwise the value of regularizedBeta(df / (df + t^2), df / 2, 0.5).
 */
function tTestPValue(t, df) {
  if (df <= 0) return 1;

  const largeSample = df > 30;
  if (largeSample) {
    // Treat the t distribution as standard normal for large df.
    return 2 * (1 - normalCDF(Math.abs(t)));
  }

  const betaArgument = df / (df + t * t);
  return regularizedBeta(betaArgument, df / 2, 0.5);
}
90
+
91
/**
 * Regularized incomplete beta function I_x(a, b).
 *
 * Evaluated with the standard continued-fraction expansion (modified Lentz
 * method). The fraction converges quickly only for x < (a + 1) / (a + b + 2),
 * so for larger x the identity I_x(a, b) = 1 - I_{1-x}(b, a) is used instead.
 *
 * @param {number} x - Upper integration limit; values outside (0, 1) clamp.
 * @param {number} a - First shape parameter (> 0).
 * @param {number} b - Second shape parameter (> 0).
 * @returns {number} I_x(a, b): 0 for x <= 0 and 1 for x >= 1.
 */
function regularizedBeta(x, a, b) {
  if (x <= 0) return 0;
  if (x >= 1) return 1;

  const TINY = 1e-30;
  const EPSILON = 1e-12;
  const MAX_ITERATIONS = 200;
  // Lentz's method divides by these terms, so keep them away from zero.
  const awayFromZero = (value) => (Math.abs(value) < TINY ? TINY : value);

  // Continued fraction for the incomplete beta integral at (xx; aa, bb).
  const continuedFraction = (xx, aa, bb) => {
    const sumAB = aa + bb;
    const aPlus = aa + 1;
    const aMinus = aa - 1;
    let c = 1;
    // The seed already contains the first odd coefficient -(a + b) x / (a + 1).
    let d = 1 / awayFromZero(1 - (sumAB * xx) / aPlus);
    let h = d;
    for (let m = 1; m <= MAX_ITERATIONS; m++) {
      const twoM = 2 * m;
      // Even step.
      let coeff = (m * (bb - m) * xx) / ((aMinus + twoM) * (aa + twoM));
      d = 1 / awayFromZero(1 + coeff * d);
      c = awayFromZero(1 + coeff / c);
      h *= d * c;
      // Odd step.
      coeff = (-(aa + m) * (sumAB + m) * xx) / ((aa + twoM) * (aPlus + twoM));
      d = 1 / awayFromZero(1 + coeff * d);
      c = awayFromZero(1 + coeff / c);
      const delta = d * c;
      h *= delta;
      if (Math.abs(delta - 1) < EPSILON) break;
    }
    return h;
  };

  const lnBeta = lnGamma(a) + lnGamma(b) - lnGamma(a + b);
  // x^a (1 - x)^b / B(a, b); symmetric under (x, a, b) -> (1 - x, b, a).
  const front = Math.exp(a * Math.log(x) + b * Math.log(1 - x) - lnBeta);

  if (x < (a + 1) / (a + b + 2)) {
    return (front * continuedFraction(x, a, b)) / a;
  }
  return 1 - (front * continuedFraction(1 - x, b, a)) / b;
}

/**
 * Natural log of the gamma function via the six-term Lanczos approximation.
 *
 * @param {number} z - Argument (> 0).
 * @returns {number} ln(Gamma(z)).
 */
function lnGamma(z) {
  const lanczos = [
    76.18009172947146, -86.50532032941677, 24.01409824083091,
    -1.231739572450155, 0.001208650973866179, -0.000005395239384953,
  ];
  let tmp = z + 5.5;
  tmp -= (z + 0.5) * Math.log(tmp);
  let series = 1.000000000190015;
  let denominator = z;
  for (const coefficient of lanczos) {
    denominator += 1;
    series += coefficient / denominator;
  }
  return -tmp + Math.log((2.5066282746310005 * series) / z);
}
133
+
134
/**
 * Standard normal cumulative distribution function, computed from a
 * five-term polynomial approximation of erf(|x| / sqrt(2)).
 *
 * @param {number} x - Standard score.
 * @returns {number} Approximate P(Z <= x).
 */
function normalCDF(x) {
  // Polynomial coefficients, highest order first, for Horner evaluation.
  const coefficients = [1.061405429, -1.453152027, 1.421413741, -0.284496736, 0.254829592];
  const scaled = Math.abs(x) / Math.SQRT2;
  const t = 1 / (1 + 0.3275911 * scaled);

  let poly = coefficients[0];
  for (let i = 1; i < coefficients.length; i++) {
    poly = poly * t + coefficients[i];
  }
  const erfApprox = 1 - poly * t * Math.exp(-scaled * scaled);

  if (x < 0) return 0.5 * (1 - erfApprox);
  return 0.5 * (1 + erfApprox);
}
143
+
144
/**
 * Pearson chi-square test of independence on a contingency table.
 *
 * @param {number[][]} observed - Observed counts, one inner array per row.
 * @returns {{chi2: number, df: number, p: number, cramersV: number}} Test
 *   statistic, degrees of freedom, p-value and Cramér's V. An empty table or a
 *   table whose counts sum to zero yields `{ chi2: 0, df: 0, p: 1, cramersV: 0 }`.
 */
function chiSquareTest(observed) {
  const emptyResult = { chi2: 0, df: 0, p: 1, cramersV: 0 };
  // Guard: a table with no rows has no first row to read the width from.
  if (!Array.isArray(observed) || observed.length === 0) return emptyResult;

  const nRows = observed.length;
  const nCols = observed[0].length;
  const rowTotals = observed.map((row) => row.reduce((a, b) => a + b, 0));
  const colTotals = Array.from({ length: nCols }, (_, j) =>
    observed.reduce((sum, row) => sum + row[j], 0)
  );
  const grand = rowTotals.reduce((a, b) => a + b, 0);
  if (grand === 0) return emptyResult;

  let chi2 = 0;
  for (let i = 0; i < nRows; i++) {
    for (let j = 0; j < nCols; j++) {
      const expected = (rowTotals[i] * colTotals[j]) / grand;
      // Cells with zero expected count contribute nothing (avoids 0/0).
      if (expected > 0) {
        chi2 += (observed[i][j] - expected) ** 2 / expected;
      }
    }
  }

  const df = (nRows - 1) * (nCols - 1);
  const k = Math.min(nRows, nCols);
  const cramersV = grand > 0 && k > 1 ? Math.sqrt(chi2 / (grand * (k - 1))) : 0;
  const p = chi2PValue(chi2, df);
  return { chi2, df, p, cramersV };
}
170
+
171
/**
 * Approximate upper-tail p-value of a chi-square statistic.
 *
 * @param {number} x - Chi-square statistic.
 * @param {number} df - Degrees of freedom.
 * @returns {number} 1 when x or df is not positive; a cube-root normal
 *   approximation for df > 2; closed forms for df of exactly 1 or 2; and 1
 *   for any other df.
 */
function chi2PValue(x, df) {
  if (df <= 0 || x <= 0) return 1;

  if (df > 2) {
    // Cube-root transform of x / df, standardised and compared to a normal.
    const center = 1 - 2 / (9 * df);
    const scale = Math.sqrt(2 / (9 * df));
    const z = (Math.pow(x / df, 1 / 3) - center) / scale;
    return 1 - normalCDF(z);
  }

  switch (df) {
    case 1:
      return 2 * (1 - normalCDF(Math.sqrt(x)));
    case 2:
      return Math.exp(-x / 2);
    default:
      return 1;
  }
}
182
+
183
/**
 * Pearson correlation coefficient with a two-tailed significance test.
 *
 * @param {number[]} xs - First variable.
 * @param {number[]} ys - Second variable, read at the same indices as xs.
 * @returns {{r: number, p: number}} Correlation and p-value; `{ r: 0, p: 1 }`
 *   when there are fewer than three points or either variable is constant.
 */
function pearsonR(xs, ys) {
  const noCorrelation = () => ({ r: 0, p: 1 });
  if (xs.length < 3) return noCorrelation();

  const n = xs.length;
  const meanX = mean(xs);
  const meanY = mean(ys);

  let crossSum = 0;
  let sumSqX = 0;
  let sumSqY = 0;
  for (let i = 0; i < n; i++) {
    const devX = xs[i] - meanX;
    const devY = ys[i] - meanY;
    crossSum += devX * devY;
    sumSqX += devX * devX;
    sumSqY += devY * devY;
  }

  const denom = Math.sqrt(sumSqX * sumSqY);
  if (denom === 0) return noCorrelation();

  const r = crossSum / denom;
  // t statistic for H0: rho = 0 with n - 2 degrees of freedom.
  const t = r * Math.sqrt((n - 2) / (1 - r * r));
  return { r, p: tTestPValue(Math.abs(t), n - 2) };
}
201
+
202
+ // ── Model Calls ──────────────────────────────────────────────────────────
203
+
204
/**
 * Sends a prompt to the selected model backend.
 *
 * @param {string} prompt - Prompt text.
 * @param {string} modelKey - 'claude-code' for the local CLI; any other key
 *   is forwarded to callOpenRouter.
 * @returns {Promise<string>} The model's text response.
 */
async function callModel(prompt, modelKey) {
  const useLocalCli = modelKey === 'claude-code';
  return useLocalCli ? callClaudeCode(prompt) : callOpenRouter(prompt, modelKey);
}
208
+
209
/**
 * Runs the `claude` CLI in print mode, feeding the prompt on stdin.
 *
 * @param {string} prompt - Prompt text written to the child's stdin.
 * @returns {Promise<string>} Trimmed stdout of the CLI.
 * @throws {Error} If the process cannot be spawned, stdin cannot be written,
 *   or the CLI exits with a non-zero code (message is stderr, else stdout,
 *   else the exit code).
 */
async function callClaudeCode(prompt) {
  const env = { ...process.env };
  // NOTE(review): presumably removed so the CLI uses its own login instead of
  // an API key from the environment — confirm.
  delete env.ANTHROPIC_API_KEY;

  const stdout = await new Promise((resolve, reject) => {
    const child = spawn('claude', ['-p', '-', '--output-format', 'text'], {
      stdio: ['pipe', 'pipe', 'pipe'],
      env,
    });

    // Decode as UTF-8 at the stream level so a multi-byte character split
    // across two chunks is not corrupted by per-chunk Buffer-to-string coercion.
    child.stdout.setEncoding('utf8');
    child.stderr.setEncoding('utf8');

    let out = '';
    let err = '';
    child.stdout.on('data', (chunk) => { out += chunk; });
    child.stderr.on('data', (chunk) => { err += chunk; });

    child.on('error', (e) => reject(new Error(`Failed to spawn claude: ${e.message}`)));
    child.on('close', (code) => {
      if (code !== 0) reject(new Error(err || out || `claude exited with code ${code}`));
      else resolve(out);
    });

    // Without a listener, an EPIPE (child exited before reading the prompt)
    // would surface as an uncaught exception instead of a rejection.
    child.stdin.on('error', (e) => reject(new Error(`Failed to write prompt to claude: ${e.message}`)));
    child.stdin.end(prompt);
  });

  return stdout.trim();
}
231
+
232
/**
 * Sends a single-message chat completion request to OpenRouter.
 *
 * @param {string} prompt - User message content.
 * @param {string} modelKey - Key into MODEL_MAP.
 * @returns {Promise<string>} Content of the first choice's message.
 * @throws {Error} If OPENROUTER_API_KEY is unset, the model key is unknown,
 *   the HTTP status is not OK, the response has no content, or the request
 *   (including reading the response body) exceeds the 120 s timeout.
 */
async function callOpenRouter(prompt, modelKey) {
  const apiKey = process.env.OPENROUTER_API_KEY;
  if (!apiKey) throw new Error('OPENROUTER_API_KEY not set');
  const model = MODEL_MAP[modelKey];
  if (!model) throw new Error(`Unknown model: ${modelKey}`);

  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), 120000);
  try {
    const res = await fetch('https://openrouter.ai/api/v1/chat/completions', {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'Authorization': `Bearer ${apiKey}`,
      },
      body: JSON.stringify({
        model,
        max_tokens: 2000,
        temperature: 0.1,
        include_reasoning: false,
        response_format: { type: 'json_object' },
        messages: [{ role: 'user', content: prompt }],
      }),
      signal: controller.signal,
    });

    // The abort timer stays armed while the body is read, so a response that
    // stalls after the headers arrive is still cut off.
    if (!res.ok) {
      const body = await res.text();
      throw new Error(`OpenRouter ${res.status}: ${body.slice(0, 200)}`);
    }
    const data = await res.json();
    const content = data.choices?.[0]?.message?.content;
    if (!content) throw new Error('No content in response');
    return content;
  } finally {
    clearTimeout(timeout);
  }
}
271
+
272
/**
 * Parses a model response as JSON, tolerating common wrappers.
 *
 * Attempts, in order: the raw content; the body of the first ``` / ```json
 * fenced block; the span from the first `{` to the last `}` (for responses
 * that surround the object with prose).
 *
 * @param {string} content - Raw model output.
 * @returns {*} The parsed JSON value.
 * @throws {Error} "Failed to parse JSON: …" (first 300 characters of the
 *   content) when no attempt succeeds.
 */
function parseJsonResponse(content) {
  try {
    return JSON.parse(content);
  } catch {
    // Not bare JSON; fall through to the extraction attempts below.
  }

  const text = typeof content === 'string' ? content : String(content);
  const candidates = [];

  const fenced = text.match(/```(?:json)?\s*([\s\S]*?)```/);
  if (fenced) candidates.push(fenced[1].trim());

  const firstBrace = text.indexOf('{');
  const lastBrace = text.lastIndexOf('}');
  if (firstBrace !== -1 && lastBrace > firstBrace) {
    candidates.push(text.slice(firstBrace, lastBrace + 1));
  }

  for (const candidate of candidates) {
    try {
      return JSON.parse(candidate);
    } catch {
      // Try the next candidate; the descriptive error is raised after the loop.
    }
  }

  throw new Error(`Failed to parse JSON: ${text.slice(0, 300)}`);
}
281
+
282
+ // ── Data Loading ─────────────────────────────────────────────────────────
283
+
284
/**
 * Fetches the successful evaluation rows of a run that have a dialogue ID.
 *
 * @param {object} db - Database handle exposing `prepare(sql).all(...params)`.
 * @param {string} runId - Run to load.
 * @returns {object[]} Matching rows ordered by profile_name, then id.
 */
function loadRows(db, runId) {
  const statement = db.prepare(`
    SELECT id, dialogue_id, scenario_id, profile_name, overall_score,
           dialogue_rounds, suggestions
    FROM evaluation_results
    WHERE run_id = ? AND success = 1 AND dialogue_id IS NOT NULL
    ORDER BY profile_name, id
  `);
  const rows = statement.all(runId);
  return rows;
}
293
+
294
/**
 * Derives the experimental condition and persona from a profile name by
 * substring matching.
 *
 * @param {string} profileName - Profile name from the evaluation row.
 * @returns {{condition: string, persona: string}} `condition` is
 *   'recognition' when the name contains 'recog', else 'base'; `persona` is
 *   the first of 'suspicious', 'adversary', 'advocate' found in the name,
 *   else 'unknown'.
 */
function parseCondition(profileName) {
  const condition = profileName.includes('recog') ? 'recognition' : 'base';
  // Checked in this order; the first match wins.
  const knownPersonas = ['suspicious', 'adversary', 'advocate'];
  const persona = knownPersonas.find((name) => profileName.includes(name)) ?? 'unknown';
  return { condition, persona };
}
305
+
306
/**
 * Reads a dialogue log from `logs/tutor-dialogues/<dialogueId>.json` under
 * the current working directory.
 *
 * @param {string} dialogueId - Dialogue identifier (file name without .json).
 * @returns {object|null} Parsed log, or null when the file does not exist.
 */
function loadDialogueLog(dialogueId) {
  const fileName = `${dialogueId}.json`;
  const logPath = path.join(process.cwd(), 'logs', 'tutor-dialogues', fileName);
  if (fs.existsSync(logPath)) {
    const raw = fs.readFileSync(logPath, 'utf8');
    return JSON.parse(raw);
  }
  return null;
}
311
+
312
+ // ── Tier 1: Structural Metrics ───────────────────────────────────────────
313
+
314
/**
 * Splits a dialogueTrace into external turns. Every `final_output` entry
 * closes a turn; the entries seen since the previous `final_output` belong to
 * it. A trace without any `final_output` becomes one turn with index 0.
 * Entries that follow the last `final_output` are not assigned to any turn.
 *
 * @param {object[]} dialogueTrace - Trace entries in chronological order.
 * @returns {object[]} One record per turn with `turnIndex`, `superegoEntries`,
 *   `egoEntries`, `learnerAction` (first `turn_action` entry), `contextInput`
 *   (first `context_input` entry), `allEntries` and `finalOutput`.
 */
function segmentTraceByTurn(dialogueTrace) {
  const segments = [];
  let pending = [];

  for (const entry of dialogueTrace) {
    if (entry.action !== 'final_output') {
      pending.push(entry);
      continue;
    }
    segments.push({
      // Fall back to the positional index when the entry carries none.
      turnIndex: entry.turnIndex ?? segments.length,
      entries: pending,
      finalOutput: entry,
    });
    pending = [];
  }

  // No final_output markers at all: the whole trace is a single turn.
  if (segments.length === 0 && pending.length > 0) {
    segments.push({ turnIndex: 0, entries: pending, finalOutput: null });
  }

  const byAgent = (entries, agent) => entries.filter((e) => e.agent === agent);
  const firstWithAction = (entries, action) => entries.find((e) => e.action === action);

  return segments.map(({ turnIndex, entries, finalOutput }) => ({
    turnIndex,
    superegoEntries: byAgent(entries, 'superego'),
    egoEntries: byAgent(entries, 'ego'),
    learnerAction: firstWithAction(entries, 'turn_action'),
    contextInput: firstWithAction(entries, 'context_input'),
    allEntries: entries,
    finalOutput,
  }));
}
362
+
363
/**
 * Computes the Tier 1 (structural) metrics of one dialogue from its
 * segmented turns; no model calls are made.
 *
 * @param {object} dialogueLog - Dialogue log; only
 *   `transformationAnalysis.markerAnalysis` is read from it.
 * @param {object[]} turns - Turn records shaped like the output of
 *   segmentTraceByTurn.
 * @returns {{totalTurns: number, perTurn: object[], aggregate: object}}
 *   Per-turn counts (rejections, approvals, rounds, confidences, intervention
 *   types, suggestion shifts, feedback lengths) and their dialogue-level
 *   aggregates.
 */
function extractStructuralMetrics(dialogueLog, turns) {
  const perTurn = turns.map((turn) => {
    const { superegoEntries, egoEntries } = turn;

    const confidences = superegoEntries
      .map((review) => review.confidence)
      .filter((value) => value != null);

    // Tally of superego intervention types within this turn.
    const interventionTypes = {};
    for (const review of superegoEntries) {
      const label = review.interventionType || 'unknown';
      interventionTypes[label] = (interventionTypes[label] || 0) + 1;
    }

    // One "actionType:actionTarget" key per ego entry that has a first
    // suggestion; consecutive differences count as shifts across revisions.
    const suggestionKeys = egoEntries
      .filter((e) => e.suggestions && e.suggestions[0])
      .map((e) => `${e.suggestions[0].actionType}:${e.suggestions[0].actionTarget}`);

    return {
      turnIndex: turn.turnIndex,
      negationDepth: superegoEntries.filter((e) => e.approved === false).length,
      approvalCount: superegoEntries.filter((e) => e.approved === true).length,
      // Number of superego reviews recorded before the turn's final_output.
      roundsToConverge: superegoEntries.length,
      confidences,
      meanConfidence: confidences.length > 0 ? mean(confidences) : null,
      interventionTypes,
      suggestionTypeShifts: countShifts(suggestionKeys),
      learnerAction: turn.learnerAction?.detail || null,
      superegoFeedbackLengths: superegoEntries
        .map((e) => (e.feedback || '').length)
        .filter((length) => length > 0),
    };
  });

  const negationDepths = perTurn.map((t) => t.negationDepth);
  const roundsPerTurn = perTurn.map((t) => t.roundsToConverge);
  const everyConfidence = perTurn.flatMap((t) => t.confidences);
  const everyFeedbackLength = perTurn.flatMap((t) => t.superegoFeedbackLengths);

  // Confidence trajectory needs two distinct turns: last minus first.
  const firstTurnConf = perTurn[0]?.meanConfidence;
  const lastTurnConf = perTurn.length > 1
    ? perTurn[perTurn.length - 1]?.meanConfidence
    : null;
  const confidenceTrajectory = firstTurnConf != null && lastTurnConf != null
    ? lastTurnConf - firstTurnConf
    : null;

  // Intervention tallies summed over all turns.
  const totalInterventions = {};
  for (const turnMetrics of perTurn) {
    for (const [type, count] of Object.entries(turnMetrics.interventionTypes)) {
      totalInterventions[type] = (totalInterventions[type] || 0) + count;
    }
  }

  // Negative values mean the last turn needed fewer rounds than the first.
  const convergenceTrajectory = roundsPerTurn.length > 1
    ? roundsPerTurn[roundsPerTurn.length - 1] - roundsPerTurn[0]
    : 0;

  const metrics = {
    totalTurns: turns.length,
    perTurn,
    aggregate: {
      meanNegationDepth: mean(negationDepths),
      totalNegations: negationDepths.reduce((a, b) => a + b, 0),
      meanRoundsToConverge: mean(roundsPerTurn),
      sdRoundsToConverge: std(roundsPerTurn),
      convergenceTrajectory,
      meanConfidence: everyConfidence.length > 0 ? mean(everyConfidence) : null,
      confidenceTrajectory,
      totalInterventions,
      meanFeedbackLength: everyFeedbackLength.length > 0 ? mean(everyFeedbackLength) : 0,
      learnerActionSequence: perTurn.map((t) => t.learnerAction).filter(Boolean),
    },
  };

  // Carry over the log's marker analysis when it is present.
  const markerAnalysis = dialogueLog.transformationAnalysis?.markerAnalysis;
  if (markerAnalysis) {
    metrics.aggregate.incorporationRate = markerAnalysis;
  }

  return metrics;
}
471
+
472
/**
 * Counts positions where an element differs (by !==) from its predecessor.
 *
 * @param {Array} sequence - Ordered values.
 * @returns {number} Number of adjacent changes; 0 for fewer than two items.
 */
function countShifts(sequence) {
  let changes = 0;
  let index = sequence.length - 1;
  while (index >= 1) {
    if (sequence[index] !== sequence[index - 1]) changes += 1;
    index -= 1;
  }
  return changes;
}
479
+
480
+ // ── Tier 2: LLM-Coded Metrics ───────────────────────────────────────────
481
+
482
/**
 * Builds the stance-reversal coding prompt from consecutive turn pairs.
 *
 * A pair is included only when both turns have superego feedback; each side's
 * joined feedback is truncated to 600 characters.
 *
 * @param {object[]} turns - Turn records with `turnIndex` and `superegoEntries`.
 * @returns {string|null} Prompt text, or null when no pair qualifies.
 */
function buildStanceReversalPrompt(turns) {
  const joinFeedback = (turn) => turn.superegoEntries
    .map((e) => e.feedback)
    .filter(Boolean)
    .join('\n');

  const pairs = [];
  turns.slice(1).forEach((later, i) => {
    const earlier = turns[i];
    const feedbackA = joinFeedback(earlier);
    const feedbackB = joinFeedback(later);
    if (!feedbackA || !feedbackB) return;
    pairs.push({
      turnA: earlier.turnIndex,
      turnB: later.turnIndex,
      feedbackA: feedbackA.slice(0, 600),
      feedbackB: feedbackB.slice(0, 600),
    });
  });

  if (pairs.length === 0) return null;

  const pairsText = pairs
    .map((p, i) => [
      `### Pair ${i + 1} (Turn ${p.turnA} → Turn ${p.turnB})`,
      `**Turn ${p.turnA} superego feedback:**`,
      p.feedbackA,
      '',
      `**Turn ${p.turnB} superego feedback:**`,
      p.feedbackB,
    ].join('\n'))
    .join('\n\n');

  return `You are analyzing ego-superego dialogue traces from an AI tutoring system. The superego reviews and critiques the ego's suggestions across multiple turns of a tutoring conversation.

## Task: Detect Stance Reversals

For each consecutive pair of superego feedback, determine whether the superego's evaluative stance REVERSED between turns — that is, whether it contradicted or substantially changed its position on what matters in the tutor's response.

A stance reversal is NOT just giving different feedback about different content. It means the superego's priorities, values, or evaluative criteria shifted (e.g., first prioritizing emotional validation, then deprioritizing it; first rejecting a pedagogical approach, then endorsing a similar one).

${pairsText}

## Output Format

Return a JSON object:
{
  "pairs": [
    {
      "turnA": <number>,
      "turnB": <number>,
      "reversed": true|false,
      "directionA": "brief description of superego's stance in turn A (max 15 words)",
      "directionB": "brief description of superego's stance in turn B (max 15 words)",
      "reversal_type": "priority_shift|criteria_change|contradiction|none"
    }
  ],
  "total_reversals": <number>
}

Return ONLY the JSON object.`;
}
539
+
540
/**
 * Builds the cross-turn memory coding prompt.
 *
 * Each turn after the first that has superego feedback is listed with its
 * feedback (truncated to 600 characters) and a 200-character summary of every
 * earlier turn's feedback.
 *
 * @param {object[]} turns - Turn records with `turnIndex` and `superegoEntries`.
 * @returns {string|null} Prompt text, or null when no turn qualifies.
 */
function buildCrossTurnMemoryPrompt(turns) {
  const joinFeedback = (turn, separator) => turn.superegoEntries
    .map((e) => e.feedback)
    .filter(Boolean)
    .join(separator);

  // One summary line per turn; turn i is shown the lines of turns 0..i-1.
  const summaryLines = turns.map((t) => {
    const condensed = joinFeedback(t, '; ').slice(0, 200);
    return `Turn ${t.turnIndex}: ${condensed || '(no feedback)'}`;
  });

  const turnData = [];
  turns.forEach((turn, i) => {
    const feedback = joinFeedback(turn, '\n');
    const hasPriorTurns = i > 0;
    if (!feedback || !hasPriorTurns) return;
    turnData.push({
      turnIndex: turn.turnIndex,
      feedback: feedback.slice(0, 600),
      priorSummaries: summaryLines.slice(0, i),
    });
  });

  if (turnData.length === 0) return null;

  const turnText = turnData
    .map((t) => [
      `### Turn ${t.turnIndex}`,
      '**Prior turns:**',
      t.priorSummaries.join('\n'),
      '',
      '**Current superego feedback:**',
      t.feedback,
    ].join('\n'))
    .join('\n\n');

  return `You are analyzing ego-superego dialogue traces from an AI tutoring system. The superego reviews the ego's suggestions across multiple external turns.

## Task: Detect Cross-Turn Memory References

For each turn's superego feedback, determine whether it explicitly or implicitly references feedback, decisions, or content from prior turns. This measures whether the superego maintains coherent memory across the dialogue.

Types of references:
- **explicit_reference**: Directly mentions a prior turn's decision or content
- **implicit_callback**: Uses phrasing or criteria that echo prior feedback
- **escalation**: Builds on prior criticism (e.g., "still not addressing...")
- **reversal_acknowledgment**: Notes a change from prior approach

${turnText}

## Output Format

Return a JSON object:
{
  "turns": [
    {
      "turnIndex": <number>,
      "references_prior": true|false,
      "reference_types": ["explicit_reference"|"implicit_callback"|"escalation"|"reversal_acknowledgment"],
      "evidence": "brief quote or description (max 20 words)"
    }
  ],
  "total_references": <number>,
  "memory_rate": <0-1 fraction of turns with cross-turn references>
}

Return ONLY the JSON object.`;
}
604
+
605
/**
 * Builds the hallucination-correction coding prompt from superego rejections.
 *
 * A rejection is a superego entry with `approved === false` and non-empty
 * feedback. It is paired with the first suggestion message of the entry
 * directly before it in the turn and with the turn's raw context (each
 * truncated to 400 characters). At most the first six rejections are included.
 *
 * @param {object[]} turns - Turn records with `turnIndex`, `contextInput`,
 *   `superegoEntries` and `allEntries`.
 * @returns {string|null} Prompt text, or null when there are no rejections.
 */
function buildHallucinationCorrectionPrompt(turns) {
  const rejections = turns.flatMap((turn) => {
    const context = turn.contextInput?.rawContext?.slice(0, 400) || '(no context)';
    return turn.superegoEntries
      .filter((review) => review.approved === false && review.feedback)
      .map((review) => {
        // The rejected suggestion is taken from the entry immediately before
        // the superego review in the turn's entry list.
        const position = turn.allEntries.indexOf(review);
        const preceding = position > 0 ? turn.allEntries[position - 1] : null;
        const egoSuggestion = preceding?.suggestions?.[0]?.message?.slice(0, 400) || '(no suggestion)';
        return {
          turnIndex: turn.turnIndex,
          round: review.round,
          egoSuggestion,
          superegoFeedback: review.feedback.slice(0, 400),
          context,
        };
      });
  });

  if (rejections.length === 0) return null;

  // Cap at six rejections to keep the prompt manageable.
  const rejectText = rejections
    .slice(0, 6)
    .map((r, i) => [
      `### Rejection ${i + 1} (Turn ${r.turnIndex}, Round ${r.round})`,
      `**Context:** ${r.context}`,
      `**Ego suggestion:** ${r.egoSuggestion}`,
      `**Superego rejection:** ${r.superegoFeedback}`,
    ].join('\n'))
    .join('\n\n');

  return `You are analyzing ego-superego dialogue traces from an AI tutoring system. The superego sometimes rejects the ego's suggestions.

## Task: Detect Hallucination Corrections

For each rejection, determine whether the superego is correcting a "hallucination" — a case where the ego fabricated, misrepresented, or ignored factual information from the learner context. Types of hallucination:
- **context_fabrication**: Ego claims learner said/did something not in the context
- **context_omission**: Ego ignores explicit learner signals present in context
- **metric_misuse**: Ego references metrics (struggle signals, sessions) inaccurately
- **repetition_blindness**: Ego repeats a suggestion that already failed in a prior turn

Not all rejections are hallucination corrections — the superego may reject for tone, pedagogy, or framing reasons without detecting hallucination. Code only genuine factual corrections.

${rejectText}

## Output Format

Return a JSON object:
{
  "rejections": [
    {
      "turnIndex": <number>,
      "round": <number>,
      "hallucination_detected": true|false,
      "types": ["context_fabrication"|"context_omission"|"metric_misuse"|"repetition_blindness"],
      "description": "brief description (max 20 words)"
    }
  ],
  "total_hallucinations": <number>,
  "hallucination_rate": <0-1 fraction of rejections containing hallucination>
}

Return ONLY the JSON object.`;
}
669
+
670
/**
 * Builds the phase-transition coding prompt from the per-turn sequence of
 * learner message, learner action and superego feedback.
 *
 * @param {object[]} turns - Turn records with `turnIndex`, `learnerAction`,
 *   `contextInput` and `superegoEntries`.
 * @returns {string|null} Prompt text, or null with fewer than two turns.
 */
function buildPhaseTransitionPrompt(turns) {
  const sequence = turns.map((turn) => {
    // Prefer the learner action's summary, then the raw context, then a stub.
    const learnerMsg = turn.learnerAction?.contextSummary
      || turn.contextInput?.rawContext?.slice(0, 200)
      || '(initial turn)';
    const stance = turn.superegoEntries
      .filter((e) => e.feedback)
      .map((e) => e.feedback.slice(0, 200))
      .join(' | ');

    return {
      turnIndex: turn.turnIndex,
      learnerMessage: typeof learnerMsg === 'string' ? learnerMsg.slice(0, 300) : '(no message)',
      learnerAction: turn.learnerAction?.detail || 'initial',
      superegoStance: stance.slice(0, 400) || '(no superego feedback)',
    };
  });

  if (sequence.length < 2) return null;

  const seqText = sequence
    .map((s) => [
      `### Turn ${s.turnIndex}`,
      `**Learner** [${s.learnerAction}]: ${s.learnerMessage}`,
      `**Superego stance:** ${s.superegoStance}`,
    ].join('\n'))
    .join('\n\n');

  return `You are analyzing multi-turn ego-superego tutoring dialogues. The learner interacts with a tutor over multiple turns, and the superego reviews each of the tutor's responses.

## Task: Detect Phase Transitions

A phase transition occurs when the dialogue qualitatively shifts — the learner's engagement mode changes, the superego's evaluative priorities pivot, or the ego-superego dynamic fundamentally reorganizes. Types:
- **learner_mode_shift**: Learner moves from confusion to engagement, resistance to curiosity, etc.
- **superego_priority_pivot**: Superego shifts primary concern (e.g., from tone to content accuracy)
- **negotiation_pattern_change**: Ego-superego dynamic changes (e.g., from adversarial to cooperative)
- **pedagogical_escalation**: Tutor approach fundamentally changes strategy (review → practice, explain → scaffold)

## Dialogue Sequence

${seqText}

## Output Format

Return a JSON object:
{
  "transitions": [
    {
      "between_turns": [<turnA>, <turnB>],
      "shift_type": "learner_mode_shift|superego_priority_pivot|negotiation_pattern_change|pedagogical_escalation",
      "description": "brief description (max 20 words)",
      "superego_adapts": true|false
    }
  ],
  "total_transitions": <number>,
  "transition_density": <transitions per inter-turn gap>
}

Return ONLY the JSON object.`;
}
729
+
730
/**
 * Runs the four Tier 2 coding prompts for one dialogue, one after another.
 *
 * @param {object[]} turns - Segmented turn records.
 * @param {string} modelKey - Model alias passed to callModel.
 * @returns {Promise<object>} One entry per metric (`stanceReversal`,
 *   `crossTurnMemory`, `hallucinationCorrection`, `phaseTransition`) holding
 *   the parsed model output, `{ skipped: true, reason }` when the builder
 *   returned no prompt, or `{ error }` when the call or parse failed; plus an
 *   `errors` list of `{ metric, error }` records.
 */
async function extractLLMCodedMetrics(turns, modelKey) {
  const builders = {
    stanceReversal: buildStanceReversalPrompt,
    crossTurnMemory: buildCrossTurnMemoryPrompt,
    hallucinationCorrection: buildHallucinationCorrectionPrompt,
    phaseTransition: buildPhaseTransitionPrompt,
  };

  const llmMetrics = {
    stanceReversal: null,
    crossTurnMemory: null,
    hallucinationCorrection: null,
    phaseTransition: null,
    errors: [],
  };

  for (const [key, builder] of Object.entries(builders)) {
    const prompt = builder(turns);

    if (prompt) {
      try {
        const content = await callModel(prompt, modelKey);
        llmMetrics[key] = parseJsonResponse(content);
      } catch (err) {
        // A failed metric is recorded and does not stop the remaining ones.
        llmMetrics.errors.push({ metric: key, error: err.message });
        llmMetrics[key] = { error: err.message };
      }
    } else {
      llmMetrics[key] = { skipped: true, reason: 'insufficient data' };
    }
  }

  return llmMetrics;
}
764
+
765
+ // ── Profile Assembly ─────────────────────────────────────────────────────
766
+
767
/**
 * Assembles the per-dialogue modulation profile record.
 *
 * @param {object} row - Result row; `id`, `dialogue_id`, `scenario_id`,
 *   `profile_name`, `overall_score` and `dialogue_rounds` are read.
 * @param {object} dialogueLog - Accepted for signature compatibility; not read here.
 * @param {object} structural - Structural metrics, stored as-is.
 * @param {object|null} llmCoded - LLM-coded metrics; any falsy value is stored as `null`.
 * @returns {object} Profile combining row identifiers, the condition/persona
 *   derived from `row.profile_name` via `parseCondition`, and both metric tiers.
 */
function buildModulationProfile(row, dialogueLog, structural, llmCoded) {
  const parsed = parseCondition(row.profile_name);

  const profile = {
    id: row.id,
    dialogueId: row.dialogue_id,
    scenarioId: row.scenario_id,
    profileName: row.profile_name,
    condition: parsed.condition,
    persona: parsed.persona,
    overallScore: row.overall_score,
    dialogueRounds: row.dialogue_rounds,
    structural,
    llmCoded: llmCoded ? llmCoded : null,
  };

  return profile;
}
782
+
783
+ // ── Aggregate Analysis ───────────────────────────────────────────────────
784
+
785
/**
 * Aggregates per-dialogue modulation profiles into base-vs-recognition statistics.
 *
 * Produces:
 *  - `byCondition` / `byPersona` / `byCell`: the profiles grouped three ways;
 *  - `structural`: for each structural metric a base/recognition summary with
 *    Cohen's d and a Welch test plus a per-persona breakdown, followed by the
 *    categorical `interventionDistribution`, an optional `interventionChiSquare`
 *    and `learnerActionDistribution`;
 *  - `llmCoded`: the same base/recognition comparison for each LLM-coded metric,
 *    computed whenever at least one profile carries LLM-coded data;
 *  - `correlations`: Pearson correlations of three structural metrics with
 *    `overallScore`, computed only when at least 5 profiles have a score.
 *
 * Grouping pushes into the pre-created `base` / `recognition` buckets only, so
 * every profile's `condition` must be one of those two values.
 *
 * @param {Array<object>} profiles - Profiles as built by `buildModulationProfile`.
 * @returns {object} The aggregate analysis described above.
 */
function analyzeAggregateResults(profiles) {
  const analysis = {
    n: profiles.length,
    byCondition: { base: [], recognition: [] },
    byPersona: {},
    byCell: {},
    structural: {},
    llmCoded: {},
    correlations: {},
  };

  const isPresent = (v) => v != null;

  // Descriptive summary of one sample.
  const summarize = (values) => ({ n: values.length, mean: mean(values), sd: std(values) });

  // Base-vs-recognition comparison of whatever `extract` pulls from a profile.
  const compareConditions = (extract) => {
    const baseVals = analysis.byCondition.base.map(extract).filter(isPresent);
    const recogVals = analysis.byCondition.recognition.map(extract).filter(isPresent);
    return {
      base: summarize(baseVals),
      recognition: summarize(recogVals),
      d: cohensD(recogVals, baseVals),
      welch: welchTTest(recogVals, baseVals),
    };
  };

  // Group profiles
  for (const p of profiles) {
    analysis.byCondition[p.condition].push(p);
    if (!analysis.byPersona[p.persona]) analysis.byPersona[p.persona] = [];
    analysis.byPersona[p.persona].push(p);
    const cellKey = `${p.condition}_${p.persona}`;
    if (!analysis.byCell[cellKey]) analysis.byCell[cellKey] = [];
    analysis.byCell[cellKey].push(p);
  }

  // ── Structural Metric Comparisons ──────────────────────────────────

  const structuralMetrics = [
    { key: 'meanNegationDepth', label: 'Mean Negation Depth', extract: p => p.structural.aggregate.meanNegationDepth },
    { key: 'totalNegations', label: 'Total Negations', extract: p => p.structural.aggregate.totalNegations },
    { key: 'meanRoundsToConverge', label: 'Mean Rounds to Converge', extract: p => p.structural.aggregate.meanRoundsToConverge },
    { key: 'convergenceTrajectory', label: 'Convergence Trajectory', extract: p => p.structural.aggregate.convergenceTrajectory },
    { key: 'meanConfidence', label: 'Mean Superego Confidence', extract: p => p.structural.aggregate.meanConfidence },
    { key: 'confidenceTrajectory', label: 'Confidence Trajectory', extract: p => p.structural.aggregate.confidenceTrajectory },
    { key: 'meanFeedbackLength', label: 'Mean Feedback Length', extract: p => p.structural.aggregate.meanFeedbackLength },
  ];

  for (const metric of structuralMetrics) {
    const entry = { label: metric.label, ...compareConditions(metric.extract) };

    // Per-persona breakdown (effect size only, no Welch test)
    const byPersona = {};
    for (const [persona, personaProfiles] of Object.entries(analysis.byPersona)) {
      const baseP = personaProfiles.filter(p => p.condition === 'base').map(metric.extract).filter(isPresent);
      const recogP = personaProfiles.filter(p => p.condition === 'recognition').map(metric.extract).filter(isPresent);
      byPersona[persona] = {
        base: summarize(baseP),
        recognition: summarize(recogP),
        d: cohensD(recogP, baseP),
      };
    }
    entry.byPersona = byPersona;
    analysis.structural[metric.key] = entry;
  }

  // ── Intervention Type Distribution (Categorical) ────────────────────

  const interventionCounts = { base: {}, recognition: {} };
  for (const p of profiles) {
    const bucket = interventionCounts[p.condition];
    for (const [type, count] of Object.entries(p.structural.aggregate.totalInterventions)) {
      bucket[type] = (bucket[type] || 0) + count;
    }
  }
  analysis.structural.interventionDistribution = interventionCounts;

  // Chi-square on intervention types (needs at least two categories)
  const allIntTypes = [...new Set([
    ...Object.keys(interventionCounts.base),
    ...Object.keys(interventionCounts.recognition),
  ])];
  if (allIntTypes.length > 1) {
    const observed = allIntTypes.map(t => [
      interventionCounts.base[t] || 0,
      interventionCounts.recognition[t] || 0,
    ]);
    analysis.structural.interventionChiSquare = {
      ...chiSquareTest(observed),
      types: allIntTypes,
    };
  }

  // ── Learner Action Trajectory (Categorical) ─────────────────────────

  const learnerActionCounts = { base: {}, recognition: {} };
  for (const p of profiles) {
    const bucket = learnerActionCounts[p.condition];
    for (const action of p.structural.aggregate.learnerActionSequence) {
      const normalized = action.replace(/^Learner:\s*/, '');
      bucket[normalized] = (bucket[normalized] || 0) + 1;
    }
  }
  analysis.structural.learnerActionDistribution = learnerActionCounts;

  // ── LLM-Coded Metric Aggregations ──────────────────────────────────

  const llmMetrics = [
    ['stanceReversal', p => p.llmCoded?.stanceReversal?.total_reversals],
    ['crossTurnMemory', p => p.llmCoded?.crossTurnMemory?.memory_rate],
    ['hallucinationCorrection', p => p.llmCoded?.hallucinationCorrection?.hallucination_rate],
    ['phaseTransition', p => p.llmCoded?.phaseTransition?.transition_density],
  ];

  // Decide from the whole run, not from profiles[0]: a skipped or failed metric
  // on the first dialogue must not suppress aggregation for every other one.
  // Skipped/errored entries lack the numeric field and are filtered out per metric.
  if (profiles.some(p => p.llmCoded)) {
    for (const [key, extract] of llmMetrics) {
      analysis.llmCoded[key] = compareConditions(extract);
    }
  }

  // ── Correlations: modulation metrics vs overall_score ──────────────

  const scored = profiles.filter(p => p.overallScore != null);
  if (scored.length >= 5) {
    // Both vectors come from the same filtered list so pairs stay aligned.
    const correlateWithScore = (extract) =>
      pearsonR(scored.map(extract), scored.map(p => p.overallScore));

    analysis.correlations.negationDepth_score =
      correlateWithScore(p => p.structural.aggregate.meanNegationDepth);
    analysis.correlations.convergenceSpeed_score =
      correlateWithScore(p => p.structural.aggregate.meanRoundsToConverge);
    analysis.correlations.feedbackLength_score =
      correlateWithScore(p => p.structural.aggregate.meanFeedbackLength);
  }

  return analysis;
}
967
+
968
+ // ── Report Generation ────────────────────────────────────────────────────
969
+
970
/**
 * Renders the aggregate analysis as a Markdown report.
 *
 * Sections: 1.1 condition comparison, 1.2 per-persona breakdown, 1.3 intervention
 * types (plus chi-square when present), 1.4 learner actions, 1.5 correlations,
 * 2 LLM-coded metrics (omitted in structural-only mode or when none were
 * aggregated), 3 per-cell summary, 4 exemplar dialogues ranked by total negations.
 *
 * Missing scores are rendered as `N/A` rather than `undefined`/a fabricated mean.
 *
 * @param {Array<object>} profiles - Per-dialogue modulation profiles.
 * @param {object} analysis - Result of `analyzeAggregateResults(profiles)`.
 * @param {object} opts - CLI options; `runId`, `model` and `structuralOnly` are read.
 * @returns {string} The Markdown document.
 */
function generateReport(profiles, analysis, opts) {
  const timestamp = new Date().toISOString();
  const baseN = analysis.byCondition.base.length;
  const recogN = analysis.byCondition.recognition.length;

  // Entries of analysis.structural that are categorical tables, not numeric comparisons.
  const categoricalKeys = ['interventionDistribution', 'interventionChiSquare', 'learnerActionDistribution'];
  const formatP = (p) => (p < 0.001 ? '<.001' : p.toFixed(3));
  const formatScore = (score) => (score != null ? score.toFixed(1) : 'N/A');

  // Rows of a two-column count table over the union of categories in both conditions.
  const renderCountRows = (distribution) => {
    const baseCounts = distribution?.base || {};
    const recogCounts = distribution?.recognition || {};
    const categories = [...new Set([...Object.keys(baseCounts), ...Object.keys(recogCounts)])];
    return categories
      .map(cat => `| ${cat} | ${baseCounts[cat] || 0} | ${recogCounts[cat] || 0} |\n`)
      .join('');
  };

  let md = `# Dialectical Modulation Coding Analysis

**Generated:** ${timestamp}
**Run ID:** ${opts.runId}
**N:** ${analysis.n} dialogues (base=${baseN}, recognition=${recogN})
**Personas:** ${Object.keys(analysis.byPersona).join(', ')}
**Scenarios:** ${[...new Set(profiles.map(p => p.scenarioId))].join(', ')}
**Model:** ${opts.model}
**Mode:** ${opts.structuralOnly ? 'structural only' : 'full (structural + LLM-coded)'}

## 1. Structural Metrics (Tier 1)

### 1.1 Condition Comparison: Base vs Recognition

| Metric | Base (N=${baseN}) | Recog (N=${recogN}) | Cohen's d | Welch t | p |
|--------|-----------|-------------|-----------|---------|---|
`;

  for (const [key, data] of Object.entries(analysis.structural)) {
    if (categoricalKeys.includes(key)) continue;
    const bStr = `${data.base.mean.toFixed(2)} (${data.base.sd.toFixed(2)})`;
    const rStr = `${data.recognition.mean.toFixed(2)} (${data.recognition.sd.toFixed(2)})`;
    md += `| ${data.label} | ${bStr} | ${rStr} | ${data.d.toFixed(2)} | ${data.welch.t.toFixed(2)} | ${formatP(data.welch.p)} |\n`;
  }

  // Per-persona breakdown
  md += `\n### 1.2 Per-Persona Breakdown\n`;
  for (const [persona, pProfiles] of Object.entries(analysis.byPersona)) {
    const pBase = pProfiles.filter(p => p.condition === 'base').length;
    const pRecog = pProfiles.filter(p => p.condition === 'recognition').length;
    md += `\n#### ${persona} (base=${pBase}, recog=${pRecog})\n\n`;
    md += `| Metric | Base | Recog | d |\n|--------|------|-------|---|\n`;

    // Categorical entries carry no byPersona field and are skipped by this check.
    for (const data of Object.values(analysis.structural)) {
      if (!data.byPersona || !data.byPersona[persona]) continue;
      const bp = data.byPersona[persona];
      md += `| ${data.label} | ${bp.base.mean.toFixed(2)} | ${bp.recognition.mean.toFixed(2)} | ${bp.d.toFixed(2)} |\n`;
    }
  }

  // Intervention distribution
  md += `\n### 1.3 Intervention Type Distribution\n\n`;
  md += `| Type | Base | Recognition |\n|------|------|-------------|\n`;
  md += renderCountRows(analysis.structural.interventionDistribution);
  if (analysis.structural.interventionChiSquare) {
    const cs = analysis.structural.interventionChiSquare;
    const pStr = cs.p < 0.001 ? 'p < .001' : `p = ${cs.p.toFixed(3)}`;
    md += `\n**Chi-square:** χ²(${cs.df}) = ${cs.chi2.toFixed(2)}, ${pStr}, V = ${cs.cramersV.toFixed(3)}\n`;
  }

  // Learner action distribution
  md += `\n### 1.4 Learner Action Distribution\n\n`;
  md += `| Action | Base | Recognition |\n|--------|------|-------------|\n`;
  md += renderCountRows(analysis.structural.learnerActionDistribution);

  // Correlations
  md += `\n### 1.5 Correlations with Overall Score\n\n`;
  md += `| Modulation Metric | r | p |\n|-------------------|---|---|\n`;
  for (const [key, corr] of Object.entries(analysis.correlations)) {
    md += `| ${key.replace(/_/g, ' ')} | ${corr.r.toFixed(3)} | ${formatP(corr.p)} |\n`;
  }

  // LLM-coded metrics
  if (!opts.structuralOnly && Object.keys(analysis.llmCoded).length > 0) {
    md += `\n## 2. LLM-Coded Metrics (Tier 2)\n\n`;
    md += `| Metric | Base | Recog | Cohen's d | Welch t | p |\n`;
    md += `|--------|------|-------|-----------|---------|---|\n`;

    const llmLabels = {
      stanceReversal: 'Stance Reversals (count)',
      crossTurnMemory: 'Cross-Turn Memory Rate',
      hallucinationCorrection: 'Hallucination Rate',
      phaseTransition: 'Phase Transition Density',
    };

    for (const [key, data] of Object.entries(analysis.llmCoded)) {
      if (!data.base) continue;
      const bStr = `${data.base.mean.toFixed(3)} (${data.base.sd.toFixed(3)})`;
      const rStr = `${data.recognition.mean.toFixed(3)} (${data.recognition.sd.toFixed(3)})`;
      md += `| ${llmLabels[key] || key} | ${bStr} | ${rStr} | ${data.d.toFixed(2)} | ${data.welch.t.toFixed(2)} | ${formatP(data.welch.p)} |\n`;
    }
  }

  // Per-cell summary table
  md += `\n## 3. Per-Cell Summary\n\n`;
  md += `| Cell | N | Mean Score | Mean Neg Depth | Mean Rounds | Mean Confidence |\n`;
  md += `|------|---|------------|----------------|-------------|------------------|\n`;
  for (const [cellKey, cellProfiles] of Object.entries(analysis.byCell)) {
    const scores = cellProfiles.map(p => p.overallScore).filter(v => v != null);
    const negDepths = cellProfiles.map(p => p.structural.aggregate.meanNegationDepth);
    const rounds = cellProfiles.map(p => p.structural.aggregate.meanRoundsToConverge);
    const confs = cellProfiles.map(p => p.structural.aggregate.meanConfidence).filter(v => v != null);
    // Same guard as confidence: a cell without any scored dialogue has no mean score.
    const scoreStr = scores.length > 0 ? mean(scores).toFixed(1) : 'N/A';
    const confStr = confs.length > 0 ? mean(confs).toFixed(3) : 'N/A';
    md += `| ${cellKey} | ${cellProfiles.length} | ${scoreStr} | ${mean(negDepths).toFixed(2)} | ${mean(rounds).toFixed(2)} | ${confStr} |\n`;
  }

  // Exemplar dialogues: the extremes when ranked by total negations (descending)
  md += `\n## 4. Exemplar Dialogues\n\n`;
  const sorted = [...profiles].sort((a, b) =>
    b.structural.aggregate.totalNegations - a.structural.aggregate.totalNegations
  );
  if (sorted.length > 0) {
    const high = sorted[0];
    const low = sorted[sorted.length - 1];
    md += `**Highest negation depth** (${high.structural.aggregate.totalNegations} total negations):\n`;
    md += `- ID: ${high.id}, ${high.condition}/${high.persona}, score=${formatScore(high.overallScore)}\n`;
    md += `- Turns: ${high.structural.totalTurns}, convergence trajectory: ${high.structural.aggregate.convergenceTrajectory}\n\n`;
    md += `**Lowest negation depth** (${low.structural.aggregate.totalNegations} total negations):\n`;
    md += `- ID: ${low.id}, ${low.condition}/${low.persona}, score=${formatScore(low.overallScore)}\n`;
    md += `- Turns: ${low.structural.totalTurns}, convergence trajectory: ${low.structural.aggregate.convergenceTrajectory}\n`;
  }

  return md;
}
1108
+
1109
+ // ── CLI ──────────────────────────────────────────────────────────────────
1110
+
1111
/**
 * Parses command-line options from `process.argv`.
 *
 * Recognised flags: `--model <model>`, `--run-id <id>`, `--structural-only`
 * and `--help` (prints usage and exits with status 0). Unrecognised arguments
 * are ignored. A value-taking flag that is last on the command line, or is
 * directly followed by another `--flag`, is reported on stderr and the process
 * exits with status 1 instead of storing `undefined` or consuming the next flag.
 *
 * @returns {{model: string, runId: string, structuralOnly: boolean}} Options
 *   with defaults `model: 'claude-code'`, `runId: DEFAULT_RUN_ID`,
 *   `structuralOnly: false`.
 */
function parseArgs() {
  const args = process.argv.slice(2);
  const opts = {
    model: 'claude-code',
    runId: DEFAULT_RUN_ID,
    structuralOnly: false,
  };

  // Returns the value at `index`, or aborts when the flag has no usable value.
  const requireValue = (flag, index) => {
    const value = args[index];
    if (value === undefined || value.startsWith('--')) {
      console.error(`Missing value for ${flag}`);
      process.exit(1);
    }
    return value;
  };

  for (let i = 0; i < args.length; i++) {
    switch (args[i]) {
      case '--model':
        opts.model = requireValue('--model', ++i);
        break;
      case '--run-id':
        opts.runId = requireValue('--run-id', ++i);
        break;
      case '--structural-only':
        opts.structuralOnly = true;
        break;
      case '--help':
        console.log(`Usage: node scripts/code-dialectical-modulation.js [options]

Options:
  --model <model>    Model for LLM coding (default: claude-code)
                       claude-code — Claude Code CLI (subscription, free)
                       haiku — OpenRouter Haiku
                       sonnet — OpenRouter Sonnet
  --run-id <id>      Run ID (default: ${DEFAULT_RUN_ID})
  --structural-only  Skip LLM coding, emit only structural metrics
  --help             Show this help`);
        process.exit(0);
        break;
      default:
        break;
    }
  }
  return opts;
}
1139
+
1140
+ // ── Main ─────────────────────────────────────────────────────────────────
1141
+
1142
/**
 * CLI entry point: loads evaluation rows, codes each dialogue, aggregates the
 * results and writes a JSON export plus a Markdown report under `./exports`.
 *
 * Steps:
 *  1. Parse options and open `./data/evaluations.db` read-only (exit 1 if absent).
 *  2. Load rows for the run; for each, load its dialogue log, segment it into
 *     turns, extract structural metrics and — unless `--structural-only` —
 *     LLM-coded metrics. Dialogues without a log/trace or without turns are
 *     skipped and counted as load errors.
 *  3. Aggregate, write both output files, and print console summaries.
 *
 * The database handle is released in a `finally` block so it is closed on
 * early returns and on exceptions alike.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const opts = parseArgs();

  const dbPath = path.join(process.cwd(), 'data', 'evaluations.db');
  if (!fs.existsSync(dbPath)) {
    console.error('Database not found:', dbPath);
    process.exit(1);
  }

  const db = new Database(dbPath, { readonly: true });

  try {
    console.log('='.repeat(70));
    console.log('DIALECTICAL MODULATION CODING');
    console.log('='.repeat(70));
    console.log(`Model: ${opts.model} | Run ID: ${opts.runId} | Mode: ${opts.structuralOnly ? 'structural only' : 'full'}`);

    // Load rows
    const rows = loadRows(db, opts.runId);
    console.log(`\nLoaded ${rows.length} rows with dialogue IDs`);

    if (rows.length === 0) {
      console.error('No rows found.');
      return;
    }

    // Summary
    const condCounts = { base: 0, recognition: 0 };
    const personaCounts = {};
    for (const row of rows) {
      const { condition, persona } = parseCondition(row.profile_name);
      // `|| 0` keeps an unexpected condition from turning the tally into NaN.
      condCounts[condition] = (condCounts[condition] || 0) + 1;
      personaCounts[persona] = (personaCounts[persona] || 0) + 1;
    }
    console.log(` Base: ${condCounts.base}, Recognition: ${condCounts.recognition}`);
    console.log(` Personas: ${Object.entries(personaCounts).map(([k, v]) => `${k}=${v}`).join(', ')}`);

    // Ensure exports directory (recursive mkdir is a no-op when it already exists)
    const exportsDir = path.join(process.cwd(), 'exports');
    fs.mkdirSync(exportsDir, { recursive: true });

    const timestamp = new Date().toISOString().replace(/[:.]/g, '-').slice(0, 19);

    // Process each dialogue
    const profiles = [];
    let loadErrors = 0;
    let llmErrors = 0;
    const startTime = Date.now();

    for (let i = 0; i < rows.length; i++) {
      const row = rows[i];
      const progress = `[${i + 1}/${rows.length}]`;

      // Load dialogue log
      const dialogueLog = loadDialogueLog(row.dialogue_id);
      if (!dialogueLog || !dialogueLog.dialogueTrace) {
        process.stdout.write(` ${progress} SKIP ${row.dialogue_id}: no dialogue log\n`);
        loadErrors++;
        continue;
      }

      // Segment trace by turn
      const turns = segmentTraceByTurn(dialogueLog.dialogueTrace);
      if (turns.length === 0) {
        process.stdout.write(` ${progress} SKIP ${row.dialogue_id}: no turns found\n`);
        loadErrors++;
        continue;
      }

      // Extract structural metrics
      const structural = extractStructuralMetrics(dialogueLog, turns);

      // Extract LLM-coded metrics (if not structural-only)
      let llmCoded = null;
      if (!opts.structuralOnly) {
        process.stdout.write(` ${progress} ${row.dialogue_id} (${turns.length} turns) — LLM coding...`);
        llmCoded = await extractLLMCodedMetrics(turns, opts.model);
        if (llmCoded.errors.length > 0) {
          llmErrors += llmCoded.errors.length;
          console.log(` ${llmCoded.errors.length} errors`);
        } else {
          console.log(' done');
        }
      } else {
        process.stdout.write(` ${progress} ${row.dialogue_id} (${turns.length} turns) — structural\n`);
      }

      profiles.push(buildModulationProfile(row, dialogueLog, structural, llmCoded));
    }

    const elapsed = ((Date.now() - startTime) / 1000).toFixed(1);
    console.log(`\nProcessing complete: ${profiles.length} profiles, ${loadErrors} load errors, ${llmErrors} LLM errors, ${elapsed}s`);

    if (profiles.length === 0) {
      console.error('No profiles generated.');
      return;
    }

    // Analyze
    const analysis = analyzeAggregateResults(profiles);

    // Write outputs
    const jsonPath = path.join(exportsDir, `dialectical-modulation-${timestamp}.json`);
    fs.writeFileSync(jsonPath, JSON.stringify({
      generated: new Date().toISOString(),
      model: opts.model,
      runId: opts.runId,
      mode: opts.structuralOnly ? 'structural' : 'full',
      n: profiles.length,
      loadErrors,
      llmErrors,
      profiles,
      analysis,
    }, null, 2));
    console.log(`\nJSON: ${jsonPath}`);

    const mdReport = generateReport(profiles, analysis, opts);
    const mdPath = path.join(exportsDir, `dialectical-modulation-${timestamp}.md`);
    fs.writeFileSync(mdPath, mdReport);
    console.log(`Markdown: ${mdPath}`);

    // Print summary
    console.log('\n' + '─'.repeat(70));
    console.log('STRUCTURAL METRICS SUMMARY: Base vs Recognition');
    console.log('─'.repeat(70));
    console.log(`${'Metric'.padEnd(30)} ${'Base'.padEnd(14)} ${'Recog'.padEnd(14)} ${'d'.padEnd(8)} p`);
    console.log('─'.repeat(70));

    for (const [key, data] of Object.entries(analysis.structural)) {
      // Categorical entries have no base/recognition means to print.
      if (key === 'interventionDistribution' || key === 'interventionChiSquare' || key === 'learnerActionDistribution') continue;
      const bStr = data.base.mean.toFixed(2);
      const rStr = data.recognition.mean.toFixed(2);
      const pStr = data.welch.p < 0.001 ? '<.001' : data.welch.p.toFixed(3);
      console.log(` ${data.label.padEnd(28)} ${bStr.padEnd(14)} ${rStr.padEnd(14)} ${data.d.toFixed(2).padEnd(8)} ${pStr}`);
    }

    // Correlations
    if (Object.keys(analysis.correlations).length > 0) {
      console.log('\n' + '─'.repeat(70));
      console.log('CORRELATIONS WITH OVERALL SCORE');
      console.log('─'.repeat(70));
      for (const [key, corr] of Object.entries(analysis.correlations)) {
        const pStr = corr.p < 0.001 ? '<.001' : corr.p.toFixed(3);
        console.log(` ${key.replace(/_/g, ' ').padEnd(35)} r = ${corr.r.toFixed(3).padEnd(8)} p = ${pStr}`);
      }
    }

    // LLM summary
    if (!opts.structuralOnly && Object.keys(analysis.llmCoded).length > 0) {
      console.log('\n' + '─'.repeat(70));
      console.log('LLM-CODED METRICS SUMMARY: Base vs Recognition');
      console.log('─'.repeat(70));

      const llmLabels = {
        stanceReversal: 'Stance Reversals',
        crossTurnMemory: 'Cross-Turn Memory',
        hallucinationCorrection: 'Hallucination Rate',
        phaseTransition: 'Phase Transitions',
      };

      for (const [key, data] of Object.entries(analysis.llmCoded)) {
        if (!data.base) continue;
        const pStr = data.welch.p < 0.001 ? '<.001' : data.welch.p.toFixed(3);
        console.log(` ${(llmLabels[key] || key).padEnd(28)} base=${data.base.mean.toFixed(3).padEnd(8)} recog=${data.recognition.mean.toFixed(3).padEnd(8)} d=${data.d.toFixed(2).padEnd(8)} p=${pStr}`);
      }
    }
  } finally {
    // Single release point for the handle, reached on every exit path.
    db.close();
  }

  console.log('\nDone.');
}
1316
+
1317
// Script entry: run the pipeline; any rejection escaping main() is logged and
// turned into a non-zero exit status.
main().catch((fatal) => {
  console.error('Fatal error:', fatal);
  process.exit(1);
});