magi-ai 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (300) hide show
  1. package/LICENSE +21 -0
  2. package/README.ja.md +377 -0
  3. package/README.md +377 -0
  4. package/dist/bin/magi-benchmark.d.ts +14 -0
  5. package/dist/bin/magi-benchmark.js +93 -0
  6. package/dist/bin/magi-mcp.d.ts +8 -0
  7. package/dist/bin/magi-mcp.js +28 -0
  8. package/dist/bin/magi.d.ts +2 -0
  9. package/dist/bin/magi.js +634 -0
  10. package/dist/src/adapters/base.d.ts +34 -0
  11. package/dist/src/adapters/base.js +149 -0
  12. package/dist/src/adapters/claude.d.ts +29 -0
  13. package/dist/src/adapters/claude.js +65 -0
  14. package/dist/src/adapters/codex.d.ts +21 -0
  15. package/dist/src/adapters/codex.js +41 -0
  16. package/dist/src/adapters/gemini.d.ts +18 -0
  17. package/dist/src/adapters/gemini.js +31 -0
  18. package/dist/src/adapters/registry.d.ts +19 -0
  19. package/dist/src/adapters/registry.js +59 -0
  20. package/dist/src/audit/hash-chain.d.ts +21 -0
  21. package/dist/src/audit/hash-chain.js +70 -0
  22. package/dist/src/audit/types.d.ts +25 -0
  23. package/dist/src/audit/types.js +1 -0
  24. package/dist/src/audit/writer.d.ts +18 -0
  25. package/dist/src/audit/writer.js +100 -0
  26. package/dist/src/benchmark/golden-tasks.d.ts +9 -0
  27. package/dist/src/benchmark/golden-tasks.js +476 -0
  28. package/dist/src/benchmark/reporter.d.ts +5 -0
  29. package/dist/src/benchmark/reporter.js +107 -0
  30. package/dist/src/benchmark/runner.d.ts +30 -0
  31. package/dist/src/benchmark/runner.js +224 -0
  32. package/dist/src/benchmark/scorer.d.ts +12 -0
  33. package/dist/src/benchmark/scorer.js +124 -0
  34. package/dist/src/benchmark/types.d.ts +54 -0
  35. package/dist/src/benchmark/types.js +1 -0
  36. package/dist/src/cache/deliberation-cache.d.ts +49 -0
  37. package/dist/src/cache/deliberation-cache.js +127 -0
  38. package/dist/src/cli/commands/config-cmd.d.ts +11 -0
  39. package/dist/src/cli/commands/config-cmd.js +190 -0
  40. package/dist/src/cli/commands/demo.d.ts +12 -0
  41. package/dist/src/cli/commands/demo.js +66 -0
  42. package/dist/src/cli/commands/setup.d.ts +7 -0
  43. package/dist/src/cli/commands/setup.js +182 -0
  44. package/dist/src/cli/i18n.d.ts +89 -0
  45. package/dist/src/cli/i18n.js +176 -0
  46. package/dist/src/cli/interactive-select.d.ts +27 -0
  47. package/dist/src/cli/interactive-select.js +130 -0
  48. package/dist/src/cli/tui-setup.d.ts +24 -0
  49. package/dist/src/cli/tui-setup.js +42 -0
  50. package/dist/src/config/cli-detector.d.ts +37 -0
  51. package/dist/src/config/cli-detector.js +99 -0
  52. package/dist/src/config/user-config.d.ts +81 -0
  53. package/dist/src/config/user-config.js +134 -0
  54. package/dist/src/context/auto-collector.d.ts +43 -0
  55. package/dist/src/context/auto-collector.js +337 -0
  56. package/dist/src/context/manager.d.ts +35 -0
  57. package/dist/src/context/manager.js +162 -0
  58. package/dist/src/context/serializer.d.ts +20 -0
  59. package/dist/src/context/serializer.js +52 -0
  60. package/dist/src/demo/recorded-deliberation.d.ts +13 -0
  61. package/dist/src/demo/recorded-deliberation.js +277 -0
  62. package/dist/src/engine/angel-detector.d.ts +83 -0
  63. package/dist/src/engine/angel-detector.js +334 -0
  64. package/dist/src/engine/at-field.d.ts +40 -0
  65. package/dist/src/engine/at-field.js +195 -0
  66. package/dist/src/engine/berserk-orchestrator.d.ts +66 -0
  67. package/dist/src/engine/berserk-orchestrator.js +378 -0
  68. package/dist/src/engine/change-metrics.d.ts +56 -0
  69. package/dist/src/engine/change-metrics.js +214 -0
  70. package/dist/src/engine/consensus.d.ts +20 -0
  71. package/dist/src/engine/consensus.js +146 -0
  72. package/dist/src/engine/dead-sea-scrolls.d.ts +132 -0
  73. package/dist/src/engine/dead-sea-scrolls.js +610 -0
  74. package/dist/src/engine/drift-detector.d.ts +39 -0
  75. package/dist/src/engine/drift-detector.js +225 -0
  76. package/dist/src/engine/dummy-plug.d.ts +44 -0
  77. package/dist/src/engine/dummy-plug.js +190 -0
  78. package/dist/src/engine/engram-manager.d.ts +55 -0
  79. package/dist/src/engine/engram-manager.js +306 -0
  80. package/dist/src/engine/events.d.ts +130 -0
  81. package/dist/src/engine/events.js +44 -0
  82. package/dist/src/engine/gospel.d.ts +30 -0
  83. package/dist/src/engine/gospel.js +129 -0
  84. package/dist/src/engine/hallucination-detector.d.ts +33 -0
  85. package/dist/src/engine/hallucination-detector.js +215 -0
  86. package/dist/src/engine/human-resolver.d.ts +19 -0
  87. package/dist/src/engine/human-resolver.js +89 -0
  88. package/dist/src/engine/instrumentality.d.ts +64 -0
  89. package/dist/src/engine/instrumentality.js +297 -0
  90. package/dist/src/engine/iruel-battle.d.ts +79 -0
  91. package/dist/src/engine/iruel-battle.js +319 -0
  92. package/dist/src/engine/kernel/deliberation-kernel.d.ts +12 -0
  93. package/dist/src/engine/kernel/deliberation-kernel.js +303 -0
  94. package/dist/src/engine/kernel/index.d.ts +8 -0
  95. package/dist/src/engine/kernel/index.js +7 -0
  96. package/dist/src/engine/kernel/phase-runner.d.ts +10 -0
  97. package/dist/src/engine/kernel/phase-runner.js +155 -0
  98. package/dist/src/engine/kernel/post-processor.d.ts +17 -0
  99. package/dist/src/engine/kernel/post-processor.js +131 -0
  100. package/dist/src/engine/kernel/types.d.ts +107 -0
  101. package/dist/src/engine/kernel/types.js +1 -0
  102. package/dist/src/engine/kernel/unit-executor.d.ts +6 -0
  103. package/dist/src/engine/kernel/unit-executor.js +132 -0
  104. package/dist/src/engine/lcl-manager.d.ts +44 -0
  105. package/dist/src/engine/lcl-manager.js +143 -0
  106. package/dist/src/engine/middleware/cache.d.ts +7 -0
  107. package/dist/src/engine/middleware/cache.js +29 -0
  108. package/dist/src/engine/middleware/chain.d.ts +18 -0
  109. package/dist/src/engine/middleware/chain.js +45 -0
  110. package/dist/src/engine/middleware/firewall.d.ts +8 -0
  111. package/dist/src/engine/middleware/firewall.js +24 -0
  112. package/dist/src/engine/middleware/index.d.ts +4 -0
  113. package/dist/src/engine/middleware/index.js +3 -0
  114. package/dist/src/engine/middleware/types.d.ts +43 -0
  115. package/dist/src/engine/middleware/types.js +1 -0
  116. package/dist/src/engine/nebuchadnezzar-key.d.ts +61 -0
  117. package/dist/src/engine/nebuchadnezzar-key.js +203 -0
  118. package/dist/src/engine/neon-genesis.d.ts +52 -0
  119. package/dist/src/engine/neon-genesis.js +203 -0
  120. package/dist/src/engine/objective-judge.d.ts +53 -0
  121. package/dist/src/engine/objective-judge.js +214 -0
  122. package/dist/src/engine/offline-mode.d.ts +18 -0
  123. package/dist/src/engine/offline-mode.js +46 -0
  124. package/dist/src/engine/orchestrator.d.ts +79 -0
  125. package/dist/src/engine/orchestrator.js +58 -0
  126. package/dist/src/engine/secret-cipher.d.ts +26 -0
  127. package/dist/src/engine/secret-cipher.js +114 -0
  128. package/dist/src/engine/seele-council.d.ts +90 -0
  129. package/dist/src/engine/seele-council.js +482 -0
  130. package/dist/src/engine/self-destruct.d.ts +61 -0
  131. package/dist/src/engine/self-destruct.js +231 -0
  132. package/dist/src/engine/self-evolution.d.ts +64 -0
  133. package/dist/src/engine/self-evolution.js +368 -0
  134. package/dist/src/engine/sync-rate.d.ts +45 -0
  135. package/dist/src/engine/sync-rate.js +151 -0
  136. package/dist/src/engine/type666-firewall.d.ts +76 -0
  137. package/dist/src/engine/type666-firewall.js +343 -0
  138. package/dist/src/engine/umbilical-cable.d.ts +41 -0
  139. package/dist/src/engine/umbilical-cable.js +192 -0
  140. package/dist/src/index.d.ts +106 -0
  141. package/dist/src/index.js +426 -0
  142. package/dist/src/mcp/server.d.ts +38 -0
  143. package/dist/src/mcp/server.js +196 -0
  144. package/dist/src/metrics/token-tracker.d.ts +38 -0
  145. package/dist/src/metrics/token-tracker.js +112 -0
  146. package/dist/src/parsers/json-extractor.d.ts +9 -0
  147. package/dist/src/parsers/json-extractor.js +239 -0
  148. package/dist/src/parsers/opinion-schema.d.ts +81 -0
  149. package/dist/src/parsers/opinion-schema.js +147 -0
  150. package/dist/src/parsers/unstructured-parser.d.ts +20 -0
  151. package/dist/src/parsers/unstructured-parser.js +122 -0
  152. package/dist/src/pipelines/architecture.d.ts +10 -0
  153. package/dist/src/pipelines/architecture.js +9 -0
  154. package/dist/src/pipelines/bug-analysis.d.ts +9 -0
  155. package/dist/src/pipelines/bug-analysis.js +8 -0
  156. package/dist/src/pipelines/code-review.d.ts +10 -0
  157. package/dist/src/pipelines/code-review.js +30 -0
  158. package/dist/src/pipelines/custom.d.ts +14 -0
  159. package/dist/src/pipelines/custom.js +29 -0
  160. package/dist/src/pipelines/registry.d.ts +9 -0
  161. package/dist/src/pipelines/registry.js +20 -0
  162. package/dist/src/prompts/personas.d.ts +6 -0
  163. package/dist/src/prompts/personas.js +44 -0
  164. package/dist/src/prompts/schemas.d.ts +4 -0
  165. package/dist/src/prompts/schemas.js +24 -0
  166. package/dist/src/prompts/templates.d.ts +6 -0
  167. package/dist/src/prompts/templates.js +91 -0
  168. package/dist/src/repl/accessibility.d.ts +23 -0
  169. package/dist/src/repl/accessibility.js +46 -0
  170. package/dist/src/repl/banner.d.ts +4 -0
  171. package/dist/src/repl/banner.js +28 -0
  172. package/dist/src/repl/boot-animation.d.ts +13 -0
  173. package/dist/src/repl/boot-animation.js +143 -0
  174. package/dist/src/repl/completer.d.ts +21 -0
  175. package/dist/src/repl/completer.js +168 -0
  176. package/dist/src/repl/context.d.ts +24 -0
  177. package/dist/src/repl/context.js +42 -0
  178. package/dist/src/repl/display-utils.d.ts +13 -0
  179. package/dist/src/repl/display-utils.js +65 -0
  180. package/dist/src/repl/event-listener.d.ts +18 -0
  181. package/dist/src/repl/event-listener.js +112 -0
  182. package/dist/src/repl/export-formatter.d.ts +8 -0
  183. package/dist/src/repl/export-formatter.js +73 -0
  184. package/dist/src/repl/ghost-text.d.ts +31 -0
  185. package/dist/src/repl/ghost-text.js +119 -0
  186. package/dist/src/repl/handoff-animation.d.ts +15 -0
  187. package/dist/src/repl/handoff-animation.js +65 -0
  188. package/dist/src/repl/history.d.ts +16 -0
  189. package/dist/src/repl/history.js +130 -0
  190. package/dist/src/repl/job-registry.d.ts +26 -0
  191. package/dist/src/repl/job-registry.js +80 -0
  192. package/dist/src/repl/magi-repl.d.ts +72 -0
  193. package/dist/src/repl/magi-repl.js +1008 -0
  194. package/dist/src/repl/multiline-input.d.ts +45 -0
  195. package/dist/src/repl/multiline-input.js +78 -0
  196. package/dist/src/repl/prompt-builder.d.ts +19 -0
  197. package/dist/src/repl/prompt-builder.js +36 -0
  198. package/dist/src/repl/repl-state.d.ts +5 -0
  199. package/dist/src/repl/repl-state.js +19 -0
  200. package/dist/src/repl/result-display.d.ts +8 -0
  201. package/dist/src/repl/result-display.js +195 -0
  202. package/dist/src/repl/session-stats.d.ts +26 -0
  203. package/dist/src/repl/session-stats.js +119 -0
  204. package/dist/src/repl/slash-commands.d.ts +60 -0
  205. package/dist/src/repl/slash-commands.js +725 -0
  206. package/dist/src/repl/terminal-sanitize.d.ts +14 -0
  207. package/dist/src/repl/terminal-sanitize.js +19 -0
  208. package/dist/src/reporters/console.d.ts +7 -0
  209. package/dist/src/reporters/console.js +78 -0
  210. package/dist/src/reporters/json.d.ts +2 -0
  211. package/dist/src/reporters/json.js +3 -0
  212. package/dist/src/reporters/markdown.d.ts +2 -0
  213. package/dist/src/reporters/markdown.js +65 -0
  214. package/dist/src/reporters/streaming.d.ts +20 -0
  215. package/dist/src/reporters/streaming.js +178 -0
  216. package/dist/src/tui/activity-log.d.ts +23 -0
  217. package/dist/src/tui/activity-log.js +67 -0
  218. package/dist/src/tui/animations.d.ts +39 -0
  219. package/dist/src/tui/animations.js +167 -0
  220. package/dist/src/tui/ansi.d.ts +28 -0
  221. package/dist/src/tui/ansi.js +51 -0
  222. package/dist/src/tui/boot-sequence.d.ts +11 -0
  223. package/dist/src/tui/boot-sequence.js +98 -0
  224. package/dist/src/tui/colors.d.ts +101 -0
  225. package/dist/src/tui/colors.js +71 -0
  226. package/dist/src/tui/header.d.ts +24 -0
  227. package/dist/src/tui/header.js +122 -0
  228. package/dist/src/tui/index.d.ts +3 -0
  229. package/dist/src/tui/index.js +3 -0
  230. package/dist/src/tui/keypress.d.ts +25 -0
  231. package/dist/src/tui/keypress.js +95 -0
  232. package/dist/src/tui/layout.d.ts +74 -0
  233. package/dist/src/tui/layout.js +171 -0
  234. package/dist/src/tui/magi-tui.d.ts +101 -0
  235. package/dist/src/tui/magi-tui.js +754 -0
  236. package/dist/src/tui/panel.d.ts +45 -0
  237. package/dist/src/tui/panel.js +292 -0
  238. package/dist/src/tui/screen-buffer.d.ts +54 -0
  239. package/dist/src/tui/screen-buffer.js +262 -0
  240. package/dist/src/tui/status-bar.d.ts +25 -0
  241. package/dist/src/tui/status-bar.js +124 -0
  242. package/dist/src/tui/terminal-detect.d.ts +26 -0
  243. package/dist/src/tui/terminal-detect.js +44 -0
  244. package/dist/src/tui/tui-helpers.d.ts +12 -0
  245. package/dist/src/tui/tui-helpers.js +37 -0
  246. package/dist/src/types/adapter.d.ts +75 -0
  247. package/dist/src/types/adapter.js +36 -0
  248. package/dist/src/types/config.d.ts +108 -0
  249. package/dist/src/types/config.js +85 -0
  250. package/dist/src/types/consensus.d.ts +55 -0
  251. package/dist/src/types/consensus.js +17 -0
  252. package/dist/src/types/core.d.ts +178 -0
  253. package/dist/src/types/core.js +85 -0
  254. package/dist/src/types/magi-api.d.ts +62 -0
  255. package/dist/src/types/magi-api.js +7 -0
  256. package/dist/src/types/phase-h.d.ts +142 -0
  257. package/dist/src/types/phase-h.js +7 -0
  258. package/dist/src/types/phase-i.d.ts +186 -0
  259. package/dist/src/types/phase-i.js +6 -0
  260. package/dist/src/types/phase-k.d.ts +259 -0
  261. package/dist/src/types/phase-k.js +6 -0
  262. package/dist/src/types/phase-l.d.ts +199 -0
  263. package/dist/src/types/phase-l.js +6 -0
  264. package/dist/src/types/pipeline.d.ts +37 -0
  265. package/dist/src/types/pipeline.js +2 -0
  266. package/dist/src/utils/abstain-factory.d.ts +2 -0
  267. package/dist/src/utils/abstain-factory.js +18 -0
  268. package/dist/src/utils/errors.d.ts +34 -0
  269. package/dist/src/utils/errors.js +59 -0
  270. package/dist/src/utils/file-validator.d.ts +50 -0
  271. package/dist/src/utils/file-validator.js +124 -0
  272. package/dist/src/utils/fire-and-forget.d.ts +5 -0
  273. package/dist/src/utils/fire-and-forget.js +10 -0
  274. package/dist/src/utils/flag-validator.d.ts +21 -0
  275. package/dist/src/utils/flag-validator.js +79 -0
  276. package/dist/src/utils/freeze.d.ts +8 -0
  277. package/dist/src/utils/freeze.js +16 -0
  278. package/dist/src/utils/language-detector.d.ts +16 -0
  279. package/dist/src/utils/language-detector.js +159 -0
  280. package/dist/src/utils/latency-tracker.d.ts +45 -0
  281. package/dist/src/utils/latency-tracker.js +100 -0
  282. package/dist/src/utils/logger.d.ts +33 -0
  283. package/dist/src/utils/logger.js +112 -0
  284. package/dist/src/utils/process.d.ts +40 -0
  285. package/dist/src/utils/process.js +253 -0
  286. package/dist/src/utils/retry.d.ts +12 -0
  287. package/dist/src/utils/retry.js +30 -0
  288. package/dist/src/utils/safe-fs.d.ts +38 -0
  289. package/dist/src/utils/safe-fs.js +56 -0
  290. package/dist/src/utils/safe-json-parse.d.ts +15 -0
  291. package/dist/src/utils/safe-json-parse.js +49 -0
  292. package/dist/src/utils/sanitize.d.ts +14 -0
  293. package/dist/src/utils/sanitize.js +186 -0
  294. package/dist/src/utils/semaphore.d.ts +22 -0
  295. package/dist/src/utils/semaphore.js +57 -0
  296. package/dist/src/utils/shutdown.d.ts +6 -0
  297. package/dist/src/utils/shutdown.js +51 -0
  298. package/dist/src/utils/tty.d.ts +5 -0
  299. package/dist/src/utils/tty.js +7 -0
  300. package/package.json +82 -0
@@ -0,0 +1,107 @@
1
/**
 * Generate a Markdown benchmark report comparing single-Claude vs MAGI 3-body results.
 *
 * The report has three parts: a summary table (with a delta column when both
 * results are present), a per-task detail table for each supplied result, and a
 * per-category breakdown for each supplied result.
 *
 * NOTE(review): the "dry-run mode" note below is emitted unconditionally; this
 * function receives no flag telling it whether the results came from a dry run.
 *
 * @param singleResult Result of the single-Claude run, or null/undefined when skipped.
 * @param magiResult Result of the MAGI 3-body run, or null/undefined when skipped.
 * @returns The full Markdown document, lines joined with '\n'.
 */
export function generateReport(singleResult, magiResult) {
    // Any mode other than 'single-claude' is labelled as MAGI, matching the section headings.
    const describeMode = (mode) => (mode === 'single-claude' ? 'Single Claude' : 'MAGI 3-Body');
    const lines = [];
    const now = new Date().toISOString();
    lines.push('# MAGI Benchmark Results');
    lines.push('');
    lines.push('> **Note:** These results are from dry-run mode using mock adapters. They validate the scoring framework and golden task definitions, not actual AI model performance. Run `npm run benchmark` with real CLIs configured to generate production results.');
    lines.push('');
    lines.push(`> Generated: ${now}`);
    lines.push('');
    // ── Summary Table ────────────────────────────────────────────
    lines.push('## Summary');
    lines.push('');
    lines.push('| Metric | Single Claude | MAGI 3-Body | Delta |');
    lines.push('|--------|:------------:|:-----------:|:-----:|');
    if (singleResult && magiResult) {
        const s = singleResult.aggregate;
        const m = magiResult.aggregate;
        lines.push(formatRow('Weighted Score', s.weightedScore, m.weightedScore, true));
        lines.push(formatRow('Vote Accuracy', s.voteAccuracy, m.voteAccuracy, true));
        lines.push(formatRow('Detection Coverage', s.detectionCoverage, m.detectionCoverage, true));
        lines.push(formatRow('Avg Duration (ms)', s.avgDurationMs, m.avgDurationMs, false));
    }
    else {
        const only = singleResult ?? magiResult;
        if (only) {
            const agg = only.aggregate;
            // Column that receives the values: 0 = Single Claude, 1 = MAGI 3-Body,
            // -1 = unrecognised mode (both columns stay '-').
            const column = only.mode === 'single-claude' ? 0 : only.mode === 'magi-3-body' ? 1 : -1;
            // Renderers are lazy so nothing is formatted for an unrecognised mode.
            const metrics = [
                { label: 'Weighted Score', render: () => pct(agg.weightedScore) },
                { label: 'Vote Accuracy', render: () => pct(agg.voteAccuracy) },
                { label: 'Detection Coverage', render: () => pct(agg.detectionCoverage) },
                { label: 'Avg Duration (ms)', render: () => ms(agg.avgDurationMs) },
            ];
            for (const { label, render } of metrics) {
                const cells = ['-', '-'];
                if (column >= 0) {
                    cells[column] = render();
                }
                // With only one result there is nothing to compare, so the delta is '-'.
                lines.push(`| ${label} | ${cells[0]} | ${cells[1]} | - |`);
            }
        }
    }
    lines.push('');
    // Drop whichever result was not supplied.
    const results = [singleResult, magiResult].filter(Boolean);
    // ── Per-Task Results ─────────────────────────────────────────
    for (const result of results) {
        lines.push(`## ${describeMode(result.mode)} — Task Details`);
        lines.push('');
        lines.push(`- **Run ID**: ${result.runId}`);
        lines.push(`- **Git Hash**: ${result.gitHash}`);
        lines.push(`- **Timestamp**: ${result.timestamp}`);
        lines.push(`- **Total Duration**: ${ms(result.aggregate.totalDurationMs)}`);
        lines.push('');
        lines.push('| Task ID | Category | Vote | Coverage | FP Rate | Score | Duration |');
        lines.push('|---------|----------|:----:|:--------:|:-------:|:-----:|:--------:|');
        for (const task of result.tasks) {
            // Only a vote accuracy of exactly 1.0 counts as a pass.
            const voteIcon = task.voteAccuracy === 1.0 ? 'PASS' : 'FAIL';
            lines.push(`| ${task.taskId} | ${task.category} | ${voteIcon} | ${pct(task.detectionCoverage)} | ${pct(task.falsePositiveRate)} | ${pct(task.compositeScore)} | ${ms(task.durationMs)} |`);
        }
        lines.push('');
    }
    // ── Category Breakdown ───────────────────────────────────────
    for (const result of results) {
        lines.push(`## ${describeMode(result.mode)} — Category Breakdown`);
        lines.push('');
        // Group tasks by category; Map keeps categories in first-seen order.
        const categories = new Map();
        for (const task of result.tasks) {
            const bucket = categories.get(task.category) ?? [];
            bucket.push(task);
            categories.set(task.category, bucket);
        }
        lines.push('| Category | Tasks | Avg Score | Vote Accuracy |');
        lines.push('|----------|:-----:|:---------:|:-------------:|');
        for (const [category, tasks] of categories) {
            const avgScore = tasks.reduce((sum, t) => sum + t.compositeScore, 0) / tasks.length;
            const avgVote = tasks.reduce((sum, t) => sum + t.voteAccuracy, 0) / tasks.length;
            lines.push(`| ${category} | ${tasks.length} | ${pct(avgScore)} | ${pct(avgVote)} |`);
        }
        lines.push('');
    }
    return lines.join('\n');
}
80
+ // ── Formatting helpers ────────────────────────────────────────
81
/**
 * Format a fraction (e.g. 0.5) as a percentage string with one decimal ("50.0%").
 */
function pct(value) {
    const scaled = value * 100;
    return scaled.toFixed(1) + '%';
}
84
/**
 * Format a duration as whole milliseconds with an "ms" suffix ("13ms").
 */
function ms(value) {
    const rounded = Math.round(value);
    return String(rounded) + 'ms';
}
87
/**
 * Build one summary-table row comparing a single-Claude value with a MAGI value.
 *
 * Labels containing "Score", "Accuracy" or "Coverage" are rendered as
 * percentages with a percentage-point delta; every other label is rendered as
 * milliseconds with a millisecond delta.
 *
 * The delta's sign and bold highlighting are decided on the value as it is
 * displayed (after rounding). A difference that rounds to zero is therefore
 * always shown as an unhighlighted "+0.0pp" / "+0ms" instead of "-0.0pp",
 * "**+0.0pp**" or "**0ms**".
 *
 * @param label Metric name shown in the first column.
 * @param singleVal Single-Claude value.
 * @param magiVal MAGI 3-body value.
 * @param higherIsBetter True when a positive delta is an improvement.
 * @returns A Markdown table row.
 */
function formatRow(label, singleVal, magiVal, higherIsBetter) {
    const isScore = label.includes('Score') || label.includes('Accuracy') || label.includes('Coverage');
    const singleStr = isScore ? pct(singleVal) : ms(singleVal);
    const magiStr = isScore ? pct(magiVal) : ms(magiVal);
    const diff = magiVal - singleVal;
    // Round exactly as the delta is displayed, then reason about the rounded number.
    const shown = isScore ? Number((diff * 100).toFixed(1)) : Math.round(diff);
    const magnitude = isScore ? Math.abs(shown).toFixed(1) : String(Math.abs(shown));
    // Negative zero is not < 0, so it takes the '+' branch; NaN keeps no sign at all.
    const sign = shown < 0 ? '-' : shown >= 0 ? '+' : '';
    let deltaStr = `${sign}${magnitude}${isScore ? 'pp' : 'ms'}`;
    // Highlight improvements only; degradations and ties stay plain.
    const improved = higherIsBetter ? shown > 0 : shown < 0;
    if (improved) {
        deltaStr = `**${deltaStr}**`;
    }
    return `| ${label} | ${singleStr} | ${magiStr} | ${deltaStr} |`;
}
@@ -0,0 +1,30 @@
1
+ import type { BenchmarkResult, BenchmarkOptions } from './types.js';
2
+ export declare class BenchmarkRunner {
3
+ private readonly options;
4
+ private readonly tasks;
5
+ constructor(options?: BenchmarkOptions);
6
+ /**
7
+ * Run the single-Claude baseline (MELCHIOR only, quorum=1, initial-opinion phase only); when the `dryRun` option is set it returns mock-scored results instead.
8
+ */
9
+ runSingle(): Promise<BenchmarkResult>;
10
+ /**
11
+ * Run the MAGI 3-body benchmark (full 3-unit deliberation); when the `dryRun` option is set it returns mock-scored results instead.
12
+ */
13
+ runMagi(): Promise<BenchmarkResult>;
14
+ /**
15
+ * Dry-run path: scores a synthetic deliberation built from each golden task's expected outcome, so no adapter or CLI is invoked.
16
+ */
17
+ private runDry;
18
+ /**
19
+ * Build a synthetic MagiDeliberation (a single 'initial-opinion' round with unanimous consensus) for dry-run scoring validation.
20
+ */
21
+ private buildMockDeliberation;
22
+ /**
23
+ * Run the enabled benchmarks (single is skipped when `magiOnly` is set, MAGI when `singleOnly` is set), write the JSON results and the Markdown report to disk, and return all three.
24
+ */
25
+ runAll(): Promise<{
26
+ single: BenchmarkResult | null;
27
+ magi: BenchmarkResult | null;
28
+ report: string;
29
+ }>;
30
+ }
@@ -0,0 +1,224 @@
1
+ import { randomUUID } from 'node:crypto';
2
+ import { execFileSync } from 'node:child_process';
3
+ import { join } from 'node:path';
4
+ import { safeMkdir, safeWriteFile } from '../utils/safe-fs.js';
5
+ import { GOLDEN_TASKS } from './golden-tasks.js';
6
+ import { scoreTask, aggregateScores } from './scorer.js';
7
+ import { generateReport } from './reporter.js';
8
+ import { Magi } from '../index.js';
9
+ import { AdapterRegistry } from '../adapters/registry.js';
10
+ import { ClaudeAdapter } from '../adapters/claude.js';
11
+ import { PipelineRegistry } from '../pipelines/registry.js';
12
+ import { CodeReviewPipeline } from '../pipelines/code-review.js';
13
+ import { ArchitecturePipeline } from '../pipelines/architecture.js';
14
+ import { BugAnalysisPipeline } from '../pipelines/bug-analysis.js';
15
+ import { CustomPipeline } from '../pipelines/custom.js';
16
+ import { Orchestrator } from '../engine/orchestrator.js';
17
+ import { MockContextManager } from '../../test/helpers/mock-context-manager.js';
18
+ import { logger } from '../utils/logger.js';
19
/**
 * Resolve the short git commit hash of the current working tree.
 *
 * Best-effort: any failure (git missing, not a repository, timeout after 5s)
 * is logged at debug level and reported as 'unknown' rather than thrown.
 *
 * @returns The short hash, or 'unknown' when it cannot be determined.
 */
function getGitHash() {
    try {
        const output = execFileSync('git', ['rev-parse', '--short', 'HEAD'], {
            encoding: 'utf-8',
            timeout: 5000,
            // Without an explicit stdio, the child's stderr is forwarded to the
            // parent's stderr, so "fatal: not a git repository" would leak into
            // the benchmark output. Capture stdout only and discard the rest.
            stdio: ['ignore', 'pipe', 'ignore'],
        });
        const hash = output.trim();
        // Guard against an empty result so callers never see a blank hash.
        return hash.length > 0 ? hash : 'unknown';
    }
    catch (err) {
        logger.debug('Benchmark: git hash retrieval failed', { error: String(err) });
        return 'unknown';
    }
}
31
/**
 * Create a PipelineRegistry with the four pipelines registered in order:
 * code review, architecture, bug analysis, custom.
 */
function buildPipelineRegistry() {
    const registry = new PipelineRegistry();
    const pipelineClasses = [
        CodeReviewPipeline,
        ArchitecturePipeline,
        BugAnalysisPipeline,
        CustomPipeline,
    ];
    // Construct and register one at a time so ordering matches registration order.
    for (const PipelineClass of pipelineClasses) {
        registry.register(new PipelineClass());
    }
    return registry;
}
39
/**
 * Produce the mock opinion payload used in dry-run mode for a golden task.
 * The payload is a "perfect" answer: it echoes the task's expected vote,
 * embeds its rationale in the reasoning, and lists every must-detect point
 * (stringified) as a key point. Confidence is fixed at 0.85.
 */
function buildMockResponse(task) {
    const { expectedVote, rationale, mustDetectPoints } = task.expectation;
    const keyPoints = mustDetectPoints.map((point) => String(point));
    return {
        vote: expectedVote,
        confidence: 0.85,
        reasoning: `Mock analysis: ${rationale}`,
        keyPoints,
    };
}
51
/**
 * Assemble the result envelope shared by every benchmark mode.
 * Stamps the current time and git hash and aggregates the per-task scores.
 */
function buildBenchmarkResult(runId, mode, scores) {
    return {
        runId,
        timestamp: new Date().toISOString(),
        gitHash: getGitHash(),
        mode,
        tasks: scores,
        aggregate: aggregateScores(scores),
    };
}
/**
 * Runs the golden-task benchmark in single-Claude and/or MAGI 3-body mode,
 * scores every task, and (via runAll) writes the results and a Markdown report.
 *
 * Options read by this class: `category` (restrict the golden tasks),
 * `dryRun` (score synthetic deliberations instead of calling real adapters),
 * `magiOnly` / `singleOnly` (skip one mode in runAll) and `outputDir`.
 */
export class BenchmarkRunner {
    options;
    tasks;
    /**
     * @param options Benchmark options; when `category` is set only golden
     * tasks of that category are run, otherwise all of them.
     */
    constructor(options = {}) {
        this.options = options;
        this.tasks = options.category
            ? GOLDEN_TASKS.filter(t => t.category === options.category)
            : GOLDEN_TASKS;
    }
    /**
     * Run a single-Claude benchmark (MELCHIOR only, quorum=1, initial-opinion only).
     *
     * NOTE(review): MockContextManager is imported from '../../test/helpers/',
     * and no test/ path appears in the published file list — confirm that this
     * module can actually be loaded from the installed package.
     */
    async runSingle() {
        const runId = randomUUID();
        if (this.options.dryRun) {
            return this.runDry('single-claude', runId);
        }
        // Single-Claude: Only MELCHIOR adapter, quorum=1, single phase
        const adapters = new AdapterRegistry();
        adapters.register(new ClaudeAdapter({ timeoutMs: 120_000, maxRetries: 2 }));
        const pipelines = buildPipelineRegistry();
        const context = new MockContextManager();
        const orchestrator = new Orchestrator({
            adapters,
            pipelines,
            contextManager: context,
        });
        const scores = [];
        // Tasks run sequentially, one deliberation at a time.
        for (const goldenTask of this.tasks) {
            const task = {
                ...goldenTask.task,
                // Replaces any config carried by the golden task itself.
                config: {
                    phases: ['initial-opinion'],
                    consensus: { quorum: 1 },
                },
            };
            const deliberation = await orchestrator.deliberate(task);
            scores.push(scoreTask(goldenTask, deliberation));
        }
        return buildBenchmarkResult(runId, 'single-claude', scores);
    }
    /**
     * Run a MAGI 3-body benchmark (full 3-unit deliberation).
     * The Magi instance is created with its cache disabled.
     */
    async runMagi() {
        const runId = randomUUID();
        if (this.options.dryRun) {
            return this.runDry('magi-3-body', runId);
        }
        const magi = new Magi({ cacheEnabled: false });
        const scores = [];
        for (const goldenTask of this.tasks) {
            const deliberation = await magi.deliberate(goldenTask.task);
            scores.push(scoreTask(goldenTask, deliberation));
        }
        return buildBenchmarkResult(runId, 'magi-3-body', scores);
    }
    /**
     * Run the dry-run benchmark: each golden task is scored against a synthetic
     * deliberation built from its own expected outcome, so no adapter is called.
     *
     * @param mode 'single-claude' or 'magi-3-body'; recorded on the result and
     * used to pick how many units the synthetic deliberation contains.
     * @param runId Identifier recorded on the result.
     */
    async runDry(mode, runId) {
        const scores = [];
        for (const goldenTask of this.tasks) {
            const mockResp = buildMockResponse(goldenTask);
            const deliberation = this.buildMockDeliberation(goldenTask, mockResp, mode);
            scores.push(scoreTask(goldenTask, deliberation));
        }
        return buildBenchmarkResult(runId, mode, scores);
    }
    /**
     * Build a fake MagiDeliberation for dry-run scoring validation.
     *
     * Every unit gives the same mock opinion, so the consensus is always
     * unanimous: 'UNANIMOUS_APPROVE' when the mock vote is 'APPROVE', and
     * 'UNANIMOUS_REJECT' for any other vote.
     */
    buildMockDeliberation(goldenTask, mockResp, mode) {
        const units = mode === 'single-claude'
            ? ['MELCHIOR']
            : ['MELCHIOR', 'BALTHASAR', 'CASPER'];
        const opinions = units.map(unit => ({
            unit,
            vote: mockResp.vote,
            confidence: mockResp.confidence,
            reasoning: mockResp.reasoning,
            keyPoints: mockResp.keyPoints,
            rawOutput: JSON.stringify(mockResp),
            meta: {
                durationMs: 50,
                exitCode: 0,
                retryCount: 0,
                structuredOutput: true,
            },
        }));
        const isApprove = mockResp.vote === 'APPROVE';
        // The decision does not depend on the mode: all units share one vote.
        const decision = isApprove ? 'UNANIMOUS_APPROVE' : 'UNANIMOUS_REJECT';
        const now = new Date();
        return {
            id: randomUUID(),
            task: goldenTask.task,
            rounds: [{
                    roundNumber: 1,
                    phase: 'initial-opinion',
                    opinions,
                    startedAt: now,
                    completedAt: now,
                }],
            consensus: {
                decision,
                method: 'unanimous',
                votes: {
                    approve: isApprove ? units : [],
                    reject: isApprove ? [] : units,
                    abstain: [],
                    weightedApprove: isApprove ? units.length : 0,
                    weightedReject: isApprove ? 0 : units.length,
                },
                confidence: mockResp.confidence,
                summary: `Mock consensus: ${decision}`,
            },
            // Matches the fixed 50ms reported per unit above.
            totalDurationMs: 50 * units.length,
            startedAt: now,
            completedAt: now,
        };
    }
    /**
     * Run all benchmarks and generate report.
     *
     * Skips the single-Claude run when `magiOnly` is set and the MAGI run when
     * `singleOnly` is set. Each result that was produced is written as JSON
     * under `outputDir` (default 'test/benchmark/results'), and the Markdown
     * report is written to 'docs/BENCHMARK_RESULTS.md'.
     *
     * NOTE(review): the report path is hard-coded, relative to the current
     * working directory, and ignores `outputDir`; only `outputDir` is passed to
     * safeMkdir here — confirm safeWriteFile copes with a missing 'docs/' folder.
     *
     * @returns The results (null for a skipped mode) and the report text.
     */
    async runAll() {
        let single = null;
        let magi = null;
        if (!this.options.magiOnly) {
            single = await this.runSingle();
        }
        if (!this.options.singleOnly) {
            magi = await this.runMagi();
        }
        const report = generateReport(single, magi);
        // Save results
        const outputDir = this.options.outputDir ?? 'test/benchmark/results';
        await safeMkdir(outputDir);
        // ':' and '.' are replaced with '-' in the ISO timestamp used in file names.
        const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
        if (single) {
            await safeWriteFile(join(outputDir, `single-${timestamp}.json`), JSON.stringify(single, null, 2));
        }
        if (magi) {
            await safeWriteFile(join(outputDir, `magi-${timestamp}.json`), JSON.stringify(magi, null, 2));
        }
        await safeWriteFile('docs/BENCHMARK_RESULTS.md', report);
        return { single, magi, report };
    }
}
@@ -0,0 +1,12 @@
1
+ import type { MagiDeliberation } from '../types/core.js';
2
+ import type { GoldenTask, TaskScore, AggregateScore } from './types.js';
3
+ /**
4
+ * Score a single golden task against the deliberation that was produced for it.
5
+ * The composite score is a weighted blend of three sub-scores (the weights sum to 1.0):
6
+ * compositeScore = voteAccuracy * 0.4 + detectionCoverage * 0.5 + (1 - falsePositiveRate) * 0.1
7
+ */
8
+ export declare function scoreTask(goldenTask: GoldenTask, deliberation: MagiDeliberation): TaskScore;
9
+ /**
10
+ * Aggregate the per-task scores of one benchmark run into a single AggregateScore summary.
11
+ */
12
+ export declare function aggregateScores(scores: readonly TaskScore[]): AggregateScore;
@@ -0,0 +1,124 @@
1
+ import { getMajorityVote } from '../types/consensus.js';
2
/**
 * Map a consensus decision onto the vote it effectively represents.
 *
 * Delegates entirely to `getMajorityVote`.
 */
function decisionToVote(decision) {
    const effectiveVote = getMajorityVote(decision);
    return effectiveVote;
}
8
/**
 * Measure how many of the required detection points appear in the text.
 *
 * Matching is a case-insensitive substring search. An empty list of points
 * counts as full coverage.
 *
 * @param mustDetectPoints Phrases that are expected to show up.
 * @param searchText Text to look through.
 * @returns Fraction of the points that were found (1.0 for an empty list).
 */
function computeDetectionCoverage(mustDetectPoints, searchText) {
    const total = mustDetectPoints.length;
    if (total === 0) {
        return 1.0;
    }
    const haystack = searchText.toLowerCase();
    const hits = mustDetectPoints.filter((needle) => haystack.includes(needle.toLowerCase()));
    return hits.length / total;
}
23
/**
 * Measure how many of the "must not be mentioned" points appear in the text.
 *
 * Matching is a case-insensitive substring search. A missing or empty list
 * yields a rate of 0.
 *
 * @param mustNotDetectPoints Phrases that should be absent (may be omitted).
 * @param searchText Text to look through.
 * @returns Fraction of the points that were found (0.0 for a missing or empty list).
 */
function computeFalsePositiveRate(mustNotDetectPoints, searchText) {
    const hasPoints = Boolean(mustNotDetectPoints) && mustNotDetectPoints.length !== 0;
    if (!hasPoints) {
        return 0.0;
    }
    const haystack = searchText.toLowerCase();
    const hits = mustNotDetectPoints.reduce(
        (count, phrase) => (haystack.includes(phrase.toLowerCase()) ? count + 1 : count),
        0,
    );
    return hits / mustNotDetectPoints.length;
}
38
/**
 * Assemble the text that detection points are searched in.
 *
 * Only the final round is considered. For each of its opinions the reasoning,
 * every key point and (when present) every suggestion are collected, in that
 * order, and joined with single spaces. A deliberation without rounds yields
 * an empty string.
 */
function buildSearchText(deliberation) {
    const { rounds } = deliberation;
    const finalRound = rounds[rounds.length - 1];
    if (!finalRound) {
        return '';
    }
    const fragments = finalRound.opinions.flatMap((opinion) => [
        opinion.reasoning,
        ...opinion.keyPoints,
        ...(opinion.suggestions ? opinion.suggestions : []),
    ]);
    return fragments.join(' ');
}
56
/**
 * Mean confidence of the opinions given in the final round.
 *
 * Returns 0 when there is no round at all or the final round has no opinions.
 */
function computeAvgConfidence(deliberation) {
    const finalRound = deliberation.rounds[deliberation.rounds.length - 1];
    const opinions = finalRound ? finalRound.opinions : [];
    if (opinions.length === 0) {
        return 0;
    }
    let total = 0;
    for (const { confidence } of opinions) {
        total += confidence;
    }
    return total / opinions.length;
}
66
/**
 * Evaluate one golden task against the deliberation produced for it.
 *
 * Three metrics are computed and blended into the composite score:
 *   - voteAccuracy: 1.0 when the consensus vote equals the expected vote, else 0.0
 *   - detectionCoverage: share of `mustDetectPoints` found in the final-round text
 *   - falsePositiveRate: share of `mustNotDetectPoints` found in that same text
 *
 * compositeScore = voteAccuracy * 0.4 + detectionCoverage * 0.5 + (1 - falsePositiveRate) * 0.1
 *
 * @returns The per-task score, including the deliberation's total duration and
 *          the average confidence of its final round.
 */
export function scoreTask(goldenTask, deliberation) {
    const { expectation } = goldenTask;
    const voteMatches = decisionToVote(deliberation.consensus.decision) === expectation.expectedVote;
    const voteAccuracy = voteMatches ? 1.0 : 0.0;

    // Both detection metrics are measured on the same combined text.
    const combinedText = buildSearchText(deliberation);
    const detectionCoverage = computeDetectionCoverage(expectation.mustDetectPoints, combinedText);
    const falsePositiveRate = computeFalsePositiveRate(expectation.mustNotDetectPoints, combinedText);

    const compositeScore = voteAccuracy * 0.4 + detectionCoverage * 0.5 + (1 - falsePositiveRate) * 0.1;
    return {
        taskId: goldenTask.id,
        category: goldenTask.category,
        weight: expectation.weight,
        voteAccuracy,
        detectionCoverage,
        falsePositiveRate,
        compositeScore,
        durationMs: deliberation.totalDurationMs,
        avgConfidence: computeAvgConfidence(deliberation),
    };
}
93
/**
 * Summarise a list of task scores.
 *
 * Produces the plain mean and the weight-weighted mean of the composite score,
 * the mean vote accuracy and detection coverage, and the total and average
 * duration. An empty list yields all zeros; the weighted mean is 0 when the
 * weights do not add up to a positive number.
 */
export function aggregateScores(scores) {
    const count = scores.length;
    if (count === 0) {
        return {
            totalScore: 0,
            weightedScore: 0,
            voteAccuracy: 0,
            detectionCoverage: 0,
            avgDurationMs: 0,
            totalDurationMs: 0,
        };
    }
    // Gather every running sum in a single pass over the scores.
    let weightSum = 0;
    let compositeSum = 0;
    let weightedCompositeSum = 0;
    let voteSum = 0;
    let coverageSum = 0;
    let totalDurationMs = 0;
    for (const score of scores) {
        weightSum += score.weight;
        compositeSum += score.compositeScore;
        weightedCompositeSum += score.compositeScore * score.weight;
        voteSum += score.voteAccuracy;
        coverageSum += score.detectionCoverage;
        totalDurationMs += score.durationMs;
    }
    return {
        totalScore: compositeSum / count,
        weightedScore: weightSum > 0 ? weightedCompositeSum / weightSum : 0,
        voteAccuracy: voteSum / count,
        detectionCoverage: coverageSum / count,
        avgDurationMs: totalDurationMs / count,
        totalDurationMs,
    };
}
@@ -0,0 +1,54 @@
1
+ import type { MagiTask, Vote } from '../types/core.js';
2
+ /** Expected outcome for a golden task: the reference answer that a deliberation is scored against. */
3
+ export interface GoldenExpectation {
4
+ expectedVote: Vote; // scoreTask gives voteAccuracy 1.0 only when the consensus vote equals this
5
+ mustDetectPoints: string[]; // phrases looked up as case-insensitive substrings; drives detectionCoverage
6
+ mustNotDetectPoints?: string[]; // phrases that should be absent; drives falsePositiveRate (0 when omitted or empty)
7
+ weight: number; // weight of this task in AggregateScore.weightedScore
8
+ rationale: string; // NOTE(review): presumably a human-readable justification; not read by scoreTask - confirm
9
+ }
10
+ /** A golden task with known-correct expected outcome */
11
+ export interface GoldenTask {
12
+ id: string; // copied into TaskScore.taskId by scoreTask
13
+ task: MagiTask; // the task that is deliberated on
14
+ expectation: GoldenExpectation; // reference outcome the deliberation is scored against
15
+ category: 'code-review' | 'architecture' | 'bug-analysis' | 'security'; // copied into TaskScore.category by scoreTask
16
+ }
17
+ /** Score for a single task evaluation, as produced by scoreTask */
18
+ export interface TaskScore {
19
+ taskId: string; // id of the golden task that was scored
20
+ category: string; // category of the golden task
21
+ weight: number; // taken from the task's expectation; used for the weighted aggregate
22
+ voteAccuracy: number; // 1.0 when the consensus vote matched the expected vote, otherwise 0.0
23
+ detectionCoverage: number; // fraction of mustDetectPoints found (1.0 when there are none)
24
+ falsePositiveRate: number; // fraction of mustNotDetectPoints found (0.0 when there are none)
25
+ compositeScore: number; // voteAccuracy * 0.4 + detectionCoverage * 0.5 + (1 - falsePositiveRate) * 0.1
26
+ durationMs: number; // the deliberation's totalDurationMs
27
+ avgConfidence: number; // mean confidence of the final round's opinions (0 when there are none)
28
+ }
29
+ /** Aggregate scores across all tasks, as produced by aggregateScores (all fields are 0 for an empty task list) */
30
+ export interface AggregateScore {
31
+ totalScore: number; // unweighted mean of compositeScore
32
+ weightedScore: number; // mean of compositeScore weighted by each task's weight; 0 when the weights do not sum to a positive number
33
+ voteAccuracy: number; // mean of the per-task voteAccuracy
34
+ detectionCoverage: number; // mean of the per-task detectionCoverage
35
+ avgDurationMs: number; // totalDurationMs divided by the number of tasks
36
+ totalDurationMs: number; // sum of the per-task durationMs
37
+ }
38
+ /** Complete benchmark result for one mode: the per-task scores plus their aggregate */
39
+ export interface BenchmarkResult {
40
+ runId: string; // NOTE(review): how this id is generated is not visible here - confirm
41
+ timestamp: string; // NOTE(review): presumably an ISO-8601 string - confirm against the runner
42
+ gitHash: string; // NOTE(review): presumably the commit the benchmark ran on - confirm
43
+ mode: 'single-claude' | 'magi-3-body'; // which benchmark mode produced this result
44
+ tasks: TaskScore[]; // one score per evaluated task
45
+ aggregate: AggregateScore; // summary over the task scores
46
+ }
47
+ /** Options for the benchmark runner (all optional) */
48
+ export interface BenchmarkOptions {
49
+ dryRun?: boolean; // NOTE(review): handling not visible here; presumably uses mock responses instead of real calls - confirm
50
+ singleOnly?: boolean; // when set, the runner's runAll() skips the MAGI run
51
+ magiOnly?: boolean; // when set, the runner's runAll() skips the single-model run
52
+ category?: string; // NOTE(review): presumably restricts the run to one task category - confirm
53
+ outputDir?: string; // directory for the JSON result files; runAll() falls back to 'test/benchmark/results'
54
+ }
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,49 @@
1
+ import type { MagiTask, MagiDeliberation } from '../types/core.js';
2
+ /**
3
+ * In-memory cache for deliberation results.
4
+ * Keyed by a SHA-256 hash of task attributes (type + title + description + artifacts).
5
+ */
6
+ export declare class DeliberationCache {
7
+ private readonly ttlMs; // entry time-to-live in milliseconds
8
+ private readonly maxEntries; // NOTE(review): presumably the upper bound on stored entries; eviction policy not visible here - confirm
9
+ private store; // NOTE(review): presumably the cache-key -> entry container - confirm against the implementation
10
+ private inflight; // NOTE(review): presumably the pending computations that getOrCompute coalesces - confirm
11
+ constructor(ttlMs?: number, // 1 hour default
12
+ maxEntries?: number); // NOTE(review): default value is not visible in this declaration
13
+ /**
14
+ * Compute a deterministic cache key from a MagiTask.
15
+ */
16
+ computeKey(task: MagiTask): string;
17
+ /**
18
+ * Get a cached deliberation result.
19
+ * Returns null on miss or TTL expiry.
20
+ * Restores Date objects and sets fromCache=true.
21
+ */
22
+ get(task: MagiTask): MagiDeliberation | null;
23
+ /**
24
+ * Store a deliberation result in the cache.
25
+ */
26
+ set(task: MagiTask, deliberation: MagiDeliberation): void;
27
+ /**
28
+ * Get from cache or compute. Prevents cache stampede by coalescing
29
+ * concurrent requests for the same cache key.
30
+ */
31
+ getOrCompute(task: MagiTask, compute: () => Promise<MagiDeliberation>): Promise<MagiDeliberation>;
32
+ /**
33
+ * Remove all entries from the cache.
34
+ */
35
+ clear(): void;
36
+ /**
37
+ * Remove expired entries. NOTE(review): the returned number is presumably how many entries were removed - confirm.
38
+ */
39
+ prune(): number;
40
+ /**
41
+ * Cache statistics: current size, the configured ttlMs and maxEntries, and the number of in-flight computations.
42
+ */
43
+ stats(): {
44
+ size: number;
45
+ ttlMs: number;
46
+ maxEntries: number;
47
+ inflightCount: number;
48
+ };
49
+ }