npm - magi-ai - Versions diffs - 0.1.0 - Mend

magi-ai 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (300) hide show

package/LICENSE +21 -0
package/README.ja.md +377 -0
package/README.md +377 -0
package/dist/bin/magi-benchmark.d.ts +14 -0
package/dist/bin/magi-benchmark.js +93 -0
package/dist/bin/magi-mcp.d.ts +8 -0
package/dist/bin/magi-mcp.js +28 -0
package/dist/bin/magi.d.ts +2 -0
package/dist/bin/magi.js +634 -0
package/dist/src/adapters/base.d.ts +34 -0
package/dist/src/adapters/base.js +149 -0
package/dist/src/adapters/claude.d.ts +29 -0
package/dist/src/adapters/claude.js +65 -0
package/dist/src/adapters/codex.d.ts +21 -0
package/dist/src/adapters/codex.js +41 -0
package/dist/src/adapters/gemini.d.ts +18 -0
package/dist/src/adapters/gemini.js +31 -0
package/dist/src/adapters/registry.d.ts +19 -0
package/dist/src/adapters/registry.js +59 -0
package/dist/src/audit/hash-chain.d.ts +21 -0
package/dist/src/audit/hash-chain.js +70 -0
package/dist/src/audit/types.d.ts +25 -0
package/dist/src/audit/types.js +1 -0
package/dist/src/audit/writer.d.ts +18 -0
package/dist/src/audit/writer.js +100 -0
package/dist/src/benchmark/golden-tasks.d.ts +9 -0
package/dist/src/benchmark/golden-tasks.js +476 -0
package/dist/src/benchmark/reporter.d.ts +5 -0
package/dist/src/benchmark/reporter.js +107 -0
package/dist/src/benchmark/runner.d.ts +30 -0
package/dist/src/benchmark/runner.js +224 -0
package/dist/src/benchmark/scorer.d.ts +12 -0
package/dist/src/benchmark/scorer.js +124 -0
package/dist/src/benchmark/types.d.ts +54 -0
package/dist/src/benchmark/types.js +1 -0
package/dist/src/cache/deliberation-cache.d.ts +49 -0
package/dist/src/cache/deliberation-cache.js +127 -0
package/dist/src/cli/commands/config-cmd.d.ts +11 -0
package/dist/src/cli/commands/config-cmd.js +190 -0
package/dist/src/cli/commands/demo.d.ts +12 -0
package/dist/src/cli/commands/demo.js +66 -0
package/dist/src/cli/commands/setup.d.ts +7 -0
package/dist/src/cli/commands/setup.js +182 -0
package/dist/src/cli/i18n.d.ts +89 -0
package/dist/src/cli/i18n.js +176 -0
package/dist/src/cli/interactive-select.d.ts +27 -0
package/dist/src/cli/interactive-select.js +130 -0
package/dist/src/cli/tui-setup.d.ts +24 -0
package/dist/src/cli/tui-setup.js +42 -0
package/dist/src/config/cli-detector.d.ts +37 -0
package/dist/src/config/cli-detector.js +99 -0
package/dist/src/config/user-config.d.ts +81 -0
package/dist/src/config/user-config.js +134 -0
package/dist/src/context/auto-collector.d.ts +43 -0
package/dist/src/context/auto-collector.js +337 -0
package/dist/src/context/manager.d.ts +35 -0
package/dist/src/context/manager.js +162 -0
package/dist/src/context/serializer.d.ts +20 -0
package/dist/src/context/serializer.js +52 -0
package/dist/src/demo/recorded-deliberation.d.ts +13 -0
package/dist/src/demo/recorded-deliberation.js +277 -0
package/dist/src/engine/angel-detector.d.ts +83 -0
package/dist/src/engine/angel-detector.js +334 -0
package/dist/src/engine/at-field.d.ts +40 -0
package/dist/src/engine/at-field.js +195 -0
package/dist/src/engine/berserk-orchestrator.d.ts +66 -0
package/dist/src/engine/berserk-orchestrator.js +378 -0
package/dist/src/engine/change-metrics.d.ts +56 -0
package/dist/src/engine/change-metrics.js +214 -0
package/dist/src/engine/consensus.d.ts +20 -0
package/dist/src/engine/consensus.js +146 -0
package/dist/src/engine/dead-sea-scrolls.d.ts +132 -0
package/dist/src/engine/dead-sea-scrolls.js +610 -0
package/dist/src/engine/drift-detector.d.ts +39 -0
package/dist/src/engine/drift-detector.js +225 -0
package/dist/src/engine/dummy-plug.d.ts +44 -0
package/dist/src/engine/dummy-plug.js +190 -0
package/dist/src/engine/engram-manager.d.ts +55 -0
package/dist/src/engine/engram-manager.js +306 -0
package/dist/src/engine/events.d.ts +130 -0
package/dist/src/engine/events.js +44 -0
package/dist/src/engine/gospel.d.ts +30 -0
package/dist/src/engine/gospel.js +129 -0
package/dist/src/engine/hallucination-detector.d.ts +33 -0
package/dist/src/engine/hallucination-detector.js +215 -0
package/dist/src/engine/human-resolver.d.ts +19 -0
package/dist/src/engine/human-resolver.js +89 -0
package/dist/src/engine/instrumentality.d.ts +64 -0
package/dist/src/engine/instrumentality.js +297 -0
package/dist/src/engine/iruel-battle.d.ts +79 -0
package/dist/src/engine/iruel-battle.js +319 -0
package/dist/src/engine/kernel/deliberation-kernel.d.ts +12 -0
package/dist/src/engine/kernel/deliberation-kernel.js +303 -0
package/dist/src/engine/kernel/index.d.ts +8 -0
package/dist/src/engine/kernel/index.js +7 -0
package/dist/src/engine/kernel/phase-runner.d.ts +10 -0
package/dist/src/engine/kernel/phase-runner.js +155 -0
package/dist/src/engine/kernel/post-processor.d.ts +17 -0
package/dist/src/engine/kernel/post-processor.js +131 -0
package/dist/src/engine/kernel/types.d.ts +107 -0
package/dist/src/engine/kernel/types.js +1 -0
package/dist/src/engine/kernel/unit-executor.d.ts +6 -0
package/dist/src/engine/kernel/unit-executor.js +132 -0
package/dist/src/engine/lcl-manager.d.ts +44 -0
package/dist/src/engine/lcl-manager.js +143 -0
package/dist/src/engine/middleware/cache.d.ts +7 -0
package/dist/src/engine/middleware/cache.js +29 -0
package/dist/src/engine/middleware/chain.d.ts +18 -0
package/dist/src/engine/middleware/chain.js +45 -0
package/dist/src/engine/middleware/firewall.d.ts +8 -0
package/dist/src/engine/middleware/firewall.js +24 -0
package/dist/src/engine/middleware/index.d.ts +4 -0
package/dist/src/engine/middleware/index.js +3 -0
package/dist/src/engine/middleware/types.d.ts +43 -0
package/dist/src/engine/middleware/types.js +1 -0
package/dist/src/engine/nebuchadnezzar-key.d.ts +61 -0
package/dist/src/engine/nebuchadnezzar-key.js +203 -0
package/dist/src/engine/neon-genesis.d.ts +52 -0
package/dist/src/engine/neon-genesis.js +203 -0
package/dist/src/engine/objective-judge.d.ts +53 -0
package/dist/src/engine/objective-judge.js +214 -0
package/dist/src/engine/offline-mode.d.ts +18 -0
package/dist/src/engine/offline-mode.js +46 -0
package/dist/src/engine/orchestrator.d.ts +79 -0
package/dist/src/engine/orchestrator.js +58 -0
package/dist/src/engine/secret-cipher.d.ts +26 -0
package/dist/src/engine/secret-cipher.js +114 -0
package/dist/src/engine/seele-council.d.ts +90 -0
package/dist/src/engine/seele-council.js +482 -0
package/dist/src/engine/self-destruct.d.ts +61 -0
package/dist/src/engine/self-destruct.js +231 -0
package/dist/src/engine/self-evolution.d.ts +64 -0
package/dist/src/engine/self-evolution.js +368 -0
package/dist/src/engine/sync-rate.d.ts +45 -0
package/dist/src/engine/sync-rate.js +151 -0
package/dist/src/engine/type666-firewall.d.ts +76 -0
package/dist/src/engine/type666-firewall.js +343 -0
package/dist/src/engine/umbilical-cable.d.ts +41 -0
package/dist/src/engine/umbilical-cable.js +192 -0
package/dist/src/index.d.ts +106 -0
package/dist/src/index.js +426 -0
package/dist/src/mcp/server.d.ts +38 -0
package/dist/src/mcp/server.js +196 -0
package/dist/src/metrics/token-tracker.d.ts +38 -0
package/dist/src/metrics/token-tracker.js +112 -0
package/dist/src/parsers/json-extractor.d.ts +9 -0
package/dist/src/parsers/json-extractor.js +239 -0
package/dist/src/parsers/opinion-schema.d.ts +81 -0
package/dist/src/parsers/opinion-schema.js +147 -0
package/dist/src/parsers/unstructured-parser.d.ts +20 -0
package/dist/src/parsers/unstructured-parser.js +122 -0
package/dist/src/pipelines/architecture.d.ts +10 -0
package/dist/src/pipelines/architecture.js +9 -0
package/dist/src/pipelines/bug-analysis.d.ts +9 -0
package/dist/src/pipelines/bug-analysis.js +8 -0
package/dist/src/pipelines/code-review.d.ts +10 -0
package/dist/src/pipelines/code-review.js +30 -0
package/dist/src/pipelines/custom.d.ts +14 -0
package/dist/src/pipelines/custom.js +29 -0
package/dist/src/pipelines/registry.d.ts +9 -0
package/dist/src/pipelines/registry.js +20 -0
package/dist/src/prompts/personas.d.ts +6 -0
package/dist/src/prompts/personas.js +44 -0
package/dist/src/prompts/schemas.d.ts +4 -0
package/dist/src/prompts/schemas.js +24 -0
package/dist/src/prompts/templates.d.ts +6 -0
package/dist/src/prompts/templates.js +91 -0
package/dist/src/repl/accessibility.d.ts +23 -0
package/dist/src/repl/accessibility.js +46 -0
package/dist/src/repl/banner.d.ts +4 -0
package/dist/src/repl/banner.js +28 -0
package/dist/src/repl/boot-animation.d.ts +13 -0
package/dist/src/repl/boot-animation.js +143 -0
package/dist/src/repl/completer.d.ts +21 -0
package/dist/src/repl/completer.js +168 -0
package/dist/src/repl/context.d.ts +24 -0
package/dist/src/repl/context.js +42 -0
package/dist/src/repl/display-utils.d.ts +13 -0
package/dist/src/repl/display-utils.js +65 -0
package/dist/src/repl/event-listener.d.ts +18 -0
package/dist/src/repl/event-listener.js +112 -0
package/dist/src/repl/export-formatter.d.ts +8 -0
package/dist/src/repl/export-formatter.js +73 -0
package/dist/src/repl/ghost-text.d.ts +31 -0
package/dist/src/repl/ghost-text.js +119 -0
package/dist/src/repl/handoff-animation.d.ts +15 -0
package/dist/src/repl/handoff-animation.js +65 -0
package/dist/src/repl/history.d.ts +16 -0
package/dist/src/repl/history.js +130 -0
package/dist/src/repl/job-registry.d.ts +26 -0
package/dist/src/repl/job-registry.js +80 -0
package/dist/src/repl/magi-repl.d.ts +72 -0
package/dist/src/repl/magi-repl.js +1008 -0
package/dist/src/repl/multiline-input.d.ts +45 -0
package/dist/src/repl/multiline-input.js +78 -0
package/dist/src/repl/prompt-builder.d.ts +19 -0
package/dist/src/repl/prompt-builder.js +36 -0
package/dist/src/repl/repl-state.d.ts +5 -0
package/dist/src/repl/repl-state.js +19 -0
package/dist/src/repl/result-display.d.ts +8 -0
package/dist/src/repl/result-display.js +195 -0
package/dist/src/repl/session-stats.d.ts +26 -0
package/dist/src/repl/session-stats.js +119 -0
package/dist/src/repl/slash-commands.d.ts +60 -0
package/dist/src/repl/slash-commands.js +725 -0
package/dist/src/repl/terminal-sanitize.d.ts +14 -0
package/dist/src/repl/terminal-sanitize.js +19 -0
package/dist/src/reporters/console.d.ts +7 -0
package/dist/src/reporters/console.js +78 -0
package/dist/src/reporters/json.d.ts +2 -0
package/dist/src/reporters/json.js +3 -0
package/dist/src/reporters/markdown.d.ts +2 -0
package/dist/src/reporters/markdown.js +65 -0
package/dist/src/reporters/streaming.d.ts +20 -0
package/dist/src/reporters/streaming.js +178 -0
package/dist/src/tui/activity-log.d.ts +23 -0
package/dist/src/tui/activity-log.js +67 -0
package/dist/src/tui/animations.d.ts +39 -0
package/dist/src/tui/animations.js +167 -0
package/dist/src/tui/ansi.d.ts +28 -0
package/dist/src/tui/ansi.js +51 -0
package/dist/src/tui/boot-sequence.d.ts +11 -0
package/dist/src/tui/boot-sequence.js +98 -0
package/dist/src/tui/colors.d.ts +101 -0
package/dist/src/tui/colors.js +71 -0
package/dist/src/tui/header.d.ts +24 -0
package/dist/src/tui/header.js +122 -0
package/dist/src/tui/index.d.ts +3 -0
package/dist/src/tui/index.js +3 -0
package/dist/src/tui/keypress.d.ts +25 -0
package/dist/src/tui/keypress.js +95 -0
package/dist/src/tui/layout.d.ts +74 -0
package/dist/src/tui/layout.js +171 -0
package/dist/src/tui/magi-tui.d.ts +101 -0
package/dist/src/tui/magi-tui.js +754 -0
package/dist/src/tui/panel.d.ts +45 -0
package/dist/src/tui/panel.js +292 -0
package/dist/src/tui/screen-buffer.d.ts +54 -0
package/dist/src/tui/screen-buffer.js +262 -0
package/dist/src/tui/status-bar.d.ts +25 -0
package/dist/src/tui/status-bar.js +124 -0
package/dist/src/tui/terminal-detect.d.ts +26 -0
package/dist/src/tui/terminal-detect.js +44 -0
package/dist/src/tui/tui-helpers.d.ts +12 -0
package/dist/src/tui/tui-helpers.js +37 -0
package/dist/src/types/adapter.d.ts +75 -0
package/dist/src/types/adapter.js +36 -0
package/dist/src/types/config.d.ts +108 -0
package/dist/src/types/config.js +85 -0
package/dist/src/types/consensus.d.ts +55 -0
package/dist/src/types/consensus.js +17 -0
package/dist/src/types/core.d.ts +178 -0
package/dist/src/types/core.js +85 -0
package/dist/src/types/magi-api.d.ts +62 -0
package/dist/src/types/magi-api.js +7 -0
package/dist/src/types/phase-h.d.ts +142 -0
package/dist/src/types/phase-h.js +7 -0
package/dist/src/types/phase-i.d.ts +186 -0
package/dist/src/types/phase-i.js +6 -0
package/dist/src/types/phase-k.d.ts +259 -0
package/dist/src/types/phase-k.js +6 -0
package/dist/src/types/phase-l.d.ts +199 -0
package/dist/src/types/phase-l.js +6 -0
package/dist/src/types/pipeline.d.ts +37 -0
package/dist/src/types/pipeline.js +2 -0
package/dist/src/utils/abstain-factory.d.ts +2 -0
package/dist/src/utils/abstain-factory.js +18 -0
package/dist/src/utils/errors.d.ts +34 -0
package/dist/src/utils/errors.js +59 -0
package/dist/src/utils/file-validator.d.ts +50 -0
package/dist/src/utils/file-validator.js +124 -0
package/dist/src/utils/fire-and-forget.d.ts +5 -0
package/dist/src/utils/fire-and-forget.js +10 -0
package/dist/src/utils/flag-validator.d.ts +21 -0
package/dist/src/utils/flag-validator.js +79 -0
package/dist/src/utils/freeze.d.ts +8 -0
package/dist/src/utils/freeze.js +16 -0
package/dist/src/utils/language-detector.d.ts +16 -0
package/dist/src/utils/language-detector.js +159 -0
package/dist/src/utils/latency-tracker.d.ts +45 -0
package/dist/src/utils/latency-tracker.js +100 -0
package/dist/src/utils/logger.d.ts +33 -0
package/dist/src/utils/logger.js +112 -0
package/dist/src/utils/process.d.ts +40 -0
package/dist/src/utils/process.js +253 -0
package/dist/src/utils/retry.d.ts +12 -0
package/dist/src/utils/retry.js +30 -0
package/dist/src/utils/safe-fs.d.ts +38 -0
package/dist/src/utils/safe-fs.js +56 -0
package/dist/src/utils/safe-json-parse.d.ts +15 -0
package/dist/src/utils/safe-json-parse.js +49 -0
package/dist/src/utils/sanitize.d.ts +14 -0
package/dist/src/utils/sanitize.js +186 -0
package/dist/src/utils/semaphore.d.ts +22 -0
package/dist/src/utils/semaphore.js +57 -0
package/dist/src/utils/shutdown.d.ts +6 -0
package/dist/src/utils/shutdown.js +51 -0
package/dist/src/utils/tty.d.ts +5 -0
package/dist/src/utils/tty.js +7 -0
package/package.json +82 -0

package/dist/src/benchmark/reporter.js ADDED Viewed

@@ -0,0 +1,107 @@
+/**
+ * Generate a Markdown benchmark report comparing single-Claude vs MAGI 3-body results.
+ *
+ * Sections, in order: title and notes, a summary table, a per-task table for
+ * each result that is present, then a per-category breakdown for each result.
+ *
+ * Either argument may be null/undefined. With both present the summary table
+ * carries a computed Delta column; with exactly one, the missing mode's column
+ * and the Delta column show '-'; with neither, only the title, notes and the
+ * summary table header are produced.
+ *
+ * @param singleResult - Benchmark result for the single-Claude run, or null.
+ * @param magiResult - Benchmark result for the MAGI 3-body run, or null.
+ * @returns The whole report as a single string with lines joined by '\n'.
+ *
+ * NOTE(review): task ids and categories are interpolated into table cells
+ * without escaping, so a value containing '|' would break the table layout.
+ */
+export function generateReport(singleResult, magiResult) {
+    const lines = [];
+    const now = new Date().toISOString(); // generation time shown under the title
+    lines.push('# MAGI Benchmark Results');
+    lines.push('');
+    lines.push('> **Note:** These results are from dry-run mode using mock adapters. They validate the scoring framework and golden task definitions, not actual AI model performance. Run `npm run benchmark` with real CLIs configured to generate production results.'); // NOTE(review): emitted for every report, including real (non-dry-run) results — confirm this is intended
+    lines.push('');
+    lines.push(`> Generated: ${now}`);
+    lines.push('');
+    // ── Summary Table ────────────────────────────────────────────
+    lines.push('## Summary');
+    lines.push('');
+    lines.push('| Metric | Single Claude | MAGI 3-Body | Delta |');
+    lines.push('|--------|:------------:|:-----------:|:-----:|');
+    if (singleResult && magiResult) { // both modes ran: show values side by side with a computed delta
+        const s = singleResult.aggregate;
+        const m = magiResult.aggregate;
+        lines.push(formatRow('Weighted Score', s.weightedScore, m.weightedScore, true));
+        lines.push(formatRow('Vote Accuracy', s.voteAccuracy, m.voteAccuracy, true));
+        lines.push(formatRow('Detection Coverage', s.detectionCoverage, m.detectionCoverage, true));
+        lines.push(formatRow('Avg Duration (ms)', s.avgDurationMs, m.avgDurationMs, false)); // for duration, lower is better
+    }
+    else {
+        const r = singleResult ?? magiResult; // whichever result exists; null/undefined when neither does
+        if (r) {
+            const a = r.aggregate;
+            const label = r.mode === 'single-claude' ? 'Single Claude' : 'MAGI 3-Body'; // NOTE(review): computed but never used in the output
+            lines.push(`| Weighted Score | ${r.mode === 'single-claude' ? pct(a.weightedScore) : '-'} | ${r.mode === 'magi-3-body' ? pct(a.weightedScore) : '-'} | - |`);
+            lines.push(`| Vote Accuracy | ${r.mode === 'single-claude' ? pct(a.voteAccuracy) : '-'} | ${r.mode === 'magi-3-body' ? pct(a.voteAccuracy) : '-'} | - |`);
+            lines.push(`| Detection Coverage | ${r.mode === 'single-claude' ? pct(a.detectionCoverage) : '-'} | ${r.mode === 'magi-3-body' ? pct(a.detectionCoverage) : '-'} | - |`);
+            lines.push(`| Avg Duration (ms) | ${r.mode === 'single-claude' ? ms(a.avgDurationMs) : '-'} | ${r.mode === 'magi-3-body' ? ms(a.avgDurationMs) : '-'} | - |`);
+            void label; // only silences the unused-variable warning for `label`
+        }
+    }
+    lines.push('');
+    // ── Per-Task Results ─────────────────────────────────────────
+    const results = [singleResult, magiResult].filter(Boolean); // keep only the results that were actually produced
+    for (const result of results) {
+        const modeLabel = result.mode === 'single-claude' ? 'Single Claude' : 'MAGI 3-Body';
+        lines.push(`## ${modeLabel} — Task Details`);
+        lines.push('');
+        lines.push(`- **Run ID**: ${result.runId}`);
+        lines.push(`- **Git Hash**: ${result.gitHash}`);
+        lines.push(`- **Timestamp**: ${result.timestamp}`);
+        lines.push(`- **Total Duration**: ${ms(result.aggregate.totalDurationMs)}`);
+        lines.push('');
+        lines.push('| Task ID | Category | Vote | Coverage | FP Rate | Score | Duration |');
+        lines.push('|---------|----------|:----:|:--------:|:-------:|:-----:|:--------:|');
+        for (const task of result.tasks) {
+            const voteIcon = task.voteAccuracy === 1.0 ? 'PASS' : 'FAIL'; // only a voteAccuracy of exactly 1.0 counts as PASS
+            lines.push(`| ${task.taskId} | ${task.category} | ${voteIcon} | ${pct(task.detectionCoverage)} | ${pct(task.falsePositiveRate)} | ${pct(task.compositeScore)} | ${ms(task.durationMs)} |`);
+        }
+        lines.push('');
+    }
+    // ── Category Breakdown ───────────────────────────────────────
+    for (const result of results) {
+        const modeLabel = result.mode === 'single-claude' ? 'Single Claude' : 'MAGI 3-Body';
+        lines.push(`## ${modeLabel} — Category Breakdown`);
+        lines.push('');
+        const categories = new Map(); // category -> list of that category's task scores
+        for (const task of result.tasks) {
+            const list = categories.get(task.category) ?? [];
+            list.push(task);
+            categories.set(task.category, list);
+        }
+        lines.push('| Category | Tasks | Avg Score | Vote Accuracy |');
+        lines.push('|----------|:-----:|:---------:|:-------------:|');
+        for (const [category, tasks] of categories) {
+            const avgScore = tasks.reduce((a, t) => a + t.compositeScore, 0) / tasks.length; // mean; a stored list always has at least one task
+            const avgVote = tasks.reduce((a, t) => a + t.voteAccuracy, 0) / tasks.length;
+            lines.push(`| ${category} | ${tasks.length} | ${pct(avgScore)} | ${pct(avgVote)} |`);
+        }
+        lines.push('');
+    }
+    return lines.join('\n');
+}
+// ── Formatting helpers ────────────────────────────────────────
+function pct(value) { // Format a fraction as a percentage with one decimal place, e.g. 0.5 -> "50.0%".
+    return `${(value * 100).toFixed(1)}%`;
+}
+function ms(value) { // Format a duration as a whole number of milliseconds, e.g. 12.6 -> "13ms".
+    return `${Math.round(value)}ms`;
+}
+function formatRow(label, singleVal, magiVal, higherIsBetter) { // Build one summary-table row: label, both values, and the MAGI-minus-single delta.
+    const isScore = label.includes('Score') || label.includes('Accuracy') || label.includes('Coverage'); // NOTE(review): formatting is picked by label substring — any other label is rendered as milliseconds
+    const singleStr = isScore ? pct(singleVal) : ms(singleVal);
+    const magiStr = isScore ? pct(magiVal) : ms(magiVal);
+    const diff = magiVal - singleVal; // positive means the MAGI value is larger
+    let deltaStr;
+    if (isScore) {
+        const sign = diff >= 0 ? '+' : ''; // negative numbers already carry their own '-'
+        deltaStr = `${sign}${(diff * 100).toFixed(1)}pp`; // fraction difference shown in percentage points
+    }
+    else {
+        const sign = diff >= 0 ? '+' : '';
+        deltaStr = `${sign}${Math.round(diff)}ms`;
+    }
+    // Highlight improvement vs degradation
+    const improved = higherIsBetter ? diff > 0 : diff < 0;
+    if (diff !== 0) { // NOTE(review): redundant guard — `improved` is already false when diff is 0
+        deltaStr = improved ? `**${deltaStr}**` : deltaStr; // bold improvements only; regressions stay plain
+    }
+    return `| ${label} | ${singleStr} | ${magiStr} | ${deltaStr} |`;
+}

package/dist/src/benchmark/runner.d.ts ADDED Viewed

@@ -0,0 +1,30 @@
+import type { BenchmarkResult, BenchmarkOptions } from './types.js';
+export declare class BenchmarkRunner { // Runs the golden-task benchmark in single-Claude and/or MAGI 3-body mode.
+    private readonly options;
+    private readonly tasks;
+    constructor(options?: BenchmarkOptions);
+    /**
+     * Run a single-Claude benchmark (MELCHIOR only, quorum=1, initial-opinion only).
+     */
+    runSingle(): Promise<BenchmarkResult>;
+    /**
+     * Run a MAGI 3-body benchmark (full 3-unit deliberation).
+     */
+    runMagi(): Promise<BenchmarkResult>;
+    /**
+     * Run a dry-run benchmark: each task is scored against a fabricated
+     * deliberation built from its own expected outcome, so no adapter or CLI
+     * is invoked.
+     */
+    private runDry;
+    /**
+     * Build a fake MagiDeliberation for dry-run scoring validation.
+     */
+    private buildMockDeliberation;
+    /**
+     * Run all benchmarks and generate report.
+     *
+     * `single` / `magi` are null when the corresponding run was skipped;
+     * `report` is the generated Markdown text.
+     */
+    runAll(): Promise<{
+        single: BenchmarkResult | null;
+        magi: BenchmarkResult | null;
+        report: string;
+    }>;
+}

package/dist/src/benchmark/runner.js ADDED Viewed

@@ -0,0 +1,224 @@
+import { randomUUID } from 'node:crypto';
+import { execFileSync } from 'node:child_process';
+import { join } from 'node:path';
+import { safeMkdir, safeWriteFile } from '../utils/safe-fs.js';
+import { GOLDEN_TASKS } from './golden-tasks.js';
+import { scoreTask, aggregateScores } from './scorer.js';
+import { generateReport } from './reporter.js';
+import { Magi } from '../index.js';
+import { AdapterRegistry } from '../adapters/registry.js';
+import { ClaudeAdapter } from '../adapters/claude.js';
+import { PipelineRegistry } from '../pipelines/registry.js';
+import { CodeReviewPipeline } from '../pipelines/code-review.js';
+import { ArchitecturePipeline } from '../pipelines/architecture.js';
+import { BugAnalysisPipeline } from '../pipelines/bug-analysis.js';
+import { CustomPipeline } from '../pipelines/custom.js';
+import { Orchestrator } from '../engine/orchestrator.js';
+import { MockContextManager } from '../../test/helpers/mock-context-manager.js';
+import { logger } from '../utils/logger.js';
+function getGitHash() { // Return the short hash of HEAD via `git rev-parse --short HEAD`, or 'unknown' if the command fails.
+    try {
+        return execFileSync('git', ['rev-parse', '--short', 'HEAD'], {
+            encoding: 'utf-8', // get a string back instead of a Buffer
+            timeout: 5000, // milliseconds; a timeout throws and falls through to 'unknown'
+        }).trim(); // strip the trailing newline git prints
+    }
+    catch (err) {
+        logger.debug('Benchmark: git hash retrieval failed', { error: String(err) }); // best-effort: log and continue rather than abort the benchmark
+        return 'unknown';
+    }
+}
+function buildPipelineRegistry() { // Create a PipelineRegistry with one instance of each of the four pipeline classes registered.
+    const pipelines = new PipelineRegistry();
+    pipelines.register(new CodeReviewPipeline());
+    pipelines.register(new ArchitecturePipeline());
+    pipelines.register(new BugAnalysisPipeline());
+    pipelines.register(new CustomPipeline());
+    return pipelines;
+}
+/**
+ * Create a mock response for a golden task (for dry-run mode).
+ * Returns a "perfect" response that matches the expected outcome.
+ *
+ * The vote is the task's expected vote, the reasoning embeds the task's
+ * rationale, and keyPoints repeats every must-detect point, so a scorer that
+ * searches keyPoints for those points will find all of them.
+ *
+ * @param task - Golden task; only `task.expectation` is read.
+ * @returns Object with `vote`, `confidence`, `reasoning` and `keyPoints`.
+ */
+function buildMockResponse(task) {
+    return {
+        vote: task.expectation.expectedVote,
+        confidence: 0.85, // fixed value used for every mock response
+        reasoning: `Mock analysis: ${task.expectation.rationale}`,
+        keyPoints: task.expectation.mustDetectPoints.map(String), // fresh array with every point coerced to a string
+    };
+}
+export class BenchmarkRunner { // Runs golden tasks in single-Claude and/or MAGI 3-body mode, scores them, and writes results.
+    options; // options object passed to the constructor ({} when omitted)
+    tasks; // golden tasks to run, optionally narrowed to one category
+    constructor(options = {}) {
+        this.options = options;
+        if (options.category) {
+            this.tasks = GOLDEN_TASKS.filter(t => t.category === options.category); // an unknown category yields an empty task list
+        }
+        else {
+            this.tasks = GOLDEN_TASKS;
+        }
+    }
+    /**
+     * Run a single-Claude benchmark (MELCHIOR only, quorum=1, initial-opinion only).
+     *
+     * With `options.dryRun` set, delegates to runDry and contacts no adapter.
+     */
+    async runSingle() {
+        const runId = randomUUID();
+        const scores = [];
+        if (this.options.dryRun) {
+            return this.runDry('single-claude', runId);
+        }
+        // Single-Claude: Only MELCHIOR adapter, quorum=1, single phase
+        const adapters = new AdapterRegistry();
+        adapters.register(new ClaudeAdapter({ timeoutMs: 120_000, maxRetries: 2 }));
+        const pipelines = buildPipelineRegistry();
+        const context = new MockContextManager(); // NOTE(review): imported from ../../test/helpers, a path absent from the published file list — verify it resolves at runtime
+        const orchestrator = new Orchestrator({
+            adapters,
+            pipelines,
+            contextManager: context,
+        });
+        for (const goldenTask of this.tasks) {
+            const task = {
+                ...goldenTask.task,
+                config: { // replaces any config the golden task carries
+                    phases: ['initial-opinion'],
+                    consensus: { quorum: 1 },
+                },
+            };
+            const deliberation = await orchestrator.deliberate(task); // tasks run one at a time, in order
+            scores.push(scoreTask(goldenTask, deliberation));
+        }
+        return {
+            runId,
+            timestamp: new Date().toISOString(), // taken after all tasks have finished
+            gitHash: getGitHash(),
+            mode: 'single-claude',
+            tasks: scores,
+            aggregate: aggregateScores(scores),
+        };
+    }
+    /**
+     * Run a MAGI 3-body benchmark (full 3-unit deliberation).
+     *
+     * With `options.dryRun` set, delegates to runDry and contacts no adapter.
+     */
+    async runMagi() {
+        const runId = randomUUID();
+        const scores = [];
+        if (this.options.dryRun) {
+            return this.runDry('magi-3-body', runId);
+        }
+        const magi = new Magi({ cacheEnabled: false }); // caching is turned off for the benchmark run
+        for (const goldenTask of this.tasks) {
+            const deliberation = await magi.deliberate(goldenTask.task); // sequential, one task at a time
+            scores.push(scoreTask(goldenTask, deliberation));
+        }
+        return {
+            runId,
+            timestamp: new Date().toISOString(),
+            gitHash: getGitHash(),
+            mode: 'magi-3-body',
+            tasks: scores,
+            aggregate: aggregateScores(scores),
+        };
+    }
+    /**
+     * Run a dry-run benchmark: each task is scored against a fabricated
+     * deliberation built from its own expected outcome. No adapter is created
+     * or called here.
+     */
+    async runDry(mode, runId) {
+        const scores = [];
+        for (const goldenTask of this.tasks) {
+            const mockResp = buildMockResponse(goldenTask);
+            const deliberation = this.buildMockDeliberation(goldenTask, mockResp, mode);
+            scores.push(scoreTask(goldenTask, deliberation));
+        }
+        return {
+            runId,
+            timestamp: new Date().toISOString(),
+            gitHash: getGitHash(),
+            mode,
+            tasks: scores,
+            aggregate: aggregateScores(scores),
+        };
+    }
+    /**
+     * Build a fake MagiDeliberation for dry-run scoring validation.
+     *
+     * Every unit gets the same opinion (copied from mockResp), so the
+     * resulting consensus is always unanimous.
+     */
+    buildMockDeliberation(goldenTask, mockResp, mode) {
+        const units = mode === 'single-claude'
+            ? ['MELCHIOR']
+            : ['MELCHIOR', 'BALTHASAR', 'CASPER'];
+        const opinions = units.map(unit => ({
+            unit,
+            vote: mockResp.vote,
+            confidence: mockResp.confidence,
+            reasoning: mockResp.reasoning,
+            keyPoints: mockResp.keyPoints,
+            rawOutput: JSON.stringify(mockResp),
+            meta: {
+                durationMs: 50, // fixed placeholder duration per unit
+                exitCode: 0,
+                retryCount: 0,
+                structuredOutput: true,
+            },
+        }));
+        const isApprove = mockResp.vote === 'APPROVE'; // any other vote value is treated as a reject below
+        const decision = mode === 'single-claude' // NOTE(review): both branches of this mode check are identical, so the check has no effect
+            ? (isApprove ? 'UNANIMOUS_APPROVE' : 'UNANIMOUS_REJECT')
+            : (isApprove ? 'UNANIMOUS_APPROVE' : 'UNANIMOUS_REJECT');
+        const now = new Date(); // one instant reused for every start/complete timestamp
+        return {
+            id: randomUUID(),
+            task: goldenTask.task,
+            rounds: [{
+                    roundNumber: 1,
+                    phase: 'initial-opinion',
+                    opinions,
+                    startedAt: now,
+                    completedAt: now,
+                }],
+            consensus: {
+                decision,
+                method: 'unanimous',
+                votes: {
+                    approve: isApprove ? units : [],
+                    reject: isApprove ? [] : units,
+                    abstain: [],
+                    weightedApprove: isApprove ? units.length : 0, // each unit counts with weight 1
+                    weightedReject: isApprove ? 0 : units.length,
+                },
+                confidence: mockResp.confidence,
+                summary: `Mock consensus: ${decision}`,
+            },
+            totalDurationMs: 50 * units.length, // sum of the per-unit placeholder durations
+            startedAt: now,
+            completedAt: now,
+        };
+    }
+    /**
+     * Run all benchmarks and generate report.
+     *
+     * `options.magiOnly` skips the single-Claude run and `options.singleOnly`
+     * skips the MAGI run; a skipped run is returned as null. Each produced
+     * result is written as JSON under the output directory, and the Markdown
+     * report is written to docs/BENCHMARK_RESULTS.md.
+     */
+    async runAll() {
+        let single = null;
+        let magi = null;
+        if (!this.options.magiOnly) {
+            single = await this.runSingle();
+        }
+        if (!this.options.singleOnly) {
+            magi = await this.runMagi();
+        }
+        const report = generateReport(single, magi);
+        // Save results
+        const outputDir = this.options.outputDir ?? 'test/benchmark/results';
+        await safeMkdir(outputDir);
+        const timestamp = new Date().toISOString().replace(/[:.]/g, '-'); // ':' and '.' replaced so the stamp can be used in a file name
+        if (single) {
+            await safeWriteFile(join(outputDir, `single-${timestamp}.json`), JSON.stringify(single, null, 2));
+        }
+        if (magi) {
+            await safeWriteFile(join(outputDir, `magi-${timestamp}.json`), JSON.stringify(magi, null, 2));
+        }
+        await safeWriteFile('docs/BENCHMARK_RESULTS.md', report); // NOTE(review): fixed path that ignores outputDir, and no mkdir for docs/ is done here — confirm safeWriteFile copes with a missing directory
+        return { single, magi, report };
+    }
+}

package/dist/src/benchmark/scorer.d.ts ADDED Viewed

@@ -0,0 +1,12 @@
+import type { MagiDeliberation } from '../types/core.js';
+import type { GoldenTask, TaskScore, AggregateScore } from './types.js';
+/**
+ * Score a single golden task against its deliberation result.
+ *
+ * compositeScore = voteAccuracy * 0.4 + detectionCoverage * 0.5 + (1 - falsePositiveRate) * 0.1
+ *
+ * @param goldenTask - The golden task being scored.
+ * @param deliberation - The deliberation produced for that task.
+ * @returns The score record for this one task.
+ */
+export declare function scoreTask(goldenTask: GoldenTask, deliberation: MagiDeliberation): TaskScore;
+/**
+ * Aggregate scores across all tasks.
+ *
+ * @param scores - Per-task scores; the array is declared readonly.
+ * @returns The combined aggregate for the whole run.
+ */
+export declare function aggregateScores(scores: readonly TaskScore[]): AggregateScore;

package/dist/src/benchmark/scorer.js ADDED Viewed

@@ -0,0 +1,124 @@
+import { getMajorityVote } from '../types/consensus.js';
+/**
+ * Extract the effective vote from a ConsensusDecision.
+ */
+function decisionToVote(decision) {
+    return getMajorityVote(decision);
+}
+/**
+ * Compute detection coverage: fraction of mustDetectPoints found in the output.
+ */
+function computeDetectionCoverage(mustDetectPoints, searchText) {
+    if (mustDetectPoints.length === 0)
+        return 1.0;
+    const lower = searchText.toLowerCase();
+    let found = 0;
+    for (const point of mustDetectPoints) {
+        if (lower.includes(point.toLowerCase())) {
+            found++;
+        }
+    }
+    return found / mustDetectPoints.length;
+}
+/**
+ * Compute false positive rate: fraction of mustNotDetectPoints found in the output.
+ */
+function computeFalsePositiveRate(mustNotDetectPoints, searchText) {
+    if (!mustNotDetectPoints || mustNotDetectPoints.length === 0)
+        return 0.0;
+    const lower = searchText.toLowerCase();
+    let found = 0;
+    for (const point of mustNotDetectPoints) {
+        if (lower.includes(point.toLowerCase())) {
+            found++;
+        }
+    }
+    return found / mustNotDetectPoints.length;
+}
+/**
+ * Build the combined text to search for detection points.
+ * Uses all opinions from the last round's keyPoints + reasoning.
+ */
+function buildSearchText(deliberation) {
+    const lastRound = deliberation.rounds[deliberation.rounds.length - 1];
+    if (!lastRound)
+        return '';
+    const parts = [];
+    for (const opinion of lastRound.opinions) {
+        parts.push(opinion.reasoning);
+        parts.push(...opinion.keyPoints);
+        if (opinion.suggestions) {
+            parts.push(...opinion.suggestions);
+        }
+    }
+    return parts.join(' ');
+}
+/**
+ * Compute the average confidence across all opinions in the last round.
+ */
+function computeAvgConfidence(deliberation) {
+    const lastRound = deliberation.rounds[deliberation.rounds.length - 1];
+    if (!lastRound || lastRound.opinions.length === 0)
+        return 0;
+    const sum = lastRound.opinions.reduce((acc, op) => acc + op.confidence, 0);
+    return sum / lastRound.opinions.length;
+}
+/**
+ * Score a single golden task against its deliberation result.
+ *
+ * compositeScore = voteAccuracy * 0.4 + detectionCoverage * 0.5 + (1 - falsePositiveRate) * 0.1
+ */
+export function scoreTask(goldenTask, deliberation) {
+    const { expectation } = goldenTask;
+    const consensusVote = decisionToVote(deliberation.consensus.decision);
+    const voteAccuracy = consensusVote === expectation.expectedVote ? 1.0 : 0.0;
+    const searchText = buildSearchText(deliberation);
+    const detectionCoverage = computeDetectionCoverage(expectation.mustDetectPoints, searchText);
+    const falsePositiveRate = computeFalsePositiveRate(expectation.mustNotDetectPoints, searchText);
+    const compositeScore = voteAccuracy * 0.4 +
+        detectionCoverage * 0.5 +
+        (1 - falsePositiveRate) * 0.1;
+    return {
+        taskId: goldenTask.id,
+        category: goldenTask.category,
+        weight: expectation.weight,
+        voteAccuracy,
+        detectionCoverage,
+        falsePositiveRate,
+        compositeScore,
+        durationMs: deliberation.totalDurationMs,
+        avgConfidence: computeAvgConfidence(deliberation),
+    };
+}
+/**
+ * Aggregate scores across all tasks.
+ */
+export function aggregateScores(scores) {
+    if (scores.length === 0) {
+        return {
+            totalScore: 0,
+            weightedScore: 0,
+            voteAccuracy: 0,
+            detectionCoverage: 0,
+            avgDurationMs: 0,
+            totalDurationMs: 0,
+        };
+    }
+    const totalWeight = scores.reduce((acc, s) => acc + s.weight, 0);
+    const totalScore = scores.reduce((acc, s) => acc + s.compositeScore, 0) / scores.length;
+    const weightedScore = totalWeight > 0
+        ? scores.reduce((acc, s) => acc + s.compositeScore * s.weight, 0) / totalWeight
+        : 0;
+    const voteAccuracy = scores.reduce((acc, s) => acc + s.voteAccuracy, 0) / scores.length;
+    const detectionCoverage = scores.reduce((acc, s) => acc + s.detectionCoverage, 0) / scores.length;
+    const totalDurationMs = scores.reduce((acc, s) => acc + s.durationMs, 0);
+    const avgDurationMs = totalDurationMs / scores.length;
+    return {
+        totalScore,
+        weightedScore,
+        voteAccuracy,
+        detectionCoverage,
+        avgDurationMs,
+        totalDurationMs,
+    };
+}

package/dist/src/benchmark/types.d.ts ADDED Viewed

@@ -0,0 +1,54 @@
+import type { MagiTask, Vote } from '../types/core.js';
+/** Expected outcome for a golden task */
+export interface GoldenExpectation {
+    /** Vote the deliberation is expected to arrive at. */
+    expectedVote: Vote;
+    /** Phrases that are expected to show up in the deliberation output. */
+    mustDetectPoints: string[];
+    /** Phrases that are expected NOT to show up in the deliberation output (optional). */
+    mustNotDetectPoints?: string[];
+    /** Relative weight of this task; presumably used for weighted aggregation — confirm in the scorer. */
+    weight: number;
+    /** NOTE(review): presumably a human-readable justification of the expectation — confirm. */
+    rationale: string;
+}
+/** A golden task with known-correct expected outcome */
+export interface GoldenTask {
+    /** Identifier of the task; presumably unique within the golden set — confirm. */
+    id: string;
+    /** The task to deliberate on. */
+    task: MagiTask;
+    /** The outcome a correct deliberation should produce. */
+    expectation: GoldenExpectation;
+    /** Kind of task, restricted to the four listed categories. */
+    category: 'code-review' | 'architecture' | 'bug-analysis' | 'security';
+}
+/** Score for a single task evaluation */
+export interface TaskScore {
+    /** Identifier of the scored golden task. */
+    taskId: string;
+    /** Category of the scored golden task. */
+    category: string;
+    /** Weight of the scored golden task. */
+    weight: number;
+    /** Vote-accuracy component of the score (presumably in 0..1 — confirm in the scorer). */
+    voteAccuracy: number;
+    /** Detection-coverage component of the score (presumably in 0..1 — confirm in the scorer). */
+    detectionCoverage: number;
+    /** False-positive-rate component of the score (presumably in 0..1 — confirm in the scorer). */
+    falsePositiveRate: number;
+    /** Overall score combining the three components above. */
+    compositeScore: number;
+    /** Duration of the evaluated deliberation, in milliseconds. */
+    durationMs: number;
+    /** Average confidence associated with the evaluated deliberation. */
+    avgConfidence: number;
+}
+/** Aggregate scores across all tasks */
+export interface AggregateScore {
+    /** Overall composite score across tasks (unweighted, as opposed to `weightedScore`). */
+    totalScore: number;
+    /** Overall composite score across tasks, taking task weights into account. */
+    weightedScore: number;
+    /** Overall vote accuracy across tasks. */
+    voteAccuracy: number;
+    /** Overall detection coverage across tasks. */
+    detectionCoverage: number;
+    /** Average task duration, in milliseconds. */
+    avgDurationMs: number;
+    /** Total duration of all tasks, in milliseconds. */
+    totalDurationMs: number;
+}
+/** Complete benchmark result for one mode */
+export interface BenchmarkResult {
+    /** Identifier of this benchmark run. */
+    runId: string;
+    /** NOTE(review): presumably the time the run was recorded; format not visible here — confirm. */
+    timestamp: string;
+    /** NOTE(review): presumably the git commit the benchmark ran against — confirm. */
+    gitHash: string;
+    /** Which benchmark mode produced this result. */
+    mode: 'single-claude' | 'magi-3-body';
+    /** Per-task scores. */
+    tasks: TaskScore[];
+    /** Metrics aggregated over `tasks`. */
+    aggregate: AggregateScore;
+}
+/** Options for the benchmark runner */
+export interface BenchmarkOptions {
+    /** NOTE(review): presumably runs without calling real models — confirm in the runner. */
+    dryRun?: boolean;
+    /** Run only the single-model benchmark. */
+    singleOnly?: boolean;
+    /** Run only the MAGI benchmark. */
+    magiOnly?: boolean;
+    /** NOTE(review): presumably restricts the run to golden tasks of one category — confirm. */
+    category?: string;
+    /** Directory that result files are written to. */
+    outputDir?: string;
+}

package/dist/src/benchmark/types.js ADDED Viewed

@@ -0,0 +1 @@
+export {};

package/dist/src/cache/deliberation-cache.d.ts ADDED Viewed

@@ -0,0 +1,49 @@
+import type { MagiTask, MagiDeliberation } from '../types/core.js';
+/**
+ * In-memory cache for deliberation results.
+ * Keyed by a SHA-256 hash of task attributes (type + title + description + artifacts).
+ *
+ * Entries carry a time-to-live (`ttlMs`) and the cache is bounded by
+ * `maxEntries`. NOTE(review): the eviction policy applied when the bound is
+ * reached is not visible from this declaration — confirm in the implementation.
+ */
+export declare class DeliberationCache {
+    private readonly ttlMs;
+    private readonly maxEntries;
+    private store;
+    private inflight;
+    /**
+     * @param ttlMs entry lifetime in milliseconds
+     * @param maxEntries upper bound on stored entries (default not visible here)
+     */
+    constructor(ttlMs?: number, // 1 hour default
+    maxEntries?: number);
+    /**
+     * Compute a deterministic cache key from a MagiTask.
+     *
+     * @returns the key string for `task`
+     */
+    computeKey(task: MagiTask): string;
+    /**
+     * Get a cached deliberation result.
+     * Returns null on miss or TTL expiry.
+     * Restores Date objects and sets fromCache=true.
+     */
+    get(task: MagiTask): MagiDeliberation | null;
+    /**
+     * Store a deliberation result in the cache.
+     */
+    set(task: MagiTask, deliberation: MagiDeliberation): void;
+    /**
+     * Get from cache or compute. Prevents cache stampede by coalescing
+     * concurrent requests for the same cache key.
+     *
+     * @param task task whose result is wanted
+     * @param compute producer invoked when no cached result is available
+     */
+    getOrCompute(task: MagiTask, compute: () => Promise<MagiDeliberation>): Promise<MagiDeliberation>;
+    /**
+     * Remove all entries from the cache.
+     */
+    clear(): void;
+    /**
+     * Remove expired entries.
+     *
+     * @returns presumably the number of entries removed — confirm in the implementation
+     */
+    prune(): number;
+    /**
+     * Cache statistics.
+     *
+     * @returns the current entry count, the configured TTL and entry bound,
+     *   and the number of in-flight computations
+     */
+    stats(): {
+        size: number;
+        ttlMs: number;
+        maxEntries: number;
+        inflightCount: number;
+    };
+}