npm - @mainahq/core - Versions diffs - 0.2.0 - Mend

@mainahq/core 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (156) hide show

package/README.md +31 -0
package/package.json +37 -0
package/src/ai/__tests__/ai.test.ts +207 -0
package/src/ai/__tests__/design-approaches.test.ts +192 -0
package/src/ai/__tests__/spec-questions.test.ts +191 -0
package/src/ai/__tests__/tiers.test.ts +110 -0
package/src/ai/commit-msg.ts +28 -0
package/src/ai/design-approaches.ts +76 -0
package/src/ai/index.ts +205 -0
package/src/ai/pr-summary.ts +60 -0
package/src/ai/spec-questions.ts +74 -0
package/src/ai/tiers.ts +52 -0
package/src/ai/try-generate.ts +89 -0
package/src/ai/validate.ts +66 -0
package/src/benchmark/__tests__/reporter.test.ts +525 -0
package/src/benchmark/__tests__/runner.test.ts +113 -0
package/src/benchmark/__tests__/story-loader.test.ts +152 -0
package/src/benchmark/reporter.ts +332 -0
package/src/benchmark/runner.ts +91 -0
package/src/benchmark/story-loader.ts +88 -0
package/src/benchmark/types.ts +95 -0
package/src/cache/__tests__/keys.test.ts +97 -0
package/src/cache/__tests__/manager.test.ts +312 -0
package/src/cache/__tests__/ttl.test.ts +94 -0
package/src/cache/keys.ts +44 -0
package/src/cache/manager.ts +231 -0
package/src/cache/ttl.ts +77 -0
package/src/config/__tests__/config.test.ts +376 -0
package/src/config/index.ts +198 -0
package/src/context/__tests__/budget.test.ts +179 -0
package/src/context/__tests__/engine.test.ts +163 -0
package/src/context/__tests__/episodic.test.ts +291 -0
package/src/context/__tests__/relevance.test.ts +323 -0
package/src/context/__tests__/retrieval.test.ts +143 -0
package/src/context/__tests__/selector.test.ts +174 -0
package/src/context/__tests__/semantic.test.ts +252 -0
package/src/context/__tests__/treesitter.test.ts +229 -0
package/src/context/__tests__/working.test.ts +236 -0
package/src/context/budget.ts +130 -0
package/src/context/engine.ts +394 -0
package/src/context/episodic.ts +251 -0
package/src/context/relevance.ts +325 -0
package/src/context/retrieval.ts +325 -0
package/src/context/selector.ts +93 -0
package/src/context/semantic.ts +331 -0
package/src/context/treesitter.ts +216 -0
package/src/context/working.ts +192 -0
package/src/db/__tests__/db.test.ts +151 -0
package/src/db/index.ts +211 -0
package/src/db/schema.ts +84 -0
package/src/design/__tests__/design.test.ts +310 -0
package/src/design/__tests__/generate-hld-lld.test.ts +109 -0
package/src/design/__tests__/review.test.ts +561 -0
package/src/design/index.ts +297 -0
package/src/design/review.ts +327 -0
package/src/explain/__tests__/explain.test.ts +173 -0
package/src/explain/index.ts +181 -0
package/src/features/__tests__/analyzer.test.ts +358 -0
package/src/features/__tests__/checklist.test.ts +454 -0
package/src/features/__tests__/numbering.test.ts +319 -0
package/src/features/__tests__/quality.test.ts +295 -0
package/src/features/__tests__/traceability.test.ts +147 -0
package/src/features/analyzer.ts +445 -0
package/src/features/checklist.ts +366 -0
package/src/features/index.ts +18 -0
package/src/features/numbering.ts +404 -0
package/src/features/quality.ts +349 -0
package/src/features/test-stubs.ts +157 -0
package/src/features/traceability.ts +260 -0
package/src/feedback/__tests__/async-feedback.test.ts +52 -0
package/src/feedback/__tests__/collector.test.ts +219 -0
package/src/feedback/__tests__/compress.test.ts +150 -0
package/src/feedback/__tests__/preferences.test.ts +169 -0
package/src/feedback/collector.ts +135 -0
package/src/feedback/compress.ts +92 -0
package/src/feedback/preferences.ts +108 -0
package/src/git/__tests__/git.test.ts +62 -0
package/src/git/index.ts +110 -0
package/src/hooks/__tests__/runner.test.ts +266 -0
package/src/hooks/index.ts +8 -0
package/src/hooks/runner.ts +130 -0
package/src/index.ts +356 -0
package/src/init/__tests__/init.test.ts +228 -0
package/src/init/index.ts +364 -0
package/src/language/__tests__/detect.test.ts +77 -0
package/src/language/__tests__/profile.test.ts +51 -0
package/src/language/detect.ts +70 -0
package/src/language/profile.ts +110 -0
package/src/prompts/__tests__/defaults.test.ts +52 -0
package/src/prompts/__tests__/engine.test.ts +183 -0
package/src/prompts/__tests__/evolution-resolve.test.ts +169 -0
package/src/prompts/__tests__/evolution.test.ts +187 -0
package/src/prompts/__tests__/loader.test.ts +105 -0
package/src/prompts/candidates/review-v2.md +55 -0
package/src/prompts/defaults/ai-review.md +49 -0
package/src/prompts/defaults/commit.md +30 -0
package/src/prompts/defaults/context.md +26 -0
package/src/prompts/defaults/design-approaches.md +57 -0
package/src/prompts/defaults/design-hld-lld.md +55 -0
package/src/prompts/defaults/design.md +53 -0
package/src/prompts/defaults/explain.md +31 -0
package/src/prompts/defaults/fix.md +32 -0
package/src/prompts/defaults/index.ts +38 -0
package/src/prompts/defaults/review.md +41 -0
package/src/prompts/defaults/spec-questions.md +59 -0
package/src/prompts/defaults/tests.md +72 -0
package/src/prompts/engine.ts +137 -0
package/src/prompts/evolution.ts +409 -0
package/src/prompts/loader.ts +71 -0
package/src/review/__tests__/review.test.ts +288 -0
package/src/review/comprehensive.ts +362 -0
package/src/review/index.ts +417 -0
package/src/stats/__tests__/tracker.test.ts +323 -0
package/src/stats/index.ts +11 -0
package/src/stats/tracker.ts +492 -0
package/src/ticket/__tests__/ticket.test.ts +273 -0
package/src/ticket/index.ts +185 -0
package/src/utils.ts +87 -0
package/src/verify/__tests__/ai-review.test.ts +242 -0
package/src/verify/__tests__/coverage.test.ts +83 -0
package/src/verify/__tests__/detect.test.ts +175 -0
package/src/verify/__tests__/diff-filter.test.ts +338 -0
package/src/verify/__tests__/fix.test.ts +478 -0
package/src/verify/__tests__/linters/clippy.test.ts +45 -0
package/src/verify/__tests__/linters/go-vet.test.ts +27 -0
package/src/verify/__tests__/linters/ruff.test.ts +64 -0
package/src/verify/__tests__/mutation.test.ts +141 -0
package/src/verify/__tests__/pipeline.test.ts +553 -0
package/src/verify/__tests__/proof.test.ts +97 -0
package/src/verify/__tests__/secretlint.test.ts +190 -0
package/src/verify/__tests__/semgrep.test.ts +217 -0
package/src/verify/__tests__/slop.test.ts +366 -0
package/src/verify/__tests__/sonar.test.ts +113 -0
package/src/verify/__tests__/syntax-guard.test.ts +227 -0
package/src/verify/__tests__/trivy.test.ts +191 -0
package/src/verify/__tests__/visual.test.ts +139 -0
package/src/verify/ai-review.ts +276 -0
package/src/verify/coverage.ts +134 -0
package/src/verify/detect.ts +171 -0
package/src/verify/diff-filter.ts +183 -0
package/src/verify/fix.ts +317 -0
package/src/verify/linters/clippy.ts +52 -0
package/src/verify/linters/go-vet.ts +32 -0
package/src/verify/linters/ruff.ts +47 -0
package/src/verify/mutation.ts +143 -0
package/src/verify/pipeline.ts +328 -0
package/src/verify/proof.ts +277 -0
package/src/verify/secretlint.ts +168 -0
package/src/verify/semgrep.ts +170 -0
package/src/verify/slop.ts +493 -0
package/src/verify/sonar.ts +146 -0
package/src/verify/syntax-guard.ts +251 -0
package/src/verify/trivy.ts +161 -0
package/src/verify/visual.ts +460 -0
package/src/workflow/__tests__/context.test.ts +110 -0
package/src/workflow/context.ts +81 -0

package/src/benchmark/__tests__/story-loader.test.ts ADDED Viewed

@@ -0,0 +1,152 @@
+import { afterEach, beforeEach, describe, expect, test } from "bun:test";
+import { mkdirSync, rmSync, writeFileSync } from "node:fs";
+import { join } from "node:path";
+import { listStories, loadStory } from "../story-loader";
+// Scratch directory, recreated before every test so cases never share fixtures.
+let tmpDir: string;
+beforeEach(() => {
+	// Timestamp plus a random base-36 suffix keeps directory names unique.
+	const uniqueSuffix = `${Date.now()}-${Math.random().toString(36).slice(2)}`;
+	tmpDir = join(import.meta.dir, `tmp-stories-${uniqueSuffix}`);
+	mkdirSync(tmpDir, { recursive: true });
+});
+afterEach(() => {
+	// Cleanup is best-effort: a failed removal must not fail the test itself.
+	try {
+		rmSync(tmpDir, { force: true, recursive: true });
+	} catch {
+		// ignore
+	}
+});
+/**
+ * Write a story fixture under `tmpDir/<name>`: `story.json` (the serialized
+ * config), `spec.md`, and a single test file at `tests/test.ts`.
+ */
+function createStory(
+	name: string,
+	config: Record<string, unknown>,
+	spec = "# Test Spec\n",
+	testContent = 'test("stub", () => {});',
+) {
+	const root = join(tmpDir, name);
+	const testsDir = join(root, "tests");
+	// Creating the nested tests directory also creates the story root.
+	mkdirSync(testsDir, { recursive: true });
+	const fixtures: Array<[string, string]> = [
+		[join(root, "story.json"), JSON.stringify(config)],
+		[join(root, "spec.md"), spec],
+		[join(testsDir, "test.ts"), testContent],
+	];
+	for (const [filePath, content] of fixtures) {
+		writeFileSync(filePath, content);
+	}
+}
+// listStories: a missing directory yields an empty list, valid stories are
+// returned, and directories without a story.json are skipped.
+describe("listStories", () => {
+	test("returns empty array when stories directory does not exist", () => {
+		const missingDir = join(tmpDir, "nonexistent");
+		const outcome = listStories(missingDir);
+		expect(outcome.ok).toBe(true);
+		if (!outcome.ok) return;
+		expect(outcome.value).toEqual([]);
+	});
+	test("lists stories with valid story.json", () => {
+		const mittConfig = {
+			name: "mitt",
+			description: "Event emitter",
+			tier: 1,
+			source: "https://github.com/developit/mitt",
+			testFiles: ["tests/test.ts"],
+			metrics: { expectedTests: 18, originalLOC: 80, complexity: "easy" },
+		};
+		const msConfig = {
+			name: "ms",
+			description: "Time converter",
+			tier: 2,
+			source: "https://github.com/vercel/ms",
+			testFiles: ["tests/test.ts"],
+			metrics: { expectedTests: 50, originalLOC: 200, complexity: "medium" },
+		};
+		createStory("mitt", mittConfig);
+		createStory("ms", msConfig);
+		const outcome = listStories(tmpDir);
+		expect(outcome.ok).toBe(true);
+		if (!outcome.ok) return;
+		expect(outcome.value).toHaveLength(2);
+		// Directory listing order is not asserted, so compare sorted names.
+		const names = outcome.value.map((story) => story.name);
+		expect(names.sort()).toEqual(["mitt", "ms"]);
+	});
+	test("skips directories without story.json", () => {
+		const validConfig = {
+			name: "valid",
+			description: "V",
+			tier: 1,
+			source: "s",
+			testFiles: ["tests/test.ts"],
+			metrics: { expectedTests: 1, originalLOC: 10, complexity: "easy" },
+		};
+		createStory("valid", validConfig);
+		// A bare directory with no story.json must not be reported.
+		mkdirSync(join(tmpDir, "no-config"), { recursive: true });
+		const outcome = listStories(tmpDir);
+		expect(outcome.ok).toBe(true);
+		if (!outcome.ok) return;
+		expect(outcome.value).toHaveLength(1);
+		expect(outcome.value[0]?.name).toBe("valid");
+	});
+});
+// loadStory: the happy path returns config, spec, and tests; missing or
+// malformed story files produce an error result.
+describe("loadStory", () => {
+	test("loads a valid story with config, spec, and tests", () => {
+		const mittConfig = {
+			name: "mitt",
+			description: "Event emitter",
+			tier: 1,
+			source: "https://github.com/developit/mitt",
+			testFiles: ["tests/test.ts"],
+			metrics: { expectedTests: 18, originalLOC: 80, complexity: "easy" },
+		};
+		const specText = "# Mitt Spec\nRequirements here.";
+		const testText = 'import { test } from "bun:test";\ntest("foo", () => {});';
+		createStory("mitt", mittConfig, specText, testText);
+		const outcome = loadStory(tmpDir, "mitt");
+		expect(outcome.ok).toBe(true);
+		if (!outcome.ok) return;
+		const loaded = outcome.value;
+		expect(loaded.config.name).toBe("mitt");
+		expect(loaded.specContent).toContain("# Mitt Spec");
+		expect(loaded.testFiles).toHaveLength(1);
+		expect(loaded.testFiles[0]?.content).toContain('test("foo"');
+	});
+	test("returns error for nonexistent story", () => {
+		const outcome = loadStory(tmpDir, "nonexistent");
+		expect(outcome.ok).toBe(false);
+	});
+	test("returns error for missing spec.md", () => {
+		// Only story.json is written; spec.md is deliberately absent.
+		const badDir = join(tmpDir, "bad");
+		mkdirSync(badDir, { recursive: true });
+		const badConfig = {
+			name: "bad",
+			description: "B",
+			tier: 1,
+			source: "s",
+			testFiles: [],
+			metrics: { expectedTests: 0, originalLOC: 0, complexity: "easy" },
+		};
+		writeFileSync(join(badDir, "story.json"), JSON.stringify(badConfig));
+		const outcome = loadStory(tmpDir, "bad");
+		expect(outcome.ok).toBe(false);
+	});
+	test("returns error for invalid story.json", () => {
+		// spec.md exists, so the failure can only come from the unparsable config.
+		const invalidDir = join(tmpDir, "invalid");
+		mkdirSync(invalidDir, { recursive: true });
+		writeFileSync(join(invalidDir, "story.json"), "not json");
+		writeFileSync(join(invalidDir, "spec.md"), "# Spec\n");
+		const outcome = loadStory(tmpDir, "invalid");
+		expect(outcome.ok).toBe(false);
+	});
+});

package/src/benchmark/reporter.ts ADDED Viewed

@@ -0,0 +1,332 @@
+import type {
+	BenchmarkMetrics,
+	BenchmarkReport,
+	StepMetrics,
+	StoryConfig,
+	Tier3Results,
+	Tier3Totals,
+} from "./types";
+/**
+ * Assemble a comparison report for one story from the metrics of both
+ * pipelines.
+ *
+ * The winner is the pipeline with more passing tests; equal counts give
+ * "tie". When either pipeline's metrics are null the winner stays
+ * "incomplete". The timestamp is the current time as an ISO string.
+ */
+export function buildReport(
+	story: StoryConfig,
+	maina: BenchmarkMetrics | null,
+	speckit: BenchmarkMetrics | null,
+): BenchmarkReport {
+	const pickWinner = (): BenchmarkReport["winner"] => {
+		if (!maina || !speckit) return "incomplete";
+		if (maina.testsPassed > speckit.testsPassed) return "maina";
+		if (speckit.testsPassed > maina.testsPassed) return "speckit";
+		return "tie";
+	};
+	const winner = pickWinner();
+	const timestamp = new Date().toISOString();
+	return { story, maina, speckit, timestamp, winner };
+}
+/**
+ * Render one metric as a table cell. A pipeline with no metrics renders as
+ * "—"; any present value is converted with `String`.
+ */
+function metricValue(
+	metrics: BenchmarkMetrics | null,
+	key: keyof BenchmarkMetrics,
+): string {
+	return metrics ? String(metrics[key]) : "—";
+}
+/**
+ * Render a comparison report as a fixed-width, three-column text table
+ * (metric label, maina value, speckit value) followed by the winner line.
+ * Metrics of a pipeline that did not run are shown as "—".
+ */
+export function formatComparison(report: BenchmarkReport): string {
+	// Display label paired with the metric it reads, in output order.
+	const metricRows: Array<[string, keyof BenchmarkMetrics]> = [
+		["Tests Passed", "testsPassed"],
+		["Tests Failed", "testsFailed"],
+		["Tests Total", "testsTotal"],
+		["Wall Clock (ms)", "wallClockMs"],
+		["Tokens In", "tokensInput"],
+		["Tokens Out", "tokensOutput"],
+		["Verify Findings", "verifyFindings"],
+		["Spec Quality", "specQualityScore"],
+		["Impl LOC", "implLOC"],
+		["Attempts to Pass", "attemptsToPass"],
+		["Bugs Introduced", "bugsIntroduced"],
+	];
+	const output: string[] = [
+		`\n## Benchmark: ${report.story.name} (tier ${report.story.tier})\n`,
+	];
+	// Label column is left-aligned; both value columns are right-aligned.
+	const addRow = (label: string, mainaCell: string, speckitCell: string) => {
+		output.push(
+			`  ${label.padEnd(24)} ${mainaCell.padStart(12)} ${speckitCell.padStart(12)}`,
+		);
+	};
+	addRow("Metric", "maina", "speckit");
+	addRow("─".repeat(24), "─".repeat(12), "─".repeat(12));
+	for (const [label, key] of metricRows) {
+		addRow(
+			label,
+			metricValue(report.maina, key),
+			metricValue(report.speckit, key),
+		);
+	}
+	output.push("", `  Winner: ${report.winner}`, "");
+	return output.join("\n");
+}
+/**
+ * Sum duration and token counts across every step, then attach the bug and
+ * test counters from `meta` unchanged.
+ */
+function computeTotals(
+	steps: Record<string, StepMetrics>,
+	meta: {
+		bugsIntroduced: number;
+		bugsCaught: number;
+		testsPassed: number;
+		testsTotal: number;
+	},
+): Tier3Totals {
+	const sums = Object.values(steps).reduce(
+		(acc, step) => ({
+			durationMs: acc.durationMs + step.durationMs,
+			tokensInput: acc.tokensInput + step.tokensInput,
+			tokensOutput: acc.tokensOutput + step.tokensOutput,
+		}),
+		{ durationMs: 0, tokensInput: 0, tokensOutput: 0 },
+	);
+	const { bugsIntroduced, bugsCaught, testsPassed, testsTotal } = meta;
+	return { ...sums, bugsIntroduced, bugsCaught, testsPassed, testsTotal };
+}
+/**
+ * Pick the tier 3 winner by the first criterion on which the pipelines
+ * differ:
+ * 1. Test pass rate (higher wins; a pipeline with zero tests has rate 0)
+ * 2. Bugs caught (higher wins)
+ * 3. Total duration (lower wins)
+ * Returns "tie" when all three are equal.
+ */
+function determineTier3Winner(
+	maina: Tier3Totals,
+	speckit: Tier3Totals,
+): Tier3Results["winner"] {
+	const passRate = (totals: Tier3Totals): number =>
+		totals.testsTotal > 0 ? totals.testsPassed / totals.testsTotal : 0;
+	// [maina value, speckit value, whether the higher value wins], by priority.
+	const criteria: Array<[number, number, boolean]> = [
+		[passRate(maina), passRate(speckit), true],
+		[maina.bugsCaught, speckit.bugsCaught, true],
+		[maina.durationMs, speckit.durationMs, false],
+	];
+	for (const [mainaValue, speckitValue, higherWins] of criteria) {
+		if (mainaValue === speckitValue) continue;
+		const mainaAhead = higherWins
+			? mainaValue > speckitValue
+			: mainaValue < speckitValue;
+		return mainaAhead ? "maina" : "speckit";
+	}
+	return "tie";
+}
+/**
+ * Build the tier 3 results for a story from each pipeline's per-step metrics.
+ *
+ * Totals are computed per pipeline; the bug and test counters come from
+ * `meta` and default to zero when it is omitted. A winner is only determined
+ * when both pipelines recorded at least one step; otherwise it is
+ * "incomplete".
+ */
+export function buildTier3Report(
+	story: StoryConfig,
+	mainaSteps: Record<string, StepMetrics>,
+	speckitSteps: Record<string, StepMetrics>,
+	learnings: string[],
+	meta?: {
+		maina: {
+			bugsIntroduced: number;
+			bugsCaught: number;
+			testsPassed: number;
+			testsTotal: number;
+		};
+		speckit: {
+			bugsIntroduced: number;
+			bugsCaught: number;
+			testsPassed: number;
+			testsTotal: number;
+		};
+	},
+): Tier3Results {
+	// A fresh zeroed counter object for each pipeline that has no metadata.
+	const zeroMeta = () => ({
+		bugsIntroduced: 0,
+		bugsCaught: 0,
+		testsPassed: 0,
+		testsTotal: 0,
+	});
+	const mainaTotals = computeTotals(mainaSteps, meta?.maina ?? zeroMeta());
+	const speckitTotals = computeTotals(
+		speckitSteps,
+		meta?.speckit ?? zeroMeta(),
+	);
+	const bothRan =
+		Object.keys(mainaSteps).length > 0 && Object.keys(speckitSteps).length > 0;
+	let winner: Tier3Results["winner"] = "incomplete";
+	if (bothRan) {
+		winner = determineTier3Winner(mainaTotals, speckitTotals);
+	}
+	return {
+		story,
+		timestamp: new Date().toISOString(),
+		maina: { steps: mainaSteps, totals: mainaTotals },
+		speckit: { steps: speckitSteps, totals: speckitTotals },
+		winner,
+		learnings,
+	};
+}
+/**
+ * Render tier 3 results as a fixed-width text table: one row per step
+ * (duration and combined input+output tokens for each pipeline), a totals
+ * row, a bugs/tests summary, the winner, and any learnings. A step that only
+ * one pipeline recorded shows "—" in the other pipeline's columns.
+ */
+export function formatTier3Comparison(results: Tier3Results): string {
+	type Row = [string, string, string, string, string];
+	const { maina, speckit } = results;
+	const missing = "—";
+	const rule: Row = [
+		"─".repeat(24),
+		"─".repeat(14),
+		"─".repeat(16),
+		"─".repeat(14),
+		"─".repeat(16),
+	];
+	const table: Row[] = [
+		["Step", "Maina (ms)", "Maina (tokens)", "SpecKit (ms)", "SpecKit (tokens)"],
+		rule,
+	];
+	// Union of step keys, maina's first, each in insertion order.
+	const stepKeys = new Set<string>(
+		Object.keys(maina.steps).concat(Object.keys(speckit.steps)),
+	);
+	for (const key of stepKeys) {
+		const mainaStep = maina.steps[key];
+		const speckitStep = speckit.steps[key];
+		table.push([
+			mainaStep?.name ?? speckitStep?.name ?? key,
+			mainaStep ? String(mainaStep.durationMs) : missing,
+			mainaStep
+				? String(mainaStep.tokensInput + mainaStep.tokensOutput)
+				: missing,
+			speckitStep ? String(speckitStep.durationMs) : missing,
+			speckitStep
+				? String(speckitStep.tokensInput + speckitStep.tokensOutput)
+				: missing,
+		]);
+	}
+	const mainaTotals = maina.totals;
+	const speckitTotals = speckit.totals;
+	table.push(rule, [
+		"TOTAL",
+		String(mainaTotals.durationMs),
+		String(mainaTotals.tokensInput + mainaTotals.tokensOutput),
+		String(speckitTotals.durationMs),
+		String(speckitTotals.tokensInput + speckitTotals.tokensOutput),
+	]);
+	const renderRow = ([step, mMs, mTok, sMs, sTok]: Row): string =>
+		`  ${step.padEnd(24)} ${mMs.padStart(14)} ${mTok.padStart(16)} ${sMs.padStart(14)} ${sTok.padStart(16)}`;
+	const output: string[] = [
+		`\n## Tier 3 Benchmark: ${results.story.name}\n`,
+		...table.map(renderRow),
+		"",
+		"  Findings / Bugs:",
+		`    Maina  — bugs introduced: ${mainaTotals.bugsIntroduced}, bugs caught: ${mainaTotals.bugsCaught}, tests: ${mainaTotals.testsPassed}/${mainaTotals.testsTotal}`,
+		`    SpecKit — bugs introduced: ${speckitTotals.bugsIntroduced}, bugs caught: ${speckitTotals.bugsCaught}, tests: ${speckitTotals.testsPassed}/${speckitTotals.testsTotal}`,
+		"",
+		`  Winner: ${results.winner}`,
+	];
+	if (results.learnings.length > 0) {
+		output.push("", "  Learnings:");
+		for (const learning of results.learnings) {
+			output.push(`    - ${learning}`);
+		}
+	}
+	output.push("");
+	return output.join("\n");
+}

package/src/benchmark/runner.ts ADDED Viewed

@@ -0,0 +1,91 @@
+import type { Result } from "../db/index";
+import type { BenchmarkMetrics } from "./types";
+/** Pass/fail counts extracted from test-runner output by `parseTestOutput`. */
+export interface TestResult {
+	/** Number of passing tests. */
+	passed: number;
+	/** Number of failing tests. */
+	failed: number;
+	/** `parseTestOutput` sets this to `passed + failed`. */
+	total: number;
+}
+/**
+ * Inputs for `runBenchmark`. Only the test counts and wall-clock time are
+ * measured by the run itself; every optional field is copied into the
+ * resulting metrics as supplied, with the default noted on each field.
+ */
+export interface RunBenchmarkOptions {
+	/** Which pipeline produced the implementation under test. */
+	pipeline: "maina" | "speckit";
+	/** Story name recorded in the resulting metrics. */
+	storyName: string;
+	/** Test file arguments appended to the `bun test` command. */
+	testFiles: string[];
+	/** Working directory for the test process; also exported as MITT_IMPL_PATH. */
+	implDir: string;
+	/** Defaults to 0 when omitted. */
+	tokensInput?: number;
+	/** Defaults to 0 when omitted. */
+	tokensOutput?: number;
+	/** Defaults to 0 when omitted. */
+	verifyFindings?: number;
+	/** Defaults to 0 when omitted. */
+	specQualityScore?: number;
+	/** Defaults to 0 when omitted. */
+	implLOC?: number;
+	/** Defaults to 1 when omitted. */
+	attemptsToPass?: number;
+	/** Defaults to 0 when omitted. */
+	bugsIntroduced?: number;
+	/** Defaults to an empty list when omitted. */
+	toolsUsed?: string[];
+}
+/**
+ * Read the count for one summary label ("pass" or "fail") from test output.
+ *
+ * Prefers the last line consisting solely of "<count> <label>", so numbers
+ * inside test titles or log lines (e.g. "forwards 5 pass-through events")
+ * are not mistaken for the summary. If no such line exists, falls back to
+ * the first "<count> <label>" occurrence anywhere in the text. Returns 0
+ * when nothing matches.
+ */
+function readSummaryCount(output: string, label: "pass" | "fail"): number {
+	// [ \t] rather than \s keeps the match on a single line.
+	const summaryLine = new RegExp(
+		`^[ \\t]*(\\d+)[ \\t]+${label}[ \\t]*\\r?$`,
+		"gm",
+	);
+	let digits: string | undefined;
+	for (const match of output.matchAll(summaryLine)) {
+		digits = match[1];
+	}
+	if (digits === undefined) {
+		digits = output.match(new RegExp(`(\\d+)\\s+${label}`))?.[1];
+	}
+	return digits === undefined ? 0 : Number.parseInt(digits, 10);
+}
+/**
+ * Parse bun test output to extract pass/fail counts.
+ *
+ * @param output Captured text from the test run.
+ * @returns The passed and failed counts (0 when not found) and their sum.
+ */
+export function parseTestOutput(output: string): TestResult {
+	const passed = readSummaryCount(output, "pass");
+	const failed = readSummaryCount(output, "fail");
+	return { passed, failed, total: passed + failed };
+}
+/**
+ * Run benchmark tests against an implementation directory.
+ *
+ * Spawns `bun test` on the provided test files with `implDir` as the working
+ * directory (also exported to the child as MITT_IMPL_PATH), parses the
+ * pass/fail counts from the combined stdout and stderr, and measures
+ * wall-clock time around the run.
+ *
+ * The child's exit code is not inspected: failing tests show up in the
+ * counts, not as an error result.
+ *
+ * @returns `ok: true` with the collected metrics, or `ok: false` with a
+ * message when spawning the process or reading its output throws.
+ */
+export async function runBenchmark(
+	options: RunBenchmarkOptions,
+): Promise<Result<BenchmarkMetrics>> {
+	const startMs = performance.now();
+	try {
+		const proc = Bun.spawn(["bun", "test", ...options.testFiles], {
+			cwd: options.implDir,
+			stdout: "pipe",
+			stderr: "pipe",
+			env: {
+				...process.env,
+				MITT_IMPL_PATH: options.implDir,
+			},
+		});
+		// Drain both pipes concurrently: reading stdout to completion before
+		// touching stderr can stall if the child fills the stderr pipe first.
+		const [stdout, stderr] = await Promise.all([
+			new Response(proc.stdout).text(),
+			new Response(proc.stderr).text(),
+		]);
+		await proc.exited;
+		const testResult = parseTestOutput(stdout + stderr);
+		const wallClockMs = Math.round(performance.now() - startMs);
+		return {
+			ok: true,
+			value: {
+				pipeline: options.pipeline,
+				storyName: options.storyName,
+				wallClockMs,
+				tokensInput: options.tokensInput ?? 0,
+				tokensOutput: options.tokensOutput ?? 0,
+				testsTotal: testResult.total,
+				testsPassed: testResult.passed,
+				testsFailed: testResult.failed,
+				verifyFindings: options.verifyFindings ?? 0,
+				specQualityScore: options.specQualityScore ?? 0,
+				implLOC: options.implLOC ?? 0,
+				attemptsToPass: options.attemptsToPass ?? 1,
+				bugsIntroduced: options.bugsIntroduced ?? 0,
+				toolsUsed: options.toolsUsed ?? [],
+			},
+		};
+	} catch (e) {
+		return {
+			ok: false,
+			error: `Benchmark run failed: ${e instanceof Error ? e.message : String(e)}`,
+		};
+	}
+}

package/src/benchmark/story-loader.ts ADDED Viewed

@@ -0,0 +1,88 @@
+import { existsSync, readdirSync, readFileSync } from "node:fs";
+import { join } from "node:path";
+import type { Result } from "../db/index";
+import type { LoadedStory, StoryConfig } from "./types";
+/**
+ * List all available benchmark stories in the given directory.
+ *
+ * A subdirectory is included only when it contains a `story.json` that
+ * parses as JSON and has truthy `name` and `description` fields; anything
+ * else is skipped silently.
+ *
+ * @returns `ok: true` with the story configs (an empty list when
+ * `storiesDir` does not exist), or `ok: false` when `storiesDir` exists but
+ * cannot be read as a directory.
+ */
+export function listStories(storiesDir: string): Result<StoryConfig[]> {
+	if (!existsSync(storiesDir)) {
+		return { ok: true, value: [] };
+	}
+	try {
+		const stories: StoryConfig[] = [];
+		for (const entry of readdirSync(storiesDir, { withFileTypes: true })) {
+			if (!entry.isDirectory()) continue;
+			const configPath = join(storiesDir, entry.name, "story.json");
+			if (!existsSync(configPath)) continue;
+			try {
+				const raw = readFileSync(configPath, "utf-8");
+				const config = JSON.parse(raw) as StoryConfig;
+				if (config.name && config.description) {
+					stories.push(config);
+				}
+			} catch {
+				// Skip invalid configs (unreadable file, bad JSON, or JSON `null`).
+			}
+		}
+		return { ok: true, value: stories };
+	} catch (e) {
+		// readdirSync throws when the path is a plain file or is unreadable;
+		// report that through the Result instead of letting it escape.
+		return {
+			ok: false,
+			error: `Unable to read stories directory ${storiesDir}: ${e instanceof Error ? e.message : String(e)}`,
+		};
+	}
+}
+/**
+ * Check that a parsed story.json has the one field `loadStory` iterates:
+ * a `testFiles` array containing only strings.
+ */
+function hasTestFileList(value: unknown): value is StoryConfig {
+	if (typeof value !== "object" || value === null) return false;
+	const testFiles = (value as { testFiles?: unknown }).testFiles;
+	return (
+		Array.isArray(testFiles) &&
+		testFiles.every((file) => typeof file === "string")
+	);
+}
+/**
+ * Load a specific story by name, including its config, spec, and test files.
+ *
+ * Test files listed in the config but absent on disk are skipped rather than
+ * treated as an error.
+ *
+ * @param storiesDir Directory that contains one subdirectory per story.
+ * @param name Name of the story subdirectory to load.
+ * @returns `ok: true` with the loaded story, or `ok: false` when the story
+ * directory, `story.json`, or `spec.md` is missing, when `story.json` is not
+ * valid JSON or lacks a `testFiles` string array, or when a file cannot be
+ * read.
+ */
+export function loadStory(
+	storiesDir: string,
+	name: string,
+): Result<LoadedStory> {
+	const storyDir = join(storiesDir, name);
+	if (!existsSync(storyDir)) {
+		return { ok: false, error: `Story not found: ${name}` };
+	}
+	// Load config
+	const configPath = join(storyDir, "story.json");
+	if (!existsSync(configPath)) {
+		return { ok: false, error: `Missing story.json in ${name}` };
+	}
+	let config: StoryConfig;
+	try {
+		const parsed: unknown = JSON.parse(readFileSync(configPath, "utf-8"));
+		// Valid JSON can still be `null` or lack `testFiles`; without this
+		// check the loop below would throw instead of returning an error.
+		if (!hasTestFileList(parsed)) {
+			return { ok: false, error: `Invalid story.json in ${name}` };
+		}
+		config = parsed;
+	} catch {
+		return { ok: false, error: `Invalid story.json in ${name}` };
+	}
+	// Load spec
+	const specPath = join(storyDir, "spec.md");
+	if (!existsSync(specPath)) {
+		return { ok: false, error: `Missing spec.md in ${name}` };
+	}
+	try {
+		const specContent = readFileSync(specPath, "utf-8");
+		// Load test files
+		const testFiles: Array<{ name: string; content: string }> = [];
+		for (const testFile of config.testFiles) {
+			const testPath = join(storyDir, testFile);
+			if (existsSync(testPath)) {
+				testFiles.push({
+					name: testFile,
+					content: readFileSync(testPath, "utf-8"),
+				});
+			}
+		}
+		return {
+			ok: true,
+			value: { config, specContent, testFiles, storyDir },
+		};
+	} catch (e) {
+		// A path that exists but is unreadable (e.g. a directory named spec.md).
+		return {
+			ok: false,
+			error: `Failed to read story files in ${name}: ${e instanceof Error ? e.message : String(e)}`,
+		};
+	}
+}