@mainahq/core 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (156)
  1. package/README.md +31 -0
  2. package/package.json +37 -0
  3. package/src/ai/__tests__/ai.test.ts +207 -0
  4. package/src/ai/__tests__/design-approaches.test.ts +192 -0
  5. package/src/ai/__tests__/spec-questions.test.ts +191 -0
  6. package/src/ai/__tests__/tiers.test.ts +110 -0
  7. package/src/ai/commit-msg.ts +28 -0
  8. package/src/ai/design-approaches.ts +76 -0
  9. package/src/ai/index.ts +205 -0
  10. package/src/ai/pr-summary.ts +60 -0
  11. package/src/ai/spec-questions.ts +74 -0
  12. package/src/ai/tiers.ts +52 -0
  13. package/src/ai/try-generate.ts +89 -0
  14. package/src/ai/validate.ts +66 -0
  15. package/src/benchmark/__tests__/reporter.test.ts +525 -0
  16. package/src/benchmark/__tests__/runner.test.ts +113 -0
  17. package/src/benchmark/__tests__/story-loader.test.ts +152 -0
  18. package/src/benchmark/reporter.ts +332 -0
  19. package/src/benchmark/runner.ts +91 -0
  20. package/src/benchmark/story-loader.ts +88 -0
  21. package/src/benchmark/types.ts +95 -0
  22. package/src/cache/__tests__/keys.test.ts +97 -0
  23. package/src/cache/__tests__/manager.test.ts +312 -0
  24. package/src/cache/__tests__/ttl.test.ts +94 -0
  25. package/src/cache/keys.ts +44 -0
  26. package/src/cache/manager.ts +231 -0
  27. package/src/cache/ttl.ts +77 -0
  28. package/src/config/__tests__/config.test.ts +376 -0
  29. package/src/config/index.ts +198 -0
  30. package/src/context/__tests__/budget.test.ts +179 -0
  31. package/src/context/__tests__/engine.test.ts +163 -0
  32. package/src/context/__tests__/episodic.test.ts +291 -0
  33. package/src/context/__tests__/relevance.test.ts +323 -0
  34. package/src/context/__tests__/retrieval.test.ts +143 -0
  35. package/src/context/__tests__/selector.test.ts +174 -0
  36. package/src/context/__tests__/semantic.test.ts +252 -0
  37. package/src/context/__tests__/treesitter.test.ts +229 -0
  38. package/src/context/__tests__/working.test.ts +236 -0
  39. package/src/context/budget.ts +130 -0
  40. package/src/context/engine.ts +394 -0
  41. package/src/context/episodic.ts +251 -0
  42. package/src/context/relevance.ts +325 -0
  43. package/src/context/retrieval.ts +325 -0
  44. package/src/context/selector.ts +93 -0
  45. package/src/context/semantic.ts +331 -0
  46. package/src/context/treesitter.ts +216 -0
  47. package/src/context/working.ts +192 -0
  48. package/src/db/__tests__/db.test.ts +151 -0
  49. package/src/db/index.ts +211 -0
  50. package/src/db/schema.ts +84 -0
  51. package/src/design/__tests__/design.test.ts +310 -0
  52. package/src/design/__tests__/generate-hld-lld.test.ts +109 -0
  53. package/src/design/__tests__/review.test.ts +561 -0
  54. package/src/design/index.ts +297 -0
  55. package/src/design/review.ts +327 -0
  56. package/src/explain/__tests__/explain.test.ts +173 -0
  57. package/src/explain/index.ts +181 -0
  58. package/src/features/__tests__/analyzer.test.ts +358 -0
  59. package/src/features/__tests__/checklist.test.ts +454 -0
  60. package/src/features/__tests__/numbering.test.ts +319 -0
  61. package/src/features/__tests__/quality.test.ts +295 -0
  62. package/src/features/__tests__/traceability.test.ts +147 -0
  63. package/src/features/analyzer.ts +445 -0
  64. package/src/features/checklist.ts +366 -0
  65. package/src/features/index.ts +18 -0
  66. package/src/features/numbering.ts +404 -0
  67. package/src/features/quality.ts +349 -0
  68. package/src/features/test-stubs.ts +157 -0
  69. package/src/features/traceability.ts +260 -0
  70. package/src/feedback/__tests__/async-feedback.test.ts +52 -0
  71. package/src/feedback/__tests__/collector.test.ts +219 -0
  72. package/src/feedback/__tests__/compress.test.ts +150 -0
  73. package/src/feedback/__tests__/preferences.test.ts +169 -0
  74. package/src/feedback/collector.ts +135 -0
  75. package/src/feedback/compress.ts +92 -0
  76. package/src/feedback/preferences.ts +108 -0
  77. package/src/git/__tests__/git.test.ts +62 -0
  78. package/src/git/index.ts +110 -0
  79. package/src/hooks/__tests__/runner.test.ts +266 -0
  80. package/src/hooks/index.ts +8 -0
  81. package/src/hooks/runner.ts +130 -0
  82. package/src/index.ts +356 -0
  83. package/src/init/__tests__/init.test.ts +228 -0
  84. package/src/init/index.ts +364 -0
  85. package/src/language/__tests__/detect.test.ts +77 -0
  86. package/src/language/__tests__/profile.test.ts +51 -0
  87. package/src/language/detect.ts +70 -0
  88. package/src/language/profile.ts +110 -0
  89. package/src/prompts/__tests__/defaults.test.ts +52 -0
  90. package/src/prompts/__tests__/engine.test.ts +183 -0
  91. package/src/prompts/__tests__/evolution-resolve.test.ts +169 -0
  92. package/src/prompts/__tests__/evolution.test.ts +187 -0
  93. package/src/prompts/__tests__/loader.test.ts +105 -0
  94. package/src/prompts/candidates/review-v2.md +55 -0
  95. package/src/prompts/defaults/ai-review.md +49 -0
  96. package/src/prompts/defaults/commit.md +30 -0
  97. package/src/prompts/defaults/context.md +26 -0
  98. package/src/prompts/defaults/design-approaches.md +57 -0
  99. package/src/prompts/defaults/design-hld-lld.md +55 -0
  100. package/src/prompts/defaults/design.md +53 -0
  101. package/src/prompts/defaults/explain.md +31 -0
  102. package/src/prompts/defaults/fix.md +32 -0
  103. package/src/prompts/defaults/index.ts +38 -0
  104. package/src/prompts/defaults/review.md +41 -0
  105. package/src/prompts/defaults/spec-questions.md +59 -0
  106. package/src/prompts/defaults/tests.md +72 -0
  107. package/src/prompts/engine.ts +137 -0
  108. package/src/prompts/evolution.ts +409 -0
  109. package/src/prompts/loader.ts +71 -0
  110. package/src/review/__tests__/review.test.ts +288 -0
  111. package/src/review/comprehensive.ts +362 -0
  112. package/src/review/index.ts +417 -0
  113. package/src/stats/__tests__/tracker.test.ts +323 -0
  114. package/src/stats/index.ts +11 -0
  115. package/src/stats/tracker.ts +492 -0
  116. package/src/ticket/__tests__/ticket.test.ts +273 -0
  117. package/src/ticket/index.ts +185 -0
  118. package/src/utils.ts +87 -0
  119. package/src/verify/__tests__/ai-review.test.ts +242 -0
  120. package/src/verify/__tests__/coverage.test.ts +83 -0
  121. package/src/verify/__tests__/detect.test.ts +175 -0
  122. package/src/verify/__tests__/diff-filter.test.ts +338 -0
  123. package/src/verify/__tests__/fix.test.ts +478 -0
  124. package/src/verify/__tests__/linters/clippy.test.ts +45 -0
  125. package/src/verify/__tests__/linters/go-vet.test.ts +27 -0
  126. package/src/verify/__tests__/linters/ruff.test.ts +64 -0
  127. package/src/verify/__tests__/mutation.test.ts +141 -0
  128. package/src/verify/__tests__/pipeline.test.ts +553 -0
  129. package/src/verify/__tests__/proof.test.ts +97 -0
  130. package/src/verify/__tests__/secretlint.test.ts +190 -0
  131. package/src/verify/__tests__/semgrep.test.ts +217 -0
  132. package/src/verify/__tests__/slop.test.ts +366 -0
  133. package/src/verify/__tests__/sonar.test.ts +113 -0
  134. package/src/verify/__tests__/syntax-guard.test.ts +227 -0
  135. package/src/verify/__tests__/trivy.test.ts +191 -0
  136. package/src/verify/__tests__/visual.test.ts +139 -0
  137. package/src/verify/ai-review.ts +276 -0
  138. package/src/verify/coverage.ts +134 -0
  139. package/src/verify/detect.ts +171 -0
  140. package/src/verify/diff-filter.ts +183 -0
  141. package/src/verify/fix.ts +317 -0
  142. package/src/verify/linters/clippy.ts +52 -0
  143. package/src/verify/linters/go-vet.ts +32 -0
  144. package/src/verify/linters/ruff.ts +47 -0
  145. package/src/verify/mutation.ts +143 -0
  146. package/src/verify/pipeline.ts +328 -0
  147. package/src/verify/proof.ts +277 -0
  148. package/src/verify/secretlint.ts +168 -0
  149. package/src/verify/semgrep.ts +170 -0
  150. package/src/verify/slop.ts +493 -0
  151. package/src/verify/sonar.ts +146 -0
  152. package/src/verify/syntax-guard.ts +251 -0
  153. package/src/verify/trivy.ts +161 -0
  154. package/src/verify/visual.ts +460 -0
  155. package/src/workflow/__tests__/context.test.ts +110 -0
  156. package/src/workflow/context.ts +81 -0
@@ -0,0 +1,66 @@
1
/**
 * Validates AI-generated output for slop patterns before presenting to users.
 *
 * Catches: hallucinated imports, console.log suggestions, empty function bodies,
 * bare TODOs without tickets, and other common AI slop in generated text.
 */

/** Outcome of {@link validateAIOutput}. */
export interface AIValidationResult {
	/** True when no slop pattern matched the input. */
	clean: boolean;
	/** One message per slop pattern that matched; each pattern is reported at most once. */
	warnings: string[];
	/** Input with standalone console-call lines blanked out, then trimmed. */
	sanitized: string;
}

/**
 * Detection table: each entry pairs a regular expression with the warning
 * emitted when it matches anywhere in the text.
 *
 * The patterns are deliberately NOT global: they are only used with `.test()`,
 * and a global regex would carry `lastIndex` state from one call to the next.
 */
const SLOP_PATTERNS: ReadonlyArray<{ pattern: RegExp; message: string }> = [
	{
		pattern: /console\.(?:log|warn|error|debug|info)\s*\(/,
		message: "AI suggested console.log — stripped",
	},
	{
		// Line-comment marker not followed by "(", "#" or "[" (i.e. no ticket reference).
		// NOTE(review): the marker is split into two groups, presumably so this
		// file does not trip its own check — confirm.
		pattern: /\/\/\s*(?:TO)(?:DO)(?!\s*[(#[])/,
		message: "AI generated TODO without ticket reference",
	},
	{
		pattern:
			/import\s+.*from\s+['"]\.\/(?:nonexistent|placeholder|example)['"]/,
		message: "AI hallucinated a placeholder import",
	},
	{
		pattern: /function\s+\w+\s*\([^)]*\)\s*\{\s*\}/,
		message: "AI generated empty function body",
	},
	{
		// Word boundaries keep prose such as "has any" or "as anything" from
		// being reported as an `any` type.
		pattern: /\bas\s+any\b|:\s*any\b/,
		message: "AI used 'any' type — violates strict mode",
	},
];

/** Start of a line that begins with a console call, up to and including "(". */
const CONSOLE_CALL_PREFIX = /^\s*console\.(?:log|warn|error|debug|info)\s*\(/;

/**
 * Tell whether `line` consists solely of one `console.<method>(...)` call,
 * optionally followed by a semicolon and whitespace.
 *
 * Parentheses are balanced by a small scanner (string and template literals
 * are skipped) instead of a greedy regex, so a line such as
 * `console.log("x"); doWork();` is NOT treated as standalone and keeps its
 * trailing code. Calls that do not close on the same line return false.
 */
function isStandaloneConsoleCall(line: string): boolean {
	const prefix = CONSOLE_CALL_PREFIX.exec(line);
	if (prefix === null) {
		return false;
	}

	let depth = 1; // the prefix already consumed the opening "("
	let quote = ""; // active string delimiter, or "" when outside a string
	let i = prefix[0].length;
	for (; i < line.length && depth > 0; i++) {
		const ch = line.charAt(i);
		if (quote !== "") {
			if (ch === "\\") {
				i++; // skip the escaped character
			} else if (ch === quote) {
				quote = "";
			}
		} else if (ch === '"' || ch === "'" || ch === "`") {
			quote = ch;
		} else if (ch === "(") {
			depth++;
		} else if (ch === ")") {
			depth--;
		}
	}

	if (depth !== 0) {
		return false;
	}
	// Only an optional ";" and whitespace may follow the closing parenthesis.
	return /^\s*;?\s*$/.test(line.slice(i));
}

/**
 * Check AI-generated text for slop patterns.
 *
 * @param text - Raw AI output (prose and/or code).
 * @returns `warnings` with one message per matching pattern, `clean` set to
 * true when there are none, and `sanitized`: the text with every standalone
 * console-call line replaced by an empty line, then trimmed. Console calls
 * that share a line with other code are reported but left in place.
 */
export function validateAIOutput(text: string): AIValidationResult {
	const warnings = SLOP_PATTERNS.filter(({ pattern }) =>
		pattern.test(text),
	).map(({ message }) => message);

	const sanitized = text
		.split("\n")
		.map((line) => (isStandaloneConsoleCall(line) ? "" : line))
		.join("\n")
		.trim();

	return {
		clean: warnings.length === 0,
		warnings,
		sanitized,
	};
}
@@ -0,0 +1,525 @@
1
+ import { describe, expect, test } from "bun:test";
2
+ import {
3
+ buildReport,
4
+ buildTier3Report,
5
+ formatComparison,
6
+ formatTier3Comparison,
7
+ } from "../reporter";
8
+ import type { BenchmarkMetrics, StepMetrics, StoryConfig } from "../types";
9
+
10
/** Tier 1 story shared by the buildReport / formatComparison suites. */
const storyConfig: StoryConfig = {
	name: "mitt",
	description: "Tiny event emitter",
	tier: 1,
	source: "https://github.com/developit/mitt",
	testFiles: ["tests/mitt.test.ts"],
	metrics: { expectedTests: 18, originalLOC: 80, complexity: "easy" },
};

/** "maina" pipeline run: 16 of 18 tests passing. */
const mainaMetrics: BenchmarkMetrics = {
	// identity
	pipeline: "maina",
	storyName: "mitt",
	// cost
	wallClockMs: 1200,
	tokensInput: 5000,
	tokensOutput: 2000,
	// test outcome
	testsTotal: 18,
	testsPassed: 16,
	testsFailed: 2,
	// quality
	verifyFindings: 3,
	specQualityScore: 83,
	implLOC: 85,
	attemptsToPass: 1,
	bugsIntroduced: 0,
	toolsUsed: ["getContext", "verify", "reviewCode"],
};

/** "speckit" pipeline run: 14 of 18 tests passing. */
const speckitMetrics: BenchmarkMetrics = {
	// identity
	pipeline: "speckit",
	storyName: "mitt",
	// cost
	wallClockMs: 1800,
	tokensInput: 7000,
	tokensOutput: 3000,
	// test outcome
	testsTotal: 18,
	testsPassed: 14,
	testsFailed: 4,
	// quality
	verifyFindings: 0,
	specQualityScore: 70,
	implLOC: 112,
	attemptsToPass: 2,
	bugsIntroduced: 1,
	toolsUsed: ["specify init", "constitution", "specs", "plans", "tasks"],
};
52
+
53
// buildReport: two-pipeline comparison and winner selection.
describe("buildReport", () => {
	test("creates comparison report with both pipeline results", () => {
		const { story, maina, speckit, timestamp } = buildReport(
			storyConfig,
			mainaMetrics,
			speckitMetrics,
		);

		expect(story.name).toBe("mitt");
		expect(maina?.testsPassed).toBe(16);
		expect(speckit?.testsPassed).toBe(14);
		expect(timestamp).toBeTruthy();
	});

	test("determines winner based on test pass rate", () => {
		// Fixtures: maina passes 16, speckit passes 14.
		const { winner } = buildReport(storyConfig, mainaMetrics, speckitMetrics);
		expect(winner).toBe("maina");
	});

	test("returns tie when both have same pass count", () => {
		// Give speckit the same pass/fail counts as the maina fixture.
		const speckitTied = { ...speckitMetrics, testsPassed: 16, testsFailed: 2 };
		const { winner } = buildReport(storyConfig, mainaMetrics, speckitTied);
		expect(winner).toBe("tie");
	});

	test("returns incomplete when one pipeline is null", () => {
		const result = buildReport(storyConfig, mainaMetrics, null);
		expect(result.winner).toBe("incomplete");
		expect(result.speckit).toBeNull();
	});
});
80
+
81
// formatComparison: rendered text must mention the story, both pipelines and the result.
describe("formatComparison", () => {
	test("produces a readable terminal table", () => {
		const rendered = formatComparison(
			buildReport(storyConfig, mainaMetrics, speckitMetrics),
		);

		const expectedFragments = ["mitt", "maina", "speckit", "16", "14", "Winner"];
		for (const fragment of expectedFragments) {
			expect(rendered).toContain(fragment);
		}
	});

	test("handles incomplete report gracefully", () => {
		// No speckit metrics: the em dash is expected in the rendered output.
		const rendered = formatComparison(
			buildReport(storyConfig, mainaMetrics, null),
		);

		expect(rendered).toContain("maina");
		expect(rendered).toContain("—");
	});
});
102
+
103
// --- Tier 3 fixtures ---

/** Tier 3 story used by the buildTier3Report / formatTier3Comparison suites. */
const tier3Story: StoryConfig = {
	name: "auth-flow",
	description: "Full auth lifecycle",
	tier: 3,
	source: "internal",
	testFiles: ["tests/auth.test.ts"],
	metrics: {
		expectedTests: 25,
		originalLOC: 400,
		complexity: "hard",
	},
};
113
+
114
/**
 * Build a StepMetrics fixture: default duration, token counts and an empty
 * artifacts list, overlaid with whatever the caller passes in `overrides`
 * (which must at least carry `name`).
 */
function makeStep(
	overrides: Partial<StepMetrics> & { name: string },
): StepMetrics {
	// Built per call so every fixture gets its own `artifacts` array.
	const defaults = {
		durationMs: 100,
		tokensInput: 500,
		tokensOutput: 200,
		artifacts: [],
	};
	return { ...defaults, ...overrides };
}
125
+
126
/**
 * Positional shorthand around makeStep for the step tables below:
 * (name, durationMs, tokensInput, tokensOutput, optional extra fields).
 */
function timedStep(
	name: string,
	durationMs: number,
	tokensInput: number,
	tokensOutput: number,
	extra: Partial<StepMetrics> = {},
): StepMetrics {
	return makeStep({ name, durationMs, tokensInput, tokensOutput, ...extra });
}

/** Maina per-step metrics. Totals: 2400 ms, 12400 input tokens, 6800 output tokens. */
const mainaSteps: Record<string, StepMetrics> = {
	clarify: timedStep("Clarify", 200, 1000, 500, { questionsAsked: 3 }),
	spec: timedStep("Spec", 300, 2000, 1000),
	plan: timedStep("Plan", 150, 800, 400, { approachesProposed: 2 }),
	implement: timedStep("Implement", 500, 3000, 2000, { loc: 120, attempts: 2 }),
	test: timedStep("Test", 400, 1500, 800, { testsGenerated: 25 }),
	verify: timedStep("Verify", 250, 1200, 600, {
		findings: 4,
		findingsBySeverity: { high: 1, medium: 3 },
	}),
	fix: timedStep("Fix", 180, 900, 500),
	review: timedStep("Review", 300, 1400, 700, { issuesFound: 2 }),
	final: timedStep("Final Check", 120, 600, 300, { passed: true }),
};

/** SpecKit per-step metrics. Total duration: 2650 ms. */
const speckitSteps: Record<string, StepMetrics> = {
	clarify: timedStep("Clarify", 250, 1200, 600, { questionsAsked: 2 }),
	spec: timedStep("Spec", 400, 2500, 1200),
	plan: timedStep("Plan", 200, 1000, 500),
	implement: timedStep("Implement", 600, 3500, 2500, { loc: 150, attempts: 3 }),
	test: timedStep("Test", 350, 1800, 900, { testsGenerated: 22 }),
	verify: timedStep("Verify", 200, 1000, 500, { findings: 2 }),
	fix: timedStep("Fix", 220, 1100, 600),
	review: timedStep("Review", 280, 1300, 650),
	final: timedStep("Final Check", 150, 700, 350, { passed: true }),
};

/** Bug/test metadata for the maina run (24 of 25 tests passing). */
const mainaMeta = {
	bugsIntroduced: 1,
	bugsCaught: 3,
	testsPassed: 24,
	testsTotal: 25,
};

/** Bug/test metadata for the speckit run (20 of 25 tests passing). */
const speckitMeta = {
	bugsIntroduced: 2,
	bugsCaught: 2,
	testsPassed: 20,
	testsTotal: 25,
};
267
+
268
// buildTier3Report: totals, metadata pass-through and the winner tie-break chain.
describe("buildTier3Report", () => {
	/** Fresh copy of the default metadata pair for each call. */
	const fixtureMeta = () => ({ maina: mainaMeta, speckit: speckitMeta });

	/** Metadata record with the given bug and test counts. */
	const metaWith = (
		bugsIntroduced: number,
		bugsCaught: number,
		testsPassed: number,
		testsTotal: number,
	) => ({ bugsIntroduced, bugsCaught, testsPassed, testsTotal });

	test("computes totals by summing step durations and tokens", () => {
		const { maina, speckit } = buildTier3Report(
			tier3Story,
			mainaSteps,
			speckitSteps,
			[],
			fixtureMeta(),
		);

		// Maina duration: 200+300+150+500+400+250+180+300+120 = 2400
		expect(maina.totals.durationMs).toBe(2400);
		// Maina tokensInput: 1000+2000+800+3000+1500+1200+900+1400+600 = 12400
		expect(maina.totals.tokensInput).toBe(12400);
		// Maina tokensOutput: 500+1000+400+2000+800+600+500+700+300 = 6800
		expect(maina.totals.tokensOutput).toBe(6800);

		// SpecKit duration: 250+400+200+600+350+200+220+280+150 = 2650
		expect(speckit.totals.durationMs).toBe(2650);
	});

	test("carries bug/test metadata into totals", () => {
		const { maina, speckit } = buildTier3Report(
			tier3Story,
			mainaSteps,
			speckitSteps,
			["learning 1"],
			fixtureMeta(),
		);

		expect(maina.totals.bugsIntroduced).toBe(1);
		expect(maina.totals.bugsCaught).toBe(3);
		expect(maina.totals.testsPassed).toBe(24);
		expect(speckit.totals.bugsIntroduced).toBe(2);
		expect(speckit.totals.testsPassed).toBe(20);
	});

	test("determines winner by test pass rate first", () => {
		const { winner } = buildTier3Report(
			tier3Story,
			mainaSteps,
			speckitSteps,
			[],
			fixtureMeta(),
		);
		// Maina 24/25 > SpecKit 20/25
		expect(winner).toBe("maina");
	});

	test("breaks tie on bugs caught", () => {
		const { winner } = buildTier3Report(tier3Story, mainaSteps, speckitSteps, [], {
			maina: metaWith(1, 5, 20, 25),
			speckit: metaWith(1, 2, 20, 25),
		});
		// Same pass rate, maina caught more bugs
		expect(winner).toBe("maina");
	});

	test("breaks second tie on duration (lower wins)", () => {
		// mainaSteps total = 2400, speckitSteps total = 2650
		const { winner } = buildTier3Report(tier3Story, mainaSteps, speckitSteps, [], {
			maina: metaWith(0, 3, 20, 25),
			speckit: metaWith(0, 3, 20, 25),
		});
		// Same pass rate, same bugs caught, maina is faster
		expect(winner).toBe("maina");
	});

	test("returns tie when all tiebreakers are equal", () => {
		const identicalSteps: Record<string, StepMetrics> = {
			step1: makeStep({
				name: "Step 1",
				durationMs: 100,
				tokensInput: 500,
				tokensOutput: 200,
			}),
		};
		const { winner } = buildTier3Report(
			tier3Story,
			identicalSteps,
			{ ...identicalSteps },
			[],
			{
				maina: metaWith(0, 1, 10, 10),
				speckit: metaWith(0, 1, 10, 10),
			},
		);
		expect(winner).toBe("tie");
	});

	test("returns incomplete when one pipeline has no steps", () => {
		const { winner } = buildTier3Report(tier3Story, mainaSteps, {}, [
			"partial run",
		]);
		expect(winner).toBe("incomplete");
	});

	test("defaults meta to zeros when not provided", () => {
		// Fifth argument intentionally omitted.
		const { maina, speckit } = buildTier3Report(
			tier3Story,
			mainaSteps,
			speckitSteps,
			[],
		);
		expect(maina.totals.bugsIntroduced).toBe(0);
		expect(maina.totals.bugsCaught).toBe(0);
		expect(maina.totals.testsPassed).toBe(0);
		expect(speckit.totals.testsPassed).toBe(0);
	});

	test("includes story, timestamp, and learnings", () => {
		const { story, timestamp, learnings } = buildTier3Report(
			tier3Story,
			mainaSteps,
			speckitSteps,
			["insight A", "insight B"],
			fixtureMeta(),
		);

		expect(story.name).toBe("auth-flow");
		expect(timestamp).toBeTruthy();
		expect(learnings).toEqual(["insight A", "insight B"]);
	});

	test("preserves per-step data in the result", () => {
		const { maina, speckit } = buildTier3Report(
			tier3Story,
			mainaSteps,
			speckitSteps,
			[],
			fixtureMeta(),
		);

		expect(maina.steps.clarify?.questionsAsked).toBe(3);
		expect(maina.steps.implement?.loc).toBe(120);
		expect(speckit.steps.verify?.findings).toBe(2);
	});
});
428
+
429
// formatTier3Comparison: rendered per-step table, totals, findings, winner and learnings.
describe("formatTier3Comparison", () => {
	/** Assert that every fragment occurs somewhere in `rendered`, in list order. */
	const expectAll = (rendered: string, fragments: string[]) => {
		for (const fragment of fragments) {
			expect(rendered).toContain(fragment);
		}
	};

	test("produces a table with per-step breakdown", () => {
		const rendered = formatTier3Comparison(
			buildTier3Report(
				tier3Story,
				mainaSteps,
				speckitSteps,
				["Maina faster on verify"],
				{ maina: mainaMeta, speckit: speckitMeta },
			),
		);

		// Header
		expectAll(rendered, [
			"Tier 3 Benchmark: auth-flow",
			"Step",
			"Maina (ms)",
			"SpecKit (ms)",
			"Maina (tokens)",
			"SpecKit (tokens)",
		]);

		// Step rows — check a few step names appear
		expectAll(rendered, ["Clarify", "Implement", "Verify", "Final Check"]);

		// Totals: 2400 is the maina total ms, 2650 the speckit total ms
		expectAll(rendered, ["TOTAL", "2400", "2650"]);

		// Findings summary
		expectAll(rendered, [
			"bugs introduced: 1",
			"bugs caught: 3",
			"tests: 24/25",
			"tests: 20/25",
		]);

		// Winner
		expect(rendered).toContain("Winner: maina");

		// Learnings
		expectAll(rendered, ["Learnings:", "Maina faster on verify"]);
	});

	test("shows dash for missing steps", () => {
		// SpecKit only has the clarify step; every other maina step is missing.
		const speckitClarifyOnly: Record<string, StepMetrics> = {
			clarify: makeStep({
				name: "Clarify",
				durationMs: 250,
				tokensInput: 1200,
				tokensOutput: 600,
			}),
		};
		const emptyMeta = {
			bugsIntroduced: 0,
			bugsCaught: 0,
			testsPassed: 0,
			testsTotal: 0,
		};
		const rendered = formatTier3Comparison(
			buildTier3Report(tier3Story, mainaSteps, speckitClarifyOnly, [], {
				maina: mainaMeta,
				speckit: emptyMeta,
			}),
		);

		// Speckit should show dashes for steps it doesn't have.
		// We only check that "—" appears somewhere in the output.
		expect(rendered).toContain("—");
	});

	test("omits learnings section when empty", () => {
		const rendered = formatTier3Comparison(
			buildTier3Report(tier3Story, mainaSteps, speckitSteps, [], {
				maina: mainaMeta,
				speckit: speckitMeta,
			}),
		);

		expect(rendered).not.toContain("Learnings:");
	});

	test("formats incomplete report correctly", () => {
		const rendered = formatTier3Comparison(
			buildTier3Report(tier3Story, mainaSteps, {}, []),
		);

		expect(rendered).toContain("Winner: incomplete");
	});
});
@@ -0,0 +1,113 @@
1
+ import { afterEach, beforeEach, describe, expect, test } from "bun:test";
2
+ import { mkdirSync, rmSync, writeFileSync } from "node:fs";
3
+ import { join } from "node:path";
4
+
5
+ import { parseTestOutput, runBenchmark } from "../runner";
6
+
7
// Scratch directory for the current test; recreated before and removed after each test.
let tmpDir: string;

beforeEach(() => {
	// Timestamp plus random base-36 suffix keeps directory names unique per test.
	const stamp = Date.now();
	const suffix = Math.random().toString(36).slice(2);
	tmpDir = join(import.meta.dir, `tmp-runner-${stamp}-${suffix}`);
	mkdirSync(tmpDir, { recursive: true });
});

afterEach(() => {
	// Best-effort cleanup: a failed removal must not fail the test.
	try {
		rmSync(tmpDir, { force: true, recursive: true });
	} catch {
		// ignore
	}
});
24
+
25
// parseTestOutput: extracts pass/fail/total counts from bun test output text.
describe("parseTestOutput", () => {
	test("parses bun test output with pass and fail counts", () => {
		const lines = [
			"bun test v1.3.8",
			"15 pass",
			"3 fail",
			"42 expect() calls",
			"Ran 18 tests across 1 file. [120.00ms]",
		];

		const { passed, failed, total } = parseTestOutput(lines.join("\n"));
		expect(passed).toBe(15);
		expect(failed).toBe(3);
		expect(total).toBe(18);
	});

	test("parses output with only passes", () => {
		const lines = [
			"bun test v1.3.8",
			"18 pass",
			"0 fail",
			"Ran 18 tests across 1 file. [100.00ms]",
		];

		const { passed, failed, total } = parseTestOutput(lines.join("\n"));
		expect(passed).toBe(18);
		expect(failed).toBe(0);
		expect(total).toBe(18);
	});

	test("returns zeros for unparseable output", () => {
		const { passed, failed, total } = parseTestOutput("something went wrong");
		expect(passed).toBe(0);
		expect(failed).toBe(0);
		expect(total).toBe(0);
	});
});
58
+
59
// runBenchmark: runs generated bun test files from the scratch directory and reports metrics.
describe("runBenchmark", () => {
	/** Write a bun test file into tmpDir (one source line per entry) and return its path. */
	const writeSuite = (fileName: string, sourceLines: string[]): string => {
		const path = join(tmpDir, fileName);
		writeFileSync(path, `${sourceLines.join("\n")}\n`);
		return path;
	};

	test("runs test file in temp dir and returns metrics", async () => {
		// Create a simple passing test
		const suitePath = writeSuite("test.ts", [
			'import { test, expect } from "bun:test";',
			'test("1+1=2", () => { expect(1+1).toBe(2); });',
			'test("true", () => { expect(true).toBe(true); });',
		]);

		const outcome = await runBenchmark({
			pipeline: "maina",
			storyName: "test-story",
			testFiles: [suitePath],
			implDir: tmpDir,
		});

		expect(outcome.ok).toBe(true);
		if (!outcome.ok) {
			return;
		}
		const metrics = outcome.value;
		expect(metrics.pipeline).toBe("maina");
		expect(metrics.storyName).toBe("test-story");
		expect(metrics.testsPassed).toBe(2);
		expect(metrics.testsFailed).toBe(0);
		expect(metrics.testsTotal).toBe(2);
		expect(metrics.wallClockMs).toBeGreaterThan(0);
	});

	test("captures failures in metrics", async () => {
		// One passing and one failing test.
		const suitePath = writeSuite("fail.ts", [
			'import { test, expect } from "bun:test";',
			'test("pass", () => { expect(true).toBe(true); });',
			'test("fail", () => { expect(1).toBe(2); });',
		]);

		const outcome = await runBenchmark({
			pipeline: "maina",
			storyName: "fail-story",
			testFiles: [suitePath],
			implDir: tmpDir,
		});

		expect(outcome.ok).toBe(true);
		if (!outcome.ok) {
			return;
		}
		const metrics = outcome.value;
		expect(metrics.testsPassed).toBe(1);
		expect(metrics.testsFailed).toBe(1);
		expect(metrics.testsTotal).toBe(2);
	});
});