@stupify/cli 0.0.15 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. package/.review/CORPUS.md +73 -0
  2. package/.review/REVIEW-PROMPT.md +52 -0
  3. package/.review/RUBRIC.md +46 -0
  4. package/LICENSE +1 -1
  5. package/README.md +41 -39
  6. package/package.json +24 -25
  7. package/src/cli.ts +358 -0
  8. package/src/review-sweep.ts +492 -0
  9. package/dist/analysis.d.ts +0 -16
  10. package/dist/analysis.js +0 -165
  11. package/dist/cache.d.ts +0 -2
  12. package/dist/cache.js +0 -57
  13. package/dist/checks.d.ts +0 -4
  14. package/dist/checks.js +0 -228
  15. package/dist/command.d.ts +0 -2
  16. package/dist/command.js +0 -147
  17. package/dist/constants.d.ts +0 -4
  18. package/dist/constants.js +0 -53
  19. package/dist/counter-scout.d.ts +0 -21
  20. package/dist/counter-scout.js +0 -167
  21. package/dist/diff.d.ts +0 -1
  22. package/dist/diff.js +0 -10
  23. package/dist/doctor.d.ts +0 -4
  24. package/dist/doctor.js +0 -131
  25. package/dist/git.d.ts +0 -12
  26. package/dist/git.js +0 -298
  27. package/dist/hooks.d.ts +0 -3
  28. package/dist/hooks.js +0 -117
  29. package/dist/index.d.ts +0 -1
  30. package/dist/index.js +0 -1
  31. package/dist/model.d.ts +0 -11
  32. package/dist/model.js +0 -296
  33. package/dist/prompts.d.ts +0 -8
  34. package/dist/prompts.js +0 -89
  35. package/dist/render.d.ts +0 -3
  36. package/dist/render.js +0 -151
  37. package/dist/repomix-provider.d.ts +0 -12
  38. package/dist/repomix-provider.js +0 -196
  39. package/dist/search-bench.d.ts +0 -1
  40. package/dist/search-bench.js +0 -677
  41. package/dist/search-profile.d.ts +0 -6
  42. package/dist/search-profile.js +0 -73
  43. package/dist/sem-provider.d.ts +0 -2
  44. package/dist/sem-provider.js +0 -252
  45. package/dist/stupify.d.ts +0 -38
  46. package/dist/stupify.js +0 -474
  47. package/dist/trace.d.ts +0 -31
  48. package/dist/trace.js +0 -86
  49. package/dist/types.d.ts +0 -328
  50. package/dist/types.js +0 -6
  51. package/dist/ui.d.ts +0 -34
  52. package/dist/ui.js +0 -143
  53. package/src/analysis.ts +0 -220
  54. package/src/cache.ts +0 -63
  55. package/src/checks.ts +0 -231
  56. package/src/command.ts +0 -173
  57. package/src/constants.ts +0 -56
  58. package/src/counter-scout.ts +0 -195
  59. package/src/diff.ts +0 -9
  60. package/src/doctor.ts +0 -140
  61. package/src/git.ts +0 -306
  62. package/src/hooks.ts +0 -134
  63. package/src/index.ts +0 -1
  64. package/src/model.ts +0 -367
  65. package/src/prompts.ts +0 -100
  66. package/src/render.ts +0 -154
  67. package/src/repomix-provider.ts +0 -219
  68. package/src/search-bench.ts +0 -783
  69. package/src/search-profile.ts +0 -89
  70. package/src/sem-provider.ts +0 -297
  71. package/src/stupify.ts +0 -571
  72. package/src/trace.ts +0 -126
  73. package/src/types.ts +0 -348
  74. package/src/ui.ts +0 -187
@@ -1,783 +0,0 @@
1
- import { execFile } from "node:child_process";
2
- import { mkdir, mkdtemp, readFile, readdir, rm, writeFile } from "node:fs/promises";
3
- import { tmpdir } from "node:os";
4
- import path from "node:path";
5
- import { promisify } from "node:util";
6
- import type {
7
- SearchBenchConfig,
8
- SearchBenchCommitReplay,
9
- SearchBenchReplayRun,
10
- SearchBenchRun,
11
- SearchBenchSmokeRun,
12
- SearchFixture,
13
- SearchFixtureExpectation,
14
- SearchMatch,
15
- SearchProfile,
16
- SearchRunJson,
17
- } from "./types.ts";
18
-
19
/** Promise-returning form of `execFile`, used for the git and CLI subprocesses in this module. */
const execFileAsync = promisify(execFile);

/** One leaderboard row: the aggregated fixture and smoke results of a single search profile. */
type ProfileResult = {
  readonly profileId: string;
  /** Sum of the per-fixture scores of this profile. */
  readonly fixtureScore: number;
  readonly falsePositives: number;
  readonly falseNegatives: number;
  readonly truePositives: number;
  readonly trueNegatives: number;
  /** Matches whose pattern was not among the active expectations of the fixture. */
  readonly wrongPatterns: number;
  /** Matches on a pattern the fixture explicitly expected not to match. */
  readonly assignedCheckFalsePositives: number;
  /** Mean fixture runtime in milliseconds. */
  readonly avgMs: number;
  readonly smokeMatches: number;
  readonly smokeSkipped: number;
  readonly matchesUsingCounterReasonAsProof: number;
  /** Human-readable verdict for the profile. */
  readonly decision: string;
};
36
-
37
/** Shape of the `summary.json` document written at the end of a bench run. */
type BenchSummary = {
  readonly name: string;
  readonly outputDir: string;
  /** ISO timestamp taken when the bench started. */
  readonly generatedAt: string;
  readonly runs: readonly SearchBenchRun[];
  readonly realReplayRuns: readonly SearchBenchReplayRun[];
  readonly leaderboard: readonly ProfileResult[];
  readonly perCheck: readonly CheckResult[];
};
46
-
47
/** Per-check (per-pattern) confusion counts aggregated over all fixture runs. */
type CheckResult = {
  readonly checkId: string;
  readonly truePositives: number;
  readonly falsePositives: number;
  readonly falseNegatives: number;
  /** Matches on this check in fixtures that held no active expectation for it. */
  readonly wrongPatterns: number;
  /** Matches on this check in fixtures that expected it not to match. */
  readonly assignedCheckFalsePositives: number;
  readonly decision: string;
};
56
-
57
/**
 * Runs the search benchmark described by a JSON config file.
 *
 * Every profile is run against every fixture and every configured smoke run,
 * then the optional commit replays are executed. Per-run files, `summary.json`,
 * the leaderboard and the replay reports are written below
 * `experiments/results/<name>-<timestamp>`, resolved from the current working
 * directory.
 *
 * @param configPath Path to the bench config JSON file.
 * @returns A completion message containing the output directory and the leaderboard.
 */
export async function runSearchBench(configPath: string): Promise<string> {
  const startedAt = new Date();
  const configFile = path.resolve(configPath);
  const configDir = path.dirname(configFile);
  const rawConfig = await readFile(configFile, "utf8");
  const config = JSON.parse(rawConfig) as SearchBenchConfig;

  // ':' and '.' are swapped out so the timestamp is usable as a directory name.
  const stamp = startedAt.toISOString().replace(/[:.]/g, "-");
  const outputDir = path.resolve("experiments/results", `${safeSegment(config.name)}-${stamp}`);
  const profilesDir = path.join(outputDir, "profiles");
  const runsDir = path.join(outputDir, "runs");
  const replayDir = path.join(outputDir, "real-replay");
  for (const dir of [profilesDir, runsDir, replayDir]) {
    await mkdir(dir, { recursive: true });
  }

  // Snapshot every profile next to the results, remembering where it came from.
  const profilePaths = await resolveProfilePaths(config.profiles, configDir);
  const profiles = await Promise.all(profilePaths.map(readProfile));
  await Promise.all(
    profiles.map(({ profile, filePath }) => {
      const snapshotPath = path.join(profilesDir, `${safeSegment(profile.id)}.json`);
      return writeFile(snapshotPath, JSON.stringify({ source: filePath, ...profile }, null, 2));
    }),
  );

  const fixturePaths = await resolveGlob(config.fixtures, configDir);
  const fixtures = await Promise.all(fixturePaths.map(readFixture));
  const allRuns: SearchBenchRun[] = [];
  const replayRuns: SearchBenchReplayRun[] = [];
  const smokeRuns = config.realSmokeRuns ?? [];

  // Runs are awaited one at a time, profile by profile.
  for (const { profile, filePath: profilePath } of profiles) {
    for (const { fixture } of fixtures) {
      const fixtureRun = await runFixture(profile.id, profilePath, fixture);
      allRuns.push(fixtureRun);
      await writeRunFiles(runsDir, `${fixture.id}__${profile.id}`, fixtureRun, fixture.description);
    }
    for (const smoke of smokeRuns) {
      const smokeRun = await runSmoke(profile.id, profilePath, smoke);
      allRuns.push(smokeRun);
      await writeRunFiles(runsDir, `${smoke.id}__${profile.id}`, smokeRun, "Real repo smoke run");
    }
  }

  for (const replay of config.realCommitReplay ?? []) {
    replayRuns.push(...(await runCommitReplay(replay, profiles, replayDir)));
  }

  const leaderboard = summarize(profiles.map((entry) => entry.profile), allRuns);
  const perCheck = summarizeByCheck(allRuns);
  const summary: BenchSummary = {
    name: config.name,
    outputDir,
    generatedAt: startedAt.toISOString(),
    runs: allRuns,
    realReplayRuns: replayRuns,
    leaderboard,
    perCheck,
  };

  const emit = (fileName: string, content: string): Promise<void> =>
    writeFile(path.join(outputDir, fileName), content);
  await emit("summary.json", JSON.stringify(summary, null, 2));
  const leaderboardText = renderLeaderboard(leaderboard, perCheck);
  await emit("leaderboard.md", leaderboardText);
  await emit("real-replay-summary.json", JSON.stringify(replayRuns, null, 2));
  await emit("real-replay.md", renderReplayMarkdown(replayRuns));
  await emit("real-replay-review.md", renderReplayReviewMarkdown(replayRuns));

  return ["Search bench complete.", `Results: ${outputDir}`, "", leaderboardText].join("\n");
}
125
-
126
/**
 * Scores one profile against one fixture.
 *
 * The fixture patch is applied and staged inside a throwaway git repository,
 * the CLI is run there with `--staged --json`, and the result is scored against
 * the fixture expectations. Any failure becomes an error run; the temporary
 * repository is removed in all cases.
 *
 * @param profileId Id recorded on the resulting run.
 * @param profilePath Profile file handed to the CLI via `--search-profile`.
 * @param fixture Fixture providing the staged patch and the expectations.
 */
async function runFixture(profileId: string, profilePath: string, fixture: SearchFixture): Promise<SearchBenchRun> {
  const identity = { fixtureId: fixture.id, expected: fixture.expected };
  const repoDir = await mkdtemp(path.join(tmpdir(), "stupify-search-fixture-"));
  try {
    await execFileAsync("git", ["init", "-q"], { cwd: repoDir });

    const patchFile = path.join(repoDir, "fixture.patch");
    await writeFile(patchFile, fixture.stagedPatch);
    await execFileAsync("git", ["apply", "--recount", "--whitespace=nowarn", patchFile], {
      cwd: repoDir,
      maxBuffer: 32 * 1024 * 1024,
    });
    // The patch file is deleted before staging so that `git add -A` does not pick it up.
    await rm(patchFile, { force: true });
    await execFileAsync("git", ["add", "-A"], { cwd: repoDir });

    const cliResult = await runCli(repoDir, ["--staged", "--json", "--search-profile", profilePath]);
    const unscored = resultToBenchRun(profileId, cliResult, identity);
    return { ...unscored, score: scoreFixtureRun(unscored, fixture.expected) };
  } catch (failure) {
    return errorRun(profileId, identity, failure);
  } finally {
    await rm(repoDir, { recursive: true, force: true });
  }
}
147
-
148
/**
 * Runs the CLI for one profile inside a real repository ("smoke run").
 *
 * When the smoke working directory cannot be resolved, a skipped run with a
 * fixed score of -5 is returned and the CLI is not invoked. CLI failures are
 * turned into error runs.
 *
 * @param profileId Id recorded on the resulting run.
 * @param profilePath Profile file handed to the CLI via `--search-profile`.
 * @param smoke Smoke-run definition (id, extra CLI args, optional cwd).
 */
async function runSmoke(profileId: string, profilePath: string, smoke: SearchBenchSmokeRun): Promise<SearchBenchRun> {
  const workDir = resolveSmokeCwd(smoke.cwd);
  if (workDir) {
    try {
      const cliResult = await runCli(workDir, [...smoke.args, "--json", "--search-profile", profilePath]);
      const unscored = resultToBenchRun(profileId, cliResult, { smokeId: smoke.id });
      return { ...unscored, score: scoreSmokeRun(unscored) };
    } catch (failure) {
      return errorRun(profileId, { smokeId: smoke.id }, failure);
    }
  }

  // No usable working directory: report a skipped run.
  return {
    profileId,
    smokeId: smoke.id,
    elapsedMs: 0,
    modelCalls: 0,
    patterns: [],
    targets: 0,
    targetsByPattern: {},
    inputTokens: 0,
    skipped: true,
    skipReason: "missing_cwd",
    matches: [],
    targetsPreview: [],
    matchesUsingCounterReasonAsProof: 0,
    score: -5,
    error: "Smoke cwd is not configured. Set BEVYL_REPO or provide cwd.",
  };
}
180
-
181
/** A commit selected for replay: the full hash plus its first seven characters. */
type ReplayCommit = { readonly sha: string; readonly shortSha: string };

/**
 * Replays recent commits of a real repository against the requested profiles.
 *
 * Each (commit, profile) pair yields one run, which is also written to
 * `replayDir` as `<replay>__<shortSha>__<profile>.json`. If the repository
 * directory cannot be resolved, one error run per requested profile is
 * returned and nothing is written.
 *
 * @param replay Replay definition (repository, commit selection, profile ids).
 * @param profiles Loaded profiles; looked up by `profile.id`.
 * @param replayDir Directory receiving the per-run JSON files.
 */
async function runCommitReplay(
  replay: SearchBenchCommitReplay,
  profiles: readonly Readonly<{ filePath: string; profile: SearchProfile }>[],
  replayDir: string,
): Promise<readonly SearchBenchReplayRun[]> {
  const repoDir = resolveReplayCwd(replay);
  if (!repoDir) {
    const placeholder: ReplayCommit = { sha: "", shortSha: "(none)" };
    return replay.profiles.map((profileId) =>
      replayErrorRun(
        replay.id,
        profileId,
        placeholder,
        new Error(`Replay cwd is not configured. Set ${replay.repoEnv ?? "repo env"} or provide cwd.`),
      ),
    );
  }

  const commits = await replayCommits(repoDir, replay);
  const byId = new Map(profiles.map((entry) => [entry.profile.id, entry]));
  const collected: SearchBenchReplayRun[] = [];

  for (const commit of commits) {
    for (const profileId of replay.profiles) {
      const known = byId.get(profileId);
      let run: SearchBenchReplayRun;
      if (known) {
        run = await runReplayCommit(repoDir, replay.id, commit, known.profile.id, known.filePath);
      } else {
        // A profile id that was never loaded is reported as an error run.
        run = replayErrorRun(replay.id, profileId, commit, new Error(`Unknown replay profile: ${profileId}`));
      }
      collected.push(run);

      const fileName = `${safeSegment(replay.id)}__${safeSegment(commit.shortSha)}__${safeSegment(profileId)}.json`;
      await writeFile(path.join(replayDir, fileName), JSON.stringify(run, null, 2));
    }
  }
  return collected;
}
211
-
212
/**
 * Lists the commits to replay via `git log --format=%H -<limit>`.
 *
 * `--no-merges` and `--since=<value>` are appended when the replay definition
 * sets `nonMerge` / `since`.
 *
 * @param cwd Repository in which `git log` is executed.
 * @param replay Replay definition providing `limit`, `nonMerge` and `since`.
 * @returns Commits in the order printed by git.
 */
async function replayCommits(cwd: string, replay: SearchBenchCommitReplay): Promise<readonly ReplayCommit[]> {
  const gitArgs = [
    "log",
    "--format=%H",
    `-${replay.limit}`,
    ...(replay.nonMerge ? ["--no-merges"] : []),
    ...(replay.since ? [`--since=${replay.since}`] : []),
  ];
  const { stdout } = await execFileAsync("git", gitArgs, { cwd, maxBuffer: 32 * 1024 * 1024 });

  const commits: ReplayCommit[] = [];
  for (const line of stdout.split(/\r?\n/)) {
    const sha = line.trim();
    if (sha) commits.push({ sha, shortSha: sha.slice(0, 7) });
  }
  return commits;
}
223
-
224
/**
 * Replays a single commit as if it were staged.
 *
 * A detached worktree is created at the commit's first parent, the
 * parent..commit diff is applied to that worktree's index (`git apply --cached`)
 * and the CLI is run there with `--staged --json`. Failures are converted into
 * error runs. The worktree, or the bare temporary directory when the worktree
 * was never created, is cleaned up afterwards.
 *
 * @param repoCwd Repository that owns the commit.
 * @param replayId Replay id recorded on the run.
 * @param commit Commit to replay.
 * @param profileId Profile id recorded on the run.
 * @param profilePath Profile file handed to the CLI via `--search-profile`.
 */
async function runReplayCommit(
  repoCwd: string,
  replayId: string,
  commit: ReplayCommit,
  profileId: string,
  profilePath: string,
): Promise<SearchBenchReplayRun> {
  const worktreeDir = await mkdtemp(path.join(tmpdir(), "stupify-replay-"));
  let worktreeAdded = false;
  try {
    const parent = `${commit.sha}^`;
    const stats = await commitStats(repoCwd, parent, commit.sha);

    await execFileAsync("git", ["worktree", "add", "--detach", worktreeDir, parent], {
      cwd: repoCwd,
      maxBuffer: 64 * 1024 * 1024,
    });
    worktreeAdded = true;

    const diff = await execFileAsync("git", ["diff", "--binary", parent, commit.sha], {
      cwd: repoCwd,
      maxBuffer: 128 * 1024 * 1024,
    });
    const patchFile = path.join(worktreeDir, "commit.patch");
    await writeFile(patchFile, diff.stdout);
    await execFileAsync("git", ["apply", "--cached", "--whitespace=nowarn", patchFile], {
      cwd: worktreeDir,
      maxBuffer: 128 * 1024 * 1024,
    });
    await rm(patchFile, { force: true });

    const cliResult = await runCli(worktreeDir, ["--staged", "--json", "--search-profile", profilePath]);
    return replayResult(replayId, profileId, commit, cliResult, stats);
  } catch (failure) {
    return replayErrorRun(replayId, profileId, commit, failure);
  } finally {
    if (!worktreeAdded) {
      await rm(worktreeDir, { recursive: true, force: true });
    } else {
      try {
        await execFileAsync("git", ["worktree", "remove", "--force", worktreeDir], {
          cwd: repoCwd,
          maxBuffer: 64 * 1024 * 1024,
        });
      } catch {
        // Fallback when git cannot remove the worktree: delete the directory
        // ourselves, then run `git worktree prune` on a best-effort basis.
        await rm(worktreeDir, { recursive: true, force: true });
        await execFileAsync("git", ["worktree", "prune"], { cwd: repoCwd }).catch(() => undefined);
      }
    }
  }
}
258
-
259
/**
 * Sums `git diff --numstat` output between two revisions.
 *
 * Every non-empty output line counts as one changed file. Added/deleted
 * columns that are not finite numbers contribute 0 (see `numericStat`).
 *
 * @param cwd Repository in which git is executed.
 * @param parent Base revision.
 * @param target Revision compared against `parent`.
 */
async function commitStats(cwd: string, parent: string, target: string): Promise<Readonly<{ changedFiles: number; addedLines: number; deletedLines: number }>> {
  const { stdout } = await execFileAsync("git", ["diff", "--numstat", parent, target], { cwd, maxBuffer: 32 * 1024 * 1024 });
  const totals = { changedFiles: 0, addedLines: 0, deletedLines: 0 };
  for (const row of stdout.split(/\r?\n/)) {
    if (!row) continue;
    const [addedColumn, deletedColumn] = row.split(/\s+/);
    totals.changedFiles += 1;
    totals.addedLines += numericStat(addedColumn);
    totals.deletedLines += numericStat(deletedColumn);
  }
  return totals;
}
272
-
273
/**
 * Builds the replay run record for a CLI invocation that completed.
 *
 * @param replayId Replay id recorded on the run.
 * @param profileId Profile id recorded on the run.
 * @param commit Replayed commit; its short hash becomes `commitId`.
 * @param result Parsed CLI JSON output.
 * @param stats Diff statistics of the replayed commit.
 */
function replayResult(
  replayId: string,
  profileId: string,
  commit: ReplayCommit,
  result: SearchRunJson,
  stats: Readonly<{ changedFiles: number; addedLines: number; deletedLines: number }>,
): SearchBenchReplayRun {
  const cli = result.stats;
  const { changedFiles, addedLines, deletedLines } = stats;
  // Key order is kept stable because this record is serialised to JSON.
  return {
    replayId,
    profileId,
    commitId: commit.shortSha,
    outcome: replayOutcome(result),
    changedFiles,
    addedLines,
    deletedLines,
    elapsedMs: cli.elapsedMs,
    skipped: cli.skipped ?? false,
    skipReason: cli.skipReason,
    // `searchTargets` wins over `candidates`; 0 when neither is reported.
    targets: cli.searchTargets ?? cli.candidates ?? 0,
    inputTokens: cli.inputTokens ?? 0,
    repomixPackedTokens: cli.repomixTokens,
    modelCalls: cli.modelCalls,
    matches: result.matches,
    matchesByPattern: countMatches(result.matches),
  };
}
299
-
300
/**
 * Classifies a finished replay run.
 *
 * The two recognised skip reasons take precedence; otherwise the outcome
 * depends only on whether any match was reported.
 */
function replayOutcome(result: SearchRunJson): SearchBenchReplayRun["outcome"] {
  switch (result.stats.skipReason) {
    case "input_too_large":
      return "skipped_input_too_large";
    case "no_candidates":
      return "no_candidates";
    default:
      return result.matches.length > 0 ? "ran_with_matches" : "ran_no_matches";
  }
}
306
-
307
/**
 * Builds the replay run record for a replay that failed.
 *
 * All counters are zero, the run is flagged as skipped with reason `"error"`,
 * and `error` carries the failure message.
 *
 * @param replayId Replay id recorded on the run.
 * @param profileId Profile id recorded on the run.
 * @param commit Commit being replayed; its short hash becomes `commitId`.
 * @param error The thrown value; non-`Error` values are stringified.
 */
function replayErrorRun(
  replayId: string,
  profileId: string,
  commit: ReplayCommit,
  error: unknown,
): SearchBenchReplayRun {
  let message: string;
  if (error instanceof Error) {
    message = error.message;
  } else {
    message = String(error);
  }
  // Key order is kept stable because this record is serialised to JSON.
  return {
    replayId,
    profileId,
    commitId: commit.shortSha,
    outcome: "error",
    changedFiles: 0,
    addedLines: 0,
    deletedLines: 0,
    elapsedMs: 0,
    skipped: true,
    skipReason: "error",
    targets: 0,
    inputTokens: 0,
    modelCalls: 0,
    matches: [],
    matchesByPattern: {},
    error: message,
  };
}
332
-
333
/**
 * Re-invokes the current CLI entrypoint (`process.argv[1]`) in `cwd` and parses
 * its stdout as JSON.
 *
 * @param cwd Working directory of the child process.
 * @param args Arguments appended after the CLI script path.
 * @returns The parsed result. `stats.elapsedMs` falls back to the wall-clock
 *          time measured here when the CLI reports none (or 0).
 * @throws Error when the CLI entrypoint is unknown, when the child process
 *         fails, or when stdout is not a JSON document with a `stats` object.
 */
async function runCli(cwd: string, args: readonly string[]): Promise<SearchRunJson> {
  const startedAt = Date.now();
  const cliPath = process.argv[1];
  if (!cliPath) throw new Error("Could not resolve current CLI entrypoint.");
  const { stdout } = await execFileAsync(process.execPath, [cliPath, ...args], {
    cwd,
    env: process.env,
    maxBuffer: 128 * 1024 * 1024,
  });

  let parsed: SearchRunJson;
  try {
    parsed = JSON.parse(stdout) as SearchRunJson;
  } catch (error) {
    // A bare SyntaxError says nothing about which run failed or what the CLI
    // printed, so attach the start of the output.
    const detail = error instanceof Error ? error.message : String(error);
    throw new Error(`CLI did not print valid JSON (${detail}). Output started with: ${stdout.slice(0, 200)}`);
  }
  if (parsed === null || typeof parsed !== "object" || !parsed.stats) {
    throw new Error(`CLI JSON output has no stats object. Output started with: ${stdout.slice(0, 200)}`);
  }

  return {
    ...parsed,
    stats: {
      ...parsed.stats,
      // `||` is deliberate: a reported 0 is treated like a missing measurement.
      elapsedMs: parsed.stats.elapsedMs || Date.now() - startedAt,
    },
  };
}
351
-
352
/**
 * Converts parsed CLI output into an unscored bench run record.
 *
 * @param profileId Profile id recorded on the run.
 * @param result Parsed CLI JSON output.
 * @param identity Fixture or smoke id, plus the fixture expectations if any.
 */
function resultToBenchRun(
  profileId: string,
  result: SearchRunJson,
  identity: Readonly<{ fixtureId?: string; smokeId?: string; expected?: readonly SearchFixtureExpectation[] }>,
): SearchBenchRun {
  const cli = result.stats;
  const { fixtureId, smokeId, expected } = identity;
  // Key order is kept stable because this record is serialised to JSON.
  return {
    profileId,
    fixtureId,
    smokeId,
    elapsedMs: cli.elapsedMs,
    modelCalls: cli.modelCalls,
    patterns: result.patterns,
    // `searchTargets` wins over `candidates`; 0 when neither is reported.
    targets: cli.searchTargets ?? cli.candidates ?? 0,
    targetsByPattern: cli.targetsByPattern ?? {},
    inputTokens: cli.inputTokens ?? 0,
    repomixPackedTokens: cli.repomixTokens,
    skipped: cli.skipped ?? false,
    skipReason: cli.skipReason,
    matches: result.matches,
    expected,
    targetsPreview: cli.targetsPreview ?? [],
    matchesUsingCounterReasonAsProof: countCounterReasonProofs(result.matches),
  };
}
376
-
377
/**
 * Builds the bench run record for a fixture or smoke run that failed.
 *
 * The run is flagged as skipped with reason `"error"`. The fixed score is -3
 * when a fixture id is present and -5 otherwise.
 *
 * @param profileId Profile id recorded on the run.
 * @param identity Fixture or smoke id, plus the fixture expectations if any.
 * @param error The thrown value; non-`Error` values are stringified.
 */
function errorRun(
  profileId: string,
  identity: Readonly<{ fixtureId?: string; smokeId?: string; expected?: readonly SearchFixtureExpectation[] }>,
  error: unknown,
): SearchBenchRun {
  const { fixtureId, smokeId, expected } = identity;
  let message: string;
  if (error instanceof Error) {
    message = error.message;
  } else {
    message = String(error);
  }
  // Key order is kept stable because this record is serialised to JSON.
  return {
    profileId,
    fixtureId,
    smokeId,
    elapsedMs: 0,
    modelCalls: 0,
    patterns: [],
    targets: 0,
    targetsByPattern: {},
    inputTokens: 0,
    skipped: true,
    skipReason: "error",
    matches: [],
    expected,
    targetsPreview: [],
    matchesUsingCounterReasonAsProof: 0,
    score: fixtureId ? -3 : -5,
    error: message,
  };
}
402
-
403
/**
 * Scores a fixture run against the fixture expectations.
 *
 * Only expectations whose pattern was active in the run are considered:
 * +5 expected match found, -4 expected match missed, +2 expected silence kept,
 * -10 expected silence broken. Each match on a pattern without an active
 * expectation costs 6, a skipped run that should have matched something starts
 * at -3, and small runtime and input-token penalties are subtracted last.
 *
 * @returns The score rounded by `round`.
 */
function scoreFixtureRun(run: SearchBenchRun, expected: readonly SearchFixtureExpectation[]): number {
  const activePatterns = new Set(run.patterns.map((pattern) => pattern as string));
  const relevant = expected.filter((item) => activePatterns.has(item.patternId));
  const relevantIds = new Set(relevant.map((item) => item.patternId));
  const hitsByPattern = countMatches(run.matches);

  const missedBySkipping = run.skipped && relevant.some((item) => item.shouldMatch);
  let score = missedBySkipping ? -3 : 0;

  for (const item of relevant) {
    const matched = (hitsByPattern[item.patternId] ?? 0) > 0;
    if (item.shouldMatch) {
      score += matched ? 5 : -4;
    } else {
      score += matched ? -10 : 2;
    }
  }

  for (const match of run.matches) {
    if (!relevantIds.has(match.patternId as string)) score -= 6;
  }

  score -= (run.elapsedMs / 1000) * 0.05;
  score -= (run.inputTokens / 1000) * 0.001;
  return round(score);
}
424
-
425
/**
 * Scores a smoke run. The score starts at 0 and only ever goes down:
 * -5 skipped, -3 more than three matches, -5 slower than 60 s, -5 skipped with
 * more than 12 000 input tokens, then the same small runtime and token
 * penalties used for fixture runs.
 *
 * @returns The score rounded by `round`.
 */
function scoreSmokeRun(run: SearchBenchRun): number {
  const flatPenalty =
    (run.skipped ? 5 : 0) +
    (run.matches.length > 3 ? 3 : 0) +
    (run.elapsedMs > 60_000 ? 5 : 0) +
    (run.inputTokens > 12_000 && run.skipped ? 5 : 0);

  let score = 0 - flatPenalty;
  score -= (run.elapsedMs / 1000) * 0.05;
  score -= (run.inputTokens / 1000) * 0.001;
  return round(score);
}
435
-
436
/**
 * Builds the leaderboard: one row per profile, sorted by descending fixture score.
 *
 * Confusion counts, average runtime, score and counter-reason proofs come from
 * the profile's fixture runs; smoke runs contribute only the smoke columns and
 * the decision.
 *
 * @param profiles Profiles to report on.
 * @param runs All fixture and smoke runs of the bench.
 */
function summarize(
  profiles: readonly SearchProfile[],
  runs: readonly SearchBenchRun[],
): readonly ProfileResult[] {
  const rows: ProfileResult[] = [];

  for (const profile of profiles) {
    const fixtureRuns = runs.filter((run) => run.profileId === profile.id && run.fixtureId);
    const smokeRuns = runs.filter((run) => run.profileId === profile.id && run.smokeId);

    const counts = emptyCounts();
    let positiveFixtureCount = 0;
    let totalMs = 0;
    let totalScore = 0;
    let counterProofs = 0;
    for (const run of fixtureRuns) {
      addFixtureCounts(counts, run);
      totalMs += run.elapsedMs;
      totalScore += run.score ?? 0;
      counterProofs += run.matchesUsingCounterReasonAsProof;
      // Expected matches count towards recall only if their pattern was active in the run.
      for (const item of run.expected ?? []) {
        const active = run.patterns.some((pattern) => pattern === item.patternId);
        if (active && item.shouldMatch) positiveFixtureCount += 1;
      }
    }

    let smokeMatches = 0;
    let smokeSkipped = 0;
    for (const run of smokeRuns) {
      smokeMatches += run.matches.length;
      if (run.skipped) smokeSkipped += 1;
    }

    const avgMs = fixtureRuns.length === 0 ? 0 : totalMs / fixtureRuns.length;
    const decision = decisionForProfile(counts, positiveFixtureCount, smokeRuns);
    rows.push({
      profileId: profile.id,
      fixtureScore: round(totalScore),
      falsePositives: counts.fp,
      falseNegatives: counts.fn,
      truePositives: counts.tp,
      trueNegatives: counts.tn,
      wrongPatterns: counts.wp,
      assignedCheckFalsePositives: counts.assignedFp,
      avgMs: Math.round(avgMs),
      smokeMatches,
      smokeSkipped,
      matchesUsingCounterReasonAsProof: counterProofs,
      decision,
    });
  }

  rows.sort((left, right) => right.fixtureScore - left.fixtureScore);
  return rows;
}
469
-
470
/**
 * Aggregates fixture runs into one confusion-count row per check (pattern id).
 *
 * Expectations are considered only when their pattern was active in the run.
 * A match on a pattern without an active expectation is booked on that pattern
 * as both a false positive and a wrong pattern.
 *
 * @returns Rows sorted by `checkId`.
 */
function summarizeByCheck(runs: readonly SearchBenchRun[]): readonly CheckResult[] {
  const byCheck = new Map<string, ReturnType<typeof emptyCounts>>();
  const bucketFor = (checkId: string): ReturnType<typeof emptyCounts> => {
    let bucket = byCheck.get(checkId);
    if (!bucket) {
      bucket = emptyCounts();
      byCheck.set(checkId, bucket);
    }
    return bucket;
  };

  for (const run of runs) {
    if (!run.fixtureId) continue;

    const activePatterns = new Set(run.patterns.map((pattern) => pattern as string));
    const relevant = (run.expected ?? []).filter((item) => activePatterns.has(item.patternId));

    for (const item of relevant) {
      const bucket = bucketFor(item.patternId);
      const matched = run.matches.some((match) => match.patternId === item.patternId);
      if (item.shouldMatch) {
        if (matched) bucket.tp += 1;
        else bucket.fn += 1;
      } else if (matched) {
        bucket.fp += 1;
        bucket.assignedFp += 1;
      } else {
        bucket.tn += 1;
      }
    }

    const relevantIds = new Set(relevant.map((item) => item.patternId));
    for (const match of run.matches) {
      const id = match.patternId as string;
      if (relevantIds.has(id)) continue;
      const bucket = bucketFor(id);
      bucket.fp += 1;
      bucket.wp += 1;
    }
  }

  const rows = Array.from(byCheck, ([checkId, count]) => ({
    checkId,
    truePositives: count.tp,
    falsePositives: count.fp,
    falseNegatives: count.fn,
    wrongPatterns: count.wp,
    assignedCheckFalsePositives: count.assignedFp,
    decision: checkDecision(count),
  }));
  rows.sort((left, right) => left.checkId.localeCompare(right.checkId));
  return rows;
}
510
-
511
/**
 * Adds the confusion counts of one fixture run to `counts`.
 *
 * Expectations are considered only when their pattern was active in the run.
 * A match on a pattern without an active expectation increments both `fp` and
 * `wp`; a match on a pattern expected to stay silent increments `fp` and
 * `assignedFp`.
 *
 * @param counts Accumulator; mutated in place.
 * @param run Fixture run to account for.
 * @returns The same `counts` object, so the function can serve as a reducer.
 */
function addFixtureCounts(counts: ReturnType<typeof emptyCounts>, run: SearchBenchRun): ReturnType<typeof emptyCounts> {
  const activePatterns = new Set(run.patterns.map((pattern) => pattern as string));
  const activeExpected = (run.expected ?? []).filter((item) => activePatterns.has(item.patternId));
  const expectedPatterns = new Set(activeExpected.map((item) => item.patternId as string));
  // A Set is used instead of a plain-object tally so that ids colliding with
  // Object.prototype members (e.g. "constructor") are detected correctly.
  const matchedPatterns = new Set(run.matches.map((match) => match.patternId as string));

  for (const item of activeExpected) {
    const matched = matchedPatterns.has(item.patternId);
    if (item.shouldMatch) {
      if (matched) counts.tp += 1;
      else counts.fn += 1;
    } else if (matched) {
      counts.fp += 1;
      counts.assignedFp += 1;
    } else {
      counts.tn += 1;
    }
  }

  for (const match of run.matches) {
    if (expectedPatterns.has(match.patternId as string)) continue;
    counts.fp += 1;
    counts.wp += 1;
  }
  return counts;
}
536
-
537
/**
 * Creates a fresh, mutable confusion-count accumulator with every counter at 0
 * (`tp`, `tn`, `fp`, `fn`, `wp` = wrong pattern, `assignedFp`).
 */
function emptyCounts() {
  const zeroed = { tp: 0, tn: 0, fp: 0, fn: 0, wp: 0, assignedFp: 0 };
  return zeroed;
}
540
-
541
/**
 * Derives the leaderboard verdict for one profile.
 *
 * Checks run from most to least severe: false positives, wrong-pattern
 * matches, recall below 60% of the expected positive fixtures, noisy smoke
 * runs (more than three matches), slow smoke runs (over 60 s), skipped smoke
 * runs. A profile passing all of them is a "candidate hook default".
 *
 * @param counts Aggregated fixture confusion counts of the profile.
 * @param positiveFixtureCount Number of active expectations that should match.
 * @param smokeRuns Smoke runs of the profile.
 */
function decisionForProfile(
  counts: ReturnType<typeof emptyCounts>,
  positiveFixtureCount: number,
  smokeRuns: readonly SearchBenchRun[],
): string {
  // Every wrong-pattern match is also counted in `fp`, so testing `fp` first
  // would make the wrong-pattern verdict unreachable. Test the two specific
  // causes first and keep the generic `fp` test as a fallback.
  if (counts.assignedFp > 0) return "reject: false positives";
  if (counts.wp > 0) return "reject: wrong pattern";
  if (counts.fp > 0) return "reject: false positives";
  if (counts.tp < Math.ceil(positiveFixtureCount * 0.6)) return "reject: low recall";
  if (smokeRuns.some((run) => run.matches.length > 3)) return "reject: noisy smoke";
  if (smokeRuns.some((run) => run.elapsedMs > 60_000)) return "reject: slow smoke";
  if (smokeRuns.some((run) => run.skipped)) return "fixture candidate";
  return "candidate hook default";
}
554
-
555
/**
 * Derives the verdict for a single check from its confusion counts:
 * any false positive → "not search-safe"; misses but no hits → "blind";
 * more misses than hits → "low recall"; otherwise "candidate".
 */
function checkDecision(counts: ReturnType<typeof emptyCounts>): string {
  const { fp, tp, fn } = counts;
  if (fp > 0) {
    return "not search-safe";
  }
  const neverHit = tp === 0 && fn > 0;
  if (neverHit) {
    return "blind";
  }
  return fn > tp ? "low recall" : "candidate";
}
561
-
562
/**
 * Counts matches per pattern id.
 *
 * The tally is built in a `Map` and converted with `Object.fromEntries`, so
 * every id ends up as an own property holding a real count. Incrementing a
 * plain `{}` directly would pick up inherited members for ids such as
 * "constructor" and produce a non-numeric value.
 *
 * @param matches Matches reported by one run.
 * @returns Pattern id → number of matches, in first-seen order.
 */
function countMatches(matches: readonly SearchMatch[]): Record<string, number> {
  const tally = new Map<string, number>();
  for (const match of matches) {
    const id = match.patternId as string;
    tally.set(id, (tally.get(id) ?? 0) + 1);
  }
  return Object.fromEntries(tally);
}
567
-
568
/**
 * Counts the matches whose `proof` text contains "counter_reason"
 * (case-insensitive).
 */
function countCounterReasonProofs(matches: readonly SearchMatch[]): number {
  let total = 0;
  for (const match of matches) {
    if (/counter_reason/i.test(match.proof)) total += 1;
  }
  return total;
}
571
-
572
/**
 * Creates a fresh, mutable per-profile replay tally: the run count, one counter
 * per replay outcome (keys equal the outcome names), and match, model-call and
 * target totals.
 */
function emptyReplayOutcomeCounts() {
  const zeroed = {
    runs: 0,
    no_candidates: 0,
    ran_no_matches: 0,
    ran_with_matches: 0,
    skipped_input_too_large: 0,
    error: 0,
    matches: 0,
    modelCalls: 0,
    targets: 0,
  };
  return zeroed;
}
585
-
586
/**
 * Writes one run to `runsDir` as `<id>.json` (raw record) and `<id>.md`
 * (human-readable report). The id is sanitised with `safeSegment` first.
 *
 * @param runsDir Target directory; expected to exist.
 * @param id Unsanitised base name for the two files.
 * @param run Run to persist.
 * @param description Text shown on the report's `Description:` line.
 */
async function writeRunFiles(
  runsDir: string,
  id: string,
  run: SearchBenchRun,
  description: string,
): Promise<void> {
  const basePath = path.join(runsDir, safeSegment(id));
  await writeFile(`${basePath}.json`, JSON.stringify(run, null, 2));
  await writeFile(`${basePath}.md`, renderRunMarkdown(run, description));
}
596
-
597
/**
 * Renders the Markdown report of a single run: a header with the run metrics,
 * then "Matches", "Expected" and "Targets" sections (each showing "(none)" when
 * empty), and an "Error" section when the run carries an error message.
 *
 * @param run Run to render.
 * @param description Text shown on the `Description:` line.
 */
function renderRunMarkdown(run: SearchBenchRun, description: string): string {
  const expectations = run.expected ?? [];

  const matchesSection = run.matches.length === 0
    ? "(none)"
    : run.matches
      .map((match, index) =>
        [
          `${index + 1}. ${match.patternId} (${match.targetId})`,
          `reason: ${match.reason}`,
          `proof: ${match.proof}`,
        ].join("\n"))
      .join("\n");

  const expectedSection = expectations.length === 0
    ? "(none)"
    : expectations
      .map((item) => `- ${item.patternId}: ${item.shouldMatch ? "match" : "no match"}`)
      .join("\n");

  // Each target line is trimmed so missing entity/source kinds leave no trailing blanks.
  const targetsSection = run.targetsPreview.length === 0
    ? "(none)"
    : run.targetsPreview
      .map((target) => `- ${target.targetId}: ${target.patternId} ${target.entityKind ?? ""} ${target.sourceKind ?? ""}`.trim())
      .join("\n");

  const skippedText = run.skipped ? `${run.skipReason ?? "yes"}` : "no";
  const errorSection = run.error ? `## Error\n${run.error}\n` : "";

  return [
    `# ${run.fixtureId ?? run.smokeId}`,
    "",
    `Profile: ${run.profileId}`,
    `Description: ${description}`,
    `Runtime: ${run.elapsedMs}ms`,
    `Targets: ${run.targets}`,
    `Model calls: ${run.modelCalls}`,
    `Input tokens: ${run.inputTokens}`,
    `Counter-reason proofs: ${run.matchesUsingCounterReasonAsProof}`,
    `Skipped: ${skippedText}`,
    `Score: ${run.score ?? "n/a"}`,
    "",
    "## Matches",
    matchesSection,
    "",
    "## Expected",
    expectedSection,
    "",
    "## Targets",
    targetsSection,
    "",
    errorSection,
  ].join("\n");
}
623
-
624
/**
 * Renders the Markdown leaderboard: a ranked table with one row per profile,
 * followed by the per-check summary table.
 *
 * @param rows Profile rows, already in ranking order.
 * @param perCheck Per-check rows.
 */
function renderLeaderboard(rows: readonly ProfileResult[], perCheck: readonly CheckResult[]): string {
  // Cells are stringified exactly like template interpolation before being joined.
  const tableRow = (cells: readonly unknown[]): string => `| ${cells.map((cell) => `${cell}`).join(" | ")} |`;

  const profileLines = rows.map((row, index) =>
    tableRow([
      index + 1,
      row.profileId,
      row.fixtureScore,
      row.falsePositives,
      row.wrongPatterns,
      row.assignedCheckFalsePositives,
      row.falseNegatives,
      row.truePositives,
      row.matchesUsingCounterReasonAsProof,
      row.avgMs,
      row.smokeMatches,
      row.smokeSkipped,
      row.decision,
    ]));

  const checkLines = perCheck.map((row) =>
    tableRow([
      row.checkId,
      row.truePositives,
      row.falsePositives,
      row.wrongPatterns,
      row.assignedCheckFalsePositives,
      row.falseNegatives,
      row.decision,
    ]));

  return [
    "# Search Bench Leaderboard",
    "",
    "| rank | profile | fixture score | FP | wrong FP | assigned FP | FN | TP | counter-proof | avg ms | smoke matches | smoke skipped | decision |",
    "|---|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---|",
    profileLines.join("\n"),
    "",
    "## Per-Check Summary",
    "",
    "| check | TP | FP | wrong FP | assigned FP | FN | decision |",
    "|---|---:|---:|---:|---:|---:|---|",
    checkLines.join("\n"),
    "",
  ].join("\n");
}
644
-
645
/**
 * Renders the replay report: the per-profile outcome summary followed by a
 * table with one row per replay run. The last column ("manual") is left empty.
 *
 * @param runs All replay runs of the bench.
 */
function renderReplayMarkdown(runs: readonly SearchBenchReplayRun[]): string {
  const runLines = runs.map((run) => {
    // Patterns with a single match are shown bare, others as `pattern(count)`.
    const labels: string[] = [];
    for (const [pattern, count] of Object.entries(run.matchesByPattern)) {
      if (count > 0) labels.push(count === 1 ? pattern : `${pattern}(${count})`);
    }
    const patterns = labels.join(", ") || "-";
    return `| ${run.profileId} | ${run.commitId} | ${run.changedFiles} | +${run.addedLines}/-${run.deletedLines} | ${run.outcome} | ${run.elapsedMs} | ${run.targets} | ${run.inputTokens} | ${run.matches.length} | ${patterns} | |`;
  });
  const outcomeSummary = renderReplayOutcomeSummary(runs);

  return [
    "# Real Staged Replay",
    "",
    outcomeSummary,
    "",
    "| profile | commit | files | +/- | outcome | ms | targets | input tokens | matches | patterns | manual |",
    "|---|---|---:|---:|---|---:|---:|---:|---:|---|---|",
    runLines.join("\n"),
    "",
  ].join("\n");
}
663
-
664
/**
 * Renders the manual-review worksheet for replay matches: one section per
 * match with its pattern, target, reason and proof, plus empty "Manual label"
 * and "Notes" fields. A short notice is returned when no run has a match.
 *
 * @param runs All replay runs of the bench.
 */
function renderReplayReviewMarkdown(runs: readonly SearchBenchReplayRun[]): string {
  const sections: string[] = [];
  for (const run of runs) {
    for (const match of run.matches) {
      sections.push(
        [
          `## ${run.profileId} / ${run.commitId}`,
          "",
          `Pattern: ${match.patternId}`,
          `Target: ${match.targetId}`,
          `Reason: ${match.reason}`,
          `Proof: ${match.proof}`,
          "Manual label: [good / maybe / bad]",
          "Notes:",
          "",
        ].join("\n"),
      );
    }
  }

  if (sections.length === 0) return "# Real Replay Review\n\nNo real replay matches.\n";
  return `# Real Replay Review\n\n${sections.join("\n")}`;
}
679
-
680
/**
 * Renders the "Outcome Summary" table: per profile, the number of replay runs,
 * the count of each outcome, and the match, model-call and target totals.
 * Profiles appear in the order of their first run.
 *
 * @param runs All replay runs of the bench.
 */
function renderReplayOutcomeSummary(runs: readonly SearchBenchReplayRun[]): string {
  const perProfile = new Map<string, ReturnType<typeof emptyReplayOutcomeCounts>>();
  for (const run of runs) {
    let tally = perProfile.get(run.profileId);
    if (!tally) {
      tally = emptyReplayOutcomeCounts();
      perProfile.set(run.profileId, tally);
    }
    tally.runs += 1;
    // Outcome names double as counter keys of the tally.
    tally[run.outcome] += 1;
    tally.matches += run.matches.length;
    tally.modelCalls += run.modelCalls;
    tally.targets += run.targets;
  }

  const lines: string[] = [];
  for (const [profile, counts] of perProfile) {
    lines.push(
      `| ${profile} | ${counts.runs} | ${counts.no_candidates} | ${counts.ran_no_matches} | ${counts.ran_with_matches} | ${counts.skipped_input_too_large} | ${counts.error} | ${counts.matches} | ${counts.modelCalls} | ${counts.targets} |`,
    );
  }

  return [
    "## Outcome Summary",
    "",
    "| profile | runs | no candidates | ran no matches | ran with matches | input too large | errors | matches | model calls | targets |",
    "|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|",
    lines.join("\n"),
  ].join("\n");
}
700
-
701
/**
 * Resolves every configured profile path with `resolvePath`, keeping the input order.
 *
 * @param profilePaths Profile paths as written in the bench config.
 * @param configDir Directory of the bench config file.
 */
async function resolveProfilePaths(profilePaths: readonly string[], configDir: string): Promise<readonly string[]> {
  const pending = profilePaths.map((candidate) => resolvePath(candidate, configDir));
  return Promise.all(pending);
}
704
-
705
/**
 * Expands a path pattern containing at most one `*` in its final segment.
 *
 * A pattern without `*` is returned as a single resolved path. Otherwise the
 * directory in front of the wildcard is listed, and entries that start with
 * the text before `*` and end with the text after it are returned, sorted.
 * Only the first `*` is treated as a wildcard; this is not a general glob.
 *
 * Note: the pattern is resolved with `mustExist = false`, so `resolvePath`
 * always resolves it against the current working directory.
 *
 * @param pattern Path or single-wildcard pattern.
 * @param configDir Directory of the bench config file (forwarded to `resolvePath`).
 * @throws If the directory in front of the wildcard cannot be read.
 */
async function resolveGlob(pattern: string, configDir: string): Promise<readonly string[]> {
  const resolved = await resolvePath(pattern, configDir, false);
  const starIndex = resolved.indexOf("*");
  if (starIndex === -1) return [resolved];

  const before = resolved.slice(0, starIndex);
  const after = resolved.slice(starIndex + 1);
  const wildcardStartsSegment = before.endsWith(path.sep);
  // path.resolve drops the trailing separator but keeps a bare filesystem root
  // intact, where slicing the last character off would leave an empty string.
  const dir = wildcardStartsSegment ? path.resolve(before) : path.dirname(before);
  const prefix = wildcardStartsSegment ? "" : path.basename(before);
  // Prefix and suffix must not overlap inside the entry name: "ab*ba" must not
  // accept "aba".
  const minLength = prefix.length + after.length;

  const entries = await readdir(dir);
  return entries
    .filter((entry) => entry.length >= minLength && entry.startsWith(prefix) && entry.endsWith(after))
    .map((entry) => path.join(dir, entry))
    .sort();
}
718
-
719
/**
 * Resolves a configured path to an absolute one.
 *
 * A leading `~/` is replaced by `$HOME`. With `mustExist` (the default) the
 * path is tried relative to the current working directory first and relative
 * to `configDir` second; if neither exists, the cwd-relative path is returned.
 * With `mustExist = false` the cwd-relative path is returned without any
 * existence check.
 *
 * @param input Path as written in the config.
 * @param configDir Directory of the bench config file.
 * @param mustExist Whether to probe the file system for the candidates.
 */
async function resolvePath(input: string, configDir: string, mustExist = true): Promise<string> {
  let expanded = input;
  if (input.startsWith("~/")) {
    expanded = path.join(process.env.HOME ?? "", input.slice(2));
  }
  const fromCwd = path.resolve(expanded);
  const fromConfig = path.resolve(configDir, expanded);

  if (!mustExist) return fromCwd;
  if (await exists(fromCwd)) return fromCwd;
  return (await exists(fromConfig)) ? fromConfig : fromCwd;
}
727
-
728
/**
 * Loads a search profile from a JSON file.
 *
 * @param filePath Path of the profile file.
 * @returns The parsed profile together with the path it was read from.
 * @throws Error when the parsed profile has no `id`; read and JSON parse errors propagate.
 */
async function readProfile(filePath: string): Promise<Readonly<{ filePath: string; profile: SearchProfile }>> {
  const text = await readFile(filePath, "utf8");
  const profile = JSON.parse(text) as SearchProfile;
  if (profile.id) {
    return { filePath, profile };
  }
  throw new Error(`Search profile missing id: ${filePath}`);
}
733
-
734
/**
 * Loads a search fixture from a JSON file.
 *
 * @param filePath Path of the fixture file.
 * @returns The parsed fixture together with the path it was read from.
 * @throws Error when the parsed fixture has no `id`; read and JSON parse errors propagate.
 */
async function readFixture(filePath: string): Promise<Readonly<{ filePath: string; fixture: SearchFixture }>> {
  const text = await readFile(filePath, "utf8");
  const fixture = JSON.parse(text) as SearchFixture;
  if (fixture.id) {
    return { filePath, fixture };
  }
  throw new Error(`Search fixture missing id: ${filePath}`);
}
739
-
740
/**
 * Reports whether `filePath` can be read as a file.
 *
 * The check is a full `readFile`, so anything that cannot be read that way
 * yields `false`.
 */
async function exists(filePath: string): Promise<boolean> {
  return readFile(filePath).then(
    () => true,
    () => false,
  );
}
748
-
749
/**
 * Resolves the working directory of a smoke run.
 *
 * - no `cwd` → the current working directory
 * - `$BEVYL_REPO` or `$BEVYL_REPO/<sub>` → taken from the `BEVYL_REPO`
 *   environment variable, `null` when it is unset
 * - `~/<rest>` → `<rest>` below `$HOME`
 * - anything else → returned unchanged
 */
function resolveSmokeCwd(cwd: string | undefined): string | null {
  if (!cwd) return process.cwd();

  const envToken = "$BEVYL_REPO";
  if (cwd === envToken) {
    return process.env.BEVYL_REPO ?? null;
  }

  const envPrefix = `${envToken}/`;
  if (cwd.startsWith(envPrefix)) {
    const repoRoot = process.env.BEVYL_REPO;
    if (!repoRoot) return null;
    return path.join(repoRoot, cwd.slice(envPrefix.length));
  }

  if (cwd.startsWith("~/")) {
    return path.join(process.env.HOME ?? "", cwd.slice(2));
  }
  return cwd;
}
758
-
759
/**
 * Resolves the repository directory of a commit replay.
 *
 * An explicit `cwd` wins. Otherwise the environment variable named by
 * `repoEnv` is used, giving `null` when it is unset or empty. Without either
 * setting the current working directory is used. `~/` prefixes are expanded.
 */
function resolveReplayCwd(replay: SearchBenchCommitReplay): string | null {
  if (replay.cwd) {
    return expandPath(replay.cwd);
  }
  if (!replay.repoEnv) {
    return process.cwd();
  }
  const fromEnv = process.env[replay.repoEnv];
  if (!fromEnv) return null;
  return expandPath(fromEnv);
}
767
-
768
/**
 * Replaces a leading `~/` with `$HOME` (an empty string when `HOME` is unset).
 * Every other input is returned unchanged.
 */
function expandPath(input: string): string {
  if (!input.startsWith("~/")) {
    return input;
  }
  const home = process.env.HOME ?? "";
  return path.join(home, input.slice(2));
}
771
-
772
/**
 * Turns arbitrary text into a file-name-safe segment.
 *
 * Each run of characters outside `A-Z a-z 0-9 . _ -` becomes a single `_`,
 * leading and trailing underscores are stripped, and `"run"` is returned when
 * nothing is left.
 */
function safeSegment(value: string): string {
  const collapsed = value.replace(/[^A-Za-z0-9._-]+/g, "_");
  const stripped = collapsed.replace(/^_+|_+$/g, "");
  return stripped || "run";
}
775
-
776
/** Rounds `value` to three decimal places. */
function round(value: number): number {
  const scale = 1000;
  return Math.round(value * scale) / scale;
}
779
-
780
/**
 * Converts one numstat column to a number; anything that does not convert to a
 * finite number (including `undefined` and "-") yields 0.
 */
function numericStat(value: string | undefined): number {
  const asNumber = Number(value);
  if (Number.isFinite(asNumber)) {
    return asNumber;
  }
  return 0;
}