@dreki-gg/pi-code-reviewer 0.3.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -28,7 +28,16 @@ export class ExecError extends Data.TaggedError('ExecError')<{
28
28
  }
29
29
  }
30
30
 
31
- export type CodeReviewerError = FileReadError | ExecError;
31
/**
 * Tagged error for a failed model call.
 *
 * `stage` names the step that was running; `cause` holds whatever was raised.
 */
export class ModelError extends Data.TaggedError('ModelError')<{
  readonly stage: string;
  readonly cause: unknown;
}> {
  /** One-line summary naming the stage and the rendered cause. */
  get message(): string {
    const reason = causeMessage(this.cause);
    return `Model call failed during ${this.stage}: ${reason}`;
  }
}
39
+
40
/** Union of the reviewer's tagged errors: file-read, exec, and model failures. */
export type CodeReviewerError =
  | FileReadError
  | ExecError
  | ModelError;
32
41
 
33
42
  // ── Helpers ───────────────────────────────────────────────────────────────
34
43
 
@@ -0,0 +1,84 @@
1
+ /**
2
+ * Resolve the per-step model plan for a review run.
3
+ *
4
+ * The pipeline can run each step on a different model AND reasoning level so you
5
+ * can A/B which models / efforts review best / fastest / cheapest. Config
6
+ * supplies steps as a spec string ("provider/id", a bare id, or a display name)
7
+ * or `{ model, reasoning }`. This module turns them into a {@link ModelResolution}
8
+ * (key → real model) plus a {@link ModelPlan} (which model + reasoning each pass
9
+ * and the validator use). Unresolvable specs degrade to the session model with a
10
+ * warning, so a typo never fails the review.
11
+ */
12
+
13
+ import type { Api, Model } from '@earendil-works/pi-ai';
14
+
15
+ import { DEFAULT_MODEL_KEY, type ModelResolution, resolveModelSpec } from './effects/model';
16
+ import type { ModelAssignment, ModelPlan, ModelStepConfig, ReviewPipelineConfig } from './types';
17
+
18
/** Everything `resolveModelPlan` hands back to the caller. */
export type ResolvedModelPlan = {
  /** Session model plus the key → model map for every spec that resolved. */
  resolution: ModelResolution;
  /** Model assignment for each pass and for the validator. */
  plan: ModelPlan;
  /** Human-readable notes about specs that could not be resolved. */
  warnings: string[];
};
23
+
24
/**
 * Normalize a step config into its model spec and optional reasoning level.
 *
 * A missing step yields an empty object, a string step is the spec itself, and
 * an object step contributes its `model` and `reasoning` fields.
 */
function stepParts(step: ModelStepConfig | undefined): {
  spec?: string;
  reasoning?: ModelAssignment['reasoning'];
} {
  switch (typeof step) {
    case 'undefined':
      return {};
    case 'string':
      return { spec: step };
    default:
      return { spec: step.model, reasoning: step.reasoning };
  }
}
32
+
33
/** Display label for a step: the key alone, or `key (reasoning)` when a reasoning level is set. */
function labelFor(key: string, reasoning?: ModelAssignment['reasoning']): string {
  if (!reasoning) return key;
  return `${key} (${reasoning})`;
}
36
+
37
/**
 * Build the per-step model plan for a review run.
 *
 * Each pass takes its step from `review.passModels` (rotated round-robin) when
 * that list is non-empty, otherwise from `review.passModel`; the validator uses
 * `review.validateModel`. A blank or missing spec, or one `resolveModelSpec`
 * cannot find, is assigned `DEFAULT_MODEL_KEY` (the session model).
 *
 * @param review - Pipeline config supplying `passes`, `passModels`, `passModel` and `validateModel`.
 * @param defaultModel - Session model, returned unchanged as `resolution.defaultModel`.
 * @param registry - Model source handed to `resolveModelSpec` for lookups.
 * @returns The key → model resolution, the plan, and one warning per distinct unresolvable spec.
 */
export function resolveModelPlan(
  review: ReviewPipelineConfig,
  defaultModel: Model<Api>,
  registry: { getAll: () => Model<Api>[] },
): ResolvedModelPlan {
  const byKey = new Map<string, Model<Api>>();
  // Specs that already failed to resolve: remembered so a typo shared by many
  // passes is looked up once and reported once instead of once per pass.
  const unresolved = new Set<string>();
  const warnings: string[] = [];

  const onSessionModel = (reasoning: ModelAssignment['reasoning']): ModelAssignment => ({
    key: DEFAULT_MODEL_KEY,
    label: labelFor(DEFAULT_MODEL_KEY, reasoning),
    reasoning,
  });

  // Resolve one step to an assignment; both hits and misses are cached so a
  // spec referenced by several passes only resolves once.
  const assign = (step: ModelStepConfig | undefined): ModelAssignment => {
    const { spec, reasoning } = stepParts(step);
    const key = spec?.trim();
    if (!key) return onSessionModel(reasoning);

    if (key !== DEFAULT_MODEL_KEY && !byKey.has(key)) {
      if (unresolved.has(key)) return onSessionModel(reasoning);
      const model = resolveModelSpec(registry, key);
      if (!model) {
        unresolved.add(key);
        warnings.push(`review model "${key}" not found — using the session model for those steps`);
        return onSessionModel(reasoning);
      }
      byKey.set(key, model);
    }
    return { key, label: labelFor(key, reasoning), reasoning };
  };

  // passModels (rotated round-robin) overrides passModel, which overrides the default.
  const passSteps =
    review.passModels && review.passModels.length > 0 ? review.passModels : [review.passModel];

  const passes: ModelAssignment[] = [];
  for (let index = 0; index < review.passes; index += 1) {
    passes.push(assign(passSteps[index % passSteps.length]));
  }
  const validator = assign(review.validateModel);

  return { resolution: { defaultModel, byKey }, plan: { passes, validator }, warnings };
}
76
+
77
/**
 * Fallback plan that puts every pass and the validator on the session model
 * (`DEFAULT_MODEL_KEY`). Used as a fallback and in tests.
 *
 * @param passes - Number of pass assignments to generate.
 * @returns A plan whose assignments are all distinct objects with the default key and label.
 */
export function defaultModelPlan(passes: number): ModelPlan {
  const sessionStep = (): ModelAssignment => ({
    key: DEFAULT_MODEL_KEY,
    label: DEFAULT_MODEL_KEY,
  });
  const perPass = Array.from({ length: passes }, () => sessionStep());
  return { passes: perPass, validator: sessionStep() };
}
@@ -0,0 +1,571 @@
1
+ /**
2
+ * Self-driving review pipeline (Bugbot-style).
3
+ *
4
+ * Instead of returning a prompt for a single downstream pass, the tool can run
5
+ * the review itself: fan out several ADVERSARIAL bug-finding passes over the
6
+ * session's model, bucket near-duplicate findings, keep the ones multiple
7
+ * passes independently surface (majority vote), then run a VALIDATOR pass that
8
+ * tries to falsify each survivor. This mirrors the structure Cursor describes
9
+ * for Bugbot (parallel passes with varied reasoning → bucket → vote → validate)
10
+ * and exists specifically to catch the class of bug a single checklist pass
11
+ * misses — e.g. `typeof NaN === 'number'` slipping a boundary guard.
12
+ *
13
+ * Everything here is pure over the {@link Reviewer} service so it is unit
14
+ * testable with a deterministic fake model.
15
+ */
16
+
17
+ import { Effect } from 'effect';
18
+
19
+ import { type ModelResolution, Reviewer, makeReviewerService } from './effects/model';
20
+ import type {
21
+ CandidateFinding,
22
+ LensSeverity,
23
+ ModelPlan,
24
+ PipelineResult,
25
+ PipelineTelemetry,
26
+ RawFinding,
27
+ ReviewPipelineConfig,
28
+ ValidatedFinding,
29
+ } from './types';
30
+
31
/** Ordering weight per severity; a larger number means more severe. */
const SEVERITY_RANK: Record<LensSeverity, number> = {
  blocker: 3,
  warning: 2,
  note: 1,
};

/** Severity strings accepted from model output. */
const VALID_SEVERITIES = new Set<LensSeverity>(['blocker', 'warning', 'note']);
33
+
34
+ /** Adversarial system prompt shared by every bug-finding pass. Intentionally
35
+ * aggressive: Bugbot found that cautious prompts under-report, and a separate
36
+ * validator stage is cheaper than missed bugs. */
37
+ const PASS_SYSTEM_PROMPT = [
38
+ 'You are an aggressive, adversarial code reviewer hunting for REAL bugs in a diff —',
39
+ 'logic errors, data loss/corruption, security holes, and correctness defects. Not style, not nits.',
40
+ '',
41
+ 'Method — be suspicious of EVERY changed line:',
42
+ '- For each changed function, enumerate adversarial inputs and ask what breaks:',
43
+ ' null / undefined / NaN / Infinity / -0 / "" / [] / {} / huge / negative / duplicate / out-of-order / unicode.',
44
+ '- Audit every comment and test claim against the ACTUAL code. If a comment says "non-numeric falls through",',
45
+ ' construct the exact input that proves it does NOT (e.g. `typeof NaN === "number"` defeats a typeof guard).',
46
+ '- Hunt specifically for: off-by-one; wrong id/key space; missing await / unhandled rejection; swallowed errors;',
47
+ ' race conditions & cancellation; type-narrowing escapes & unsafe casts; boundary/edge conditions;',
48
+ ' lost writes (in-memory/UI updated but never durably persisted); injection / path-traversal / zip-slip;',
49
+ ' resource leaks & unbounded loops; contract drift between a producer and its consumer.',
50
+ '- Use the provided lens definitions as project-specific invariants to check.',
51
+ '',
52
+ 'Prefer flagging a suspicious pattern over staying silent — a validator filters false positives later.',
53
+ 'Report a bug at the precise file and line it occurs.',
54
+ '',
55
+ 'Output ONLY a JSON array, no prose, no markdown fences:',
56
+ '[{ "file": "path", "line": 42, "severity": "blocker|warning|note", "category": "short-tag", "message": "what + why + the triggering input" }]',
57
+ ].join('\n');
58
+
59
+ /** Per-pass focus seeds. Giving each pass a different lens of suspicion (and a
60
+ * temperature jitter) diversifies reasoning the way Bugbot's randomized diff
61
+ * ordering does, so passes don't collapse onto identical findings. */
62
+ export const PASS_FOCUSES = [
63
+ 'TRUST BOUNDARIES & INPUT VALIDATION: every value crossing an external/disk/wire/user boundary; every decode/parse; the edge inputs (null/undefined/NaN/Infinity/-0/empty/huge/negative). Numeric-type guards that `NaN`/`Infinity` defeat.',
64
+ 'CONTROL FLOW & BRANCHES: every new conditional, guard, early return, and switch — find the missed case, the inverted condition, and the off-by-one.',
65
+ 'ASYNC LIFECYCLE & CONCURRENCY: await ordering, missing await, unhandled rejection, fire-and-forget, races, cancellation/abort handling, stale writes after unmount/navigation.',
66
+ 'TYPES & INVARIANTS: type-narrowing escapes, unsafe casts, non-null assertions on absent values, non-exhaustive unions, and any comment/test claim that the code does not actually honor.',
67
+ 'STATE & DATA INTEGRITY: in-memory or UI mutation with no matching durable write (lost on reload), wrong id/key space in a lookup, a projection clobbering the source of truth, read-before-write ordering.',
68
+ 'ERROR HANDLING & SECURITY: swallowed/empty catches, leaked secrets, injection, path traversal / zip-slip, trusting unsanitized external data, missing validation before a side effect.',
69
+ 'RESOURCE & PERFORMANCE: unbounded loops/polls, N+1 IO, memory leaks, missing cleanup of timers/listeners/streams.',
70
+ 'CONTRACT & COMPATIBILITY: signature/shape drift, breaking changes to a wire/format contract, mismatched assumptions between a producer and its consumer, version negotiation gaps.',
71
+ ] as const;
72
+
73
+ const VALIDATOR_SYSTEM_PROMPT = [
74
+ 'You are a STRICT bug validator. You receive a diff and a numbered list of candidate findings from several reviewers.',
75
+ 'For EACH candidate decide: is it a REAL bug actually present in / introduced by this diff, or a FALSE POSITIVE?',
76
+ '',
77
+ 'Rules:',
78
+ '- To mark "real" you must be able to name the concrete input or execution path that triggers it, grounded in the shown code.',
79
+ '- Mark "false-positive" for speculation unsupported by the diff, style nitpicks, behavior already handled by the shown code, or duplicates of another candidate.',
80
+ '- Be conservative: if you cannot substantiate a candidate from the diff, it is a false positive.',
81
+ '- Keep justification to one or two sentences naming the trigger (for real) or the reason it cannot occur (for false-positive).',
82
+ '',
83
+ 'Output ONLY a JSON array, no prose, no fences:',
84
+ '[{ "id": 0, "verdict": "real|false-positive", "confidence": 0.0, "justification": "..." }]',
85
+ ].join('\n');
86
+
87
/** Filler words that `tokenize` discards before finding messages are compared. */
const STOPWORDS = new Set([
  'the', 'and', 'for', 'with', 'that', 'this', 'when', 'from', 'into', 'will',
  'would', 'could', 'should', 'have', 'has', 'not', 'but', 'are', 'was', 'were',
  'its', 'his', 'her', 'than', 'then', 'which', 'what', 'where', 'while', 'use',
  'used', 'using', 'can', 'may', 'might', 'a', 'an', 'is', 'of', 'to',
  'in', 'on', 'it', 'be', 'as', 'at', 'or', 'if', 'so',
]);
138
+
139
/**
 * Reduce a finding message to its set of comparison tokens.
 *
 * The message is lower-cased and cut at every run of characters outside
 * `a-z0-9`; words of two characters or fewer and stopwords are discarded.
 */
function tokenize(message: string): Set<string> {
  const kept = new Set<string>();
  for (const word of message.toLowerCase().split(/[^a-z0-9]+/)) {
    if (word.length <= 2) continue;
    if (STOPWORDS.has(word)) continue;
    kept.add(word);
  }
  return kept;
}
148
+
149
/**
 * Jaccard similarity of two token sets: |left ∩ right| / |left ∪ right|.
 *
 * Two empty sets count as identical (1).
 */
function jaccard(left: Set<string>, right: Set<string>): number {
  if (left.size === 0 && right.size === 0) return 1;
  let shared = 0;
  left.forEach((token) => {
    if (right.has(token)) shared += 1;
  });
  // At least one set is non-empty here, so the union size is always positive.
  return shared / (left.size + right.size - shared);
}
156
+
157
/**
 * Decide whether a candidate finding describes the same bug as a bucket.
 *
 * Findings in different files never match. When both sides carry a line
 * number they must sit within 3 lines of each other, and a modest token
 * overlap (>= 0.25) is enough, because co-location is already a strong signal.
 * When either side has no line, a clearer textual match (>= 0.5) is required.
 */
function sameBug(
  candidate: { file: string; line?: number; tokens: Set<string> },
  bucket: { file: string; line?: number; tokens: Set<string> },
): boolean {
  if (candidate.file !== bucket.file) return false;

  const overlap = jaccard(candidate.tokens, bucket.tokens);
  const here = candidate.line;
  const there = bucket.line;

  // No line on one side: nothing to anchor on, so demand a clearer textual match.
  if (here === undefined || there === undefined) return overlap >= 0.5;

  if (Math.abs(here - there) > 3) return false;
  return overlap >= 0.25;
}
176
+
177
/** Mutable accumulator for one group of near-duplicate findings during bucketing. */
type WorkingBucket = {
  file: string;
  /** Line of the bucket; undefined until a finding with a line joins. */
  line?: number;
  /** Union of the tokens of every finding merged so far. */
  tokens: Set<string>;
  severities: LensSeverity[];
  messages: string[];
  categories: (string | undefined)[];
  /** Indices of the distinct passes that contributed to this bucket. */
  passIndices: Set<number>;
};
186
+
187
/**
 * Find the first `[` … `]` span in `haystack` whose brackets balance, ignoring
 * brackets inside double-quoted strings. If an opening bracket never closes,
 * the scan restarts from the next `[` instead of giving up.
 */
function firstBalancedArray(haystack: string): string | null {
  let start = haystack.indexOf('[');
  while (start !== -1) {
    let depth = 0;
    let inString = false;
    let escaped = false;
    for (let index = start; index < haystack.length; index += 1) {
      const char = haystack[index];
      if (inString) {
        // Inside a string only the closing quote matters; honor backslash escapes.
        if (escaped) escaped = false;
        else if (char === '\\') escaped = true;
        else if (char === '"') inString = false;
        continue;
      }
      if (char === '"') inString = true;
      else if (char === '[') depth += 1;
      else if (char === ']') {
        depth -= 1;
        if (depth === 0) return haystack.slice(start, index + 1);
      }
    }
    start = haystack.indexOf('[', start + 1);
  }
  return null;
}

/**
 * Extract the first balanced top-level JSON array from arbitrary model text,
 * tolerating prose or ```json fences around it.
 *
 * The contents of the first fenced block are searched first. If that block
 * holds no balanced array (for example it is a code snippet), the whole text
 * is searched instead, so a raw array next to an unrelated fence is still found.
 *
 * @param text - Raw model output.
 * @returns The array's source text, or `null` when no balanced array exists.
 */
export function extractJsonArray(text: string): string | null {
  const fenced = text.match(/```(?:json)?\s*([\s\S]*?)```/i);
  if (fenced) {
    const inside = firstBalancedArray(fenced[1] ?? '');
    if (inside !== null) return inside;
  }
  return firstBalancedArray(text);
}
215
+
216
/**
 * Map a model-supplied severity onto a known `LensSeverity`.
 *
 * Matching ignores surrounding whitespace and letter case, so "Blocker" or
 * " WARNING " is accepted rather than causing the finding to be discarded.
 *
 * @param value - Untrusted value taken from parsed model output.
 * @returns The canonical severity, or `null` when the value is not a recognized severity string.
 */
function coerceSeverity(value: unknown): LensSeverity | null {
  if (typeof value !== 'string') return null;
  const normalized = value.trim().toLowerCase();
  return VALID_SEVERITIES.has(normalized as LensSeverity) ? (normalized as LensSeverity) : null;
}
221
+
222
/**
 * Turn one pass's raw model text into a list of `RawFinding`s.
 *
 * Output that has no extractable array, fails to parse, or is not an array
 * yields `[]`. Entries lacking a valid severity, a non-empty file, or a
 * non-empty message are dropped. `line` survives only when it is a positive
 * integer; `category` only when it is a string.
 */
export function parseFindings(text: string): RawFinding[] {
  const json = extractJsonArray(text);
  if (!json) return [];

  let parsed: unknown;
  try {
    parsed = JSON.parse(json);
  } catch {
    return [];
  }
  if (!Array.isArray(parsed)) return [];

  const asTrimmedText = (value: unknown): string =>
    typeof value === 'string' ? value.trim() : '';

  return parsed.flatMap((entry: unknown): RawFinding[] => {
    if (typeof entry !== 'object' || entry === null) return [];
    const record = entry as Record<string, unknown>;

    const severity = coerceSeverity(record.severity);
    const file = asTrimmedText(record.file);
    const message = asTrimmedText(record.message);
    if (!severity || !file || !message) return [];

    const rawLine = record.line;
    const line =
      typeof rawLine === 'number' && Number.isInteger(rawLine) && rawLine > 0
        ? rawLine
        : undefined;
    const category = typeof record.category === 'string' ? record.category.trim() : undefined;
    return [{ file, line, severity, message, category }];
  });
}
251
+
252
/**
 * Group near-duplicate findings from all passes into candidate buckets.
 *
 * Findings are visited pass by pass; each joins the first existing bucket that
 * `sameBug` accepts, otherwise it opens a new bucket. Every bucket records
 * which passes contributed, so votes count distinct passes.
 */
export function bucketFindings(perPass: RawFinding[][]): CandidateFinding[] {
  const buckets: WorkingBucket[] = [];

  perPass.forEach((findings, passIndex) => {
    for (const finding of findings) {
      const tokens = tokenize(finding.message);
      const probe = { ...finding, tokens };
      const home = buckets.find((bucket) => sameBug(probe, bucket));

      if (home === undefined) {
        buckets.push({
          file: finding.file,
          line: finding.line,
          tokens,
          severities: [finding.severity],
          messages: [finding.message],
          categories: [finding.category],
          passIndices: new Set([passIndex]),
        });
        continue;
      }

      home.severities.push(finding.severity);
      home.messages.push(finding.message);
      home.categories.push(finding.category);
      home.passIndices.add(passIndex);
      // A bucket opened without a line adopts the first defined line it sees.
      if (home.line === undefined && finding.line !== undefined) home.line = finding.line;
      tokens.forEach((token) => home.tokens.add(token));
    }
  });

  return buckets.map(mergeBucket);
}
284
+
285
/**
 * Collapse a bucket into a single representative finding.
 *
 * The highest severity and the longest message win (the earliest wins on a
 * tie), the category is the most frequent non-empty one, and `votes` is the
 * number of distinct contributing passes.
 */
function mergeBucket(bucket: WorkingBucket): CandidateFinding {
  const moreSevere = (best: LensSeverity, current: LensSeverity): LensSeverity =>
    SEVERITY_RANK[current] > SEVERITY_RANK[best] ? current : best;
  const longer = (best: string, current: string): string =>
    current.length > best.length ? current : best;

  const severity = bucket.severities.reduce(moreSevere);
  const message = bucket.messages.reduce(longer);
  const named = bucket.categories.filter((value): value is string => Boolean(value));
  const passIndices = Array.from(bucket.passIndices).sort((a, b) => a - b);

  return {
    file: bucket.file,
    line: bucket.line,
    severity,
    message,
    category: mostCommon(named),
    votes: passIndices.length,
    passIndices,
  };
}
306
+
307
/**
 * Return the value that occurs most often in `values`.
 *
 * On a tie, the value that appeared first wins. An empty list yields `undefined`.
 */
function mostCommon(values: string[]): string | undefined {
  const tally = new Map<string, number>();
  for (const value of values) tally.set(value, (tally.get(value) ?? 0) + 1);

  let winner: string | undefined;
  let highest = 0;
  // Map iteration follows first-insertion order, so a strict `>` keeps the
  // earliest value among equally frequent ones.
  for (const [value, count] of tally) {
    if (count > highest) {
      winner = value;
      highest = count;
    }
  }
  return winner;
}
313
+
314
/**
 * Filter out low-signal buckets before validation.
 *
 * Only `note` findings are subject to the vote threshold: a note is kept when
 * its votes reach `config.minVotes`. Blockers and warnings are always kept, so
 * a high-severity finding raised by a single pass still reaches the validator.
 *
 * @returns The surviving candidates and how many were dropped.
 */
export function selectCandidates(
  candidates: CandidateFinding[],
  config: Pick<ReviewPipelineConfig, 'minVotes'>,
): { kept: CandidateFinding[]; droppedLowSignal: number } {
  const isLowSignalNote = (candidate: CandidateFinding): boolean =>
    candidate.severity === 'note' && !(candidate.votes >= config.minVotes);

  const kept = candidates.filter((candidate) => !isLowSignalNote(candidate));
  const droppedLowSignal = candidates.length - kept.length;
  return { kept, droppedLowSignal };
}
325
+
326
/** Comparator ordering findings by severity, then votes, then confidence — all descending. */
function severitySort(left: ValidatedFinding, right: ValidatedFinding): number {
  const severityGap = SEVERITY_RANK[right.severity] - SEVERITY_RANK[left.severity];
  if (severityGap !== 0) return severityGap;
  return right.votes !== left.votes
    ? right.votes - left.votes
    : right.confidence - left.confidence;
}
332
+
333
+ // ── Pipeline stages (Effect over the Reviewer service) ───────────────────────
334
+
335
/** Compose the user message for one pass: the base prompt, a separator, the pass focus, and the output reminder. */
function buildPassUser(basePrompt: string, focus: string): string {
  const focusLine = `PASS FOCUS (weight your attention here, but report any bug you see): ${focus}`;
  const reminder = 'Return ONLY the JSON array of findings described in your instructions.';
  return `${basePrompt}\n\n---\n${focusLine}\n\n${reminder}`;
}
345
+
346
/**
 * List the distinct model labels of the passes that contributed to a finding.
 *
 * Indices with no matching plan entry, or with an empty label, are skipped;
 * labels come back in first-seen order.
 */
function contributingModels(passIndices: number[], plan: ModelPlan): string[] {
  const labels = new Set<string>();
  passIndices.forEach((index) => {
    const label = plan.passes[index]?.label;
    if (label) labels.add(label);
  });
  return Array.from(labels);
}
350
+
351
/**
 * Run `config.passes` adversarial passes concurrently; a failed pass degrades
 * to an empty finding list and is counted in `failedPasses`.
 *
 * Each pass runs on the model and reasoning level named by
 * `plan.passes[passIndex]`. If the plan has fewer entries than
 * `config.passes`, assignments are reused round-robin; if the plan has no
 * pass entries at all, the pass is recorded as failed rather than throwing.
 *
 * @param basePrompt - Review prompt shared by every pass.
 * @param config - Supplies `passes`, `temperature` and `concurrency`.
 * @param plan - Per-pass model assignments.
 * @param signal - Optional abort signal forwarded to each model call.
 * @returns Findings per pass (indexed by pass) and the number of failed passes.
 */
export function runPassesEffect(
  basePrompt: string,
  config: ReviewPipelineConfig,
  plan: ModelPlan,
  signal?: AbortSignal,
): Effect.Effect<{ perPass: RawFinding[][]; failedPasses: number }, never, Reviewer> {
  return Effect.gen(function* () {
    const reviewer = yield* Reviewer;
    const indices = Array.from({ length: config.passes }, (_unused, index) => index);

    const outcomes = yield* Effect.forEach(
      indices,
      (passIndex) =>
        Effect.gen(function* () {
          // Guard against a plan shorter than config.passes: reuse entries
          // round-robin instead of dereferencing undefined.
          const assignment =
            plan.passes[passIndex] ?? plan.passes[passIndex % plan.passes.length];
          if (assignment === undefined) {
            return { findings: [] as RawFinding[], failed: true };
          }

          const focus = PASS_FOCUSES[passIndex % PASS_FOCUSES.length];
          // Deterministic per-pass jitter so reruns are stable but passes differ.
          const temperature = config.temperature + (passIndex % 4) * 0.1;
          const result = yield* reviewer
            .complete({
              modelKey: assignment.key,
              reasoning: assignment.reasoning,
              system: PASS_SYSTEM_PROMPT,
              user: buildPassUser(basePrompt, focus),
              temperature,
              stage: `pass-${passIndex + 1}`,
              signal,
            })
            .pipe(Effect.either);
          return result._tag === 'Right'
            ? { findings: parseFindings(result.right), failed: false }
            : { findings: [] as RawFinding[], failed: true };
        }),
      { concurrency: Math.max(1, config.concurrency) },
    );

    return {
      perPass: outcomes.map((outcome) => outcome.findings),
      failedPasses: outcomes.filter((outcome) => outcome.failed).length,
    };
  });
}
395
+
396
/**
 * Compose the validator's user message: the base prompt followed by a numbered
 * list of candidates. Each candidate's number is its index in `candidates`,
 * which is the id the validator echoes back in its verdicts.
 */
function buildValidatorUser(basePrompt: string, candidates: CandidateFinding[]): string {
  const describe = (candidate: CandidateFinding, index: number): string => {
    const location = candidate.line ? `${candidate.file}:${candidate.line}` : candidate.file;
    return `[${index}] (${candidate.severity}, ${candidate.votes} votes) ${location} — ${candidate.message}`;
  };
  const listing = candidates.map((candidate, index) => describe(candidate, index)).join('\n');
  const closing =
    'For each candidate id above, output the verdict JSON described in your instructions.';
  return `${basePrompt}\n\n---\nCANDIDATE FINDINGS TO VALIDATE:\n${listing}\n\n${closing}`;
}
413
+
414
/** One validator ruling, keyed by the candidate's index in the submitted list. */
type Verdict = {
  id: number;
  verdict: 'real' | 'false-positive';
  /** Validator confidence, clamped to the range 0–1 when parsed. */
  confidence: number;
  justification?: string;
};
420
+
421
/**
 * Parse the validator's raw text into verdicts keyed by candidate id.
 *
 * Only entries with an integer `id` and a recognizable verdict are recorded.
 * The verdict is matched case-insensitively, and spaces or underscores are
 * treated like the hyphen in "false-positive". An entry whose verdict is
 * missing or unrecognized is skipped instead of being read as a false
 * positive: `validateCandidatesEffect` keeps candidates that have no verdict,
 * so malformed validator output cannot silently discard a finding.
 *
 * `confidence` is clamped to 0–1 and defaults to 0.5 when absent or not finite.
 *
 * @param text - Raw validator output.
 * @returns Map from candidate id to its verdict; empty when nothing parses.
 */
function parseVerdicts(text: string): Map<number, Verdict> {
  const verdicts = new Map<number, Verdict>();
  const json = extractJsonArray(text);
  if (!json) return verdicts;

  let parsed: unknown;
  try {
    parsed = JSON.parse(json);
  } catch {
    return verdicts;
  }
  if (!Array.isArray(parsed)) return verdicts;

  for (const entry of parsed) {
    if (typeof entry !== 'object' || entry === null) continue;
    const record = entry as Record<string, unknown>;
    if (typeof record.id !== 'number' || !Number.isInteger(record.id)) continue;
    if (typeof record.verdict !== 'string') continue;

    const label = record.verdict.trim().toLowerCase().replace(/[\s_]+/g, '-');
    const verdict: Verdict['verdict'] | undefined =
      label === 'real' ? 'real' : label === 'false-positive' ? 'false-positive' : undefined;
    // Unknown verdict text: record nothing so the candidate is kept downstream.
    if (verdict === undefined) continue;

    const confidence =
      typeof record.confidence === 'number' && Number.isFinite(record.confidence)
        ? Math.min(1, Math.max(0, record.confidence))
        : 0.5;
    const justification =
      typeof record.justification === 'string' ? record.justification.trim() : undefined;
    verdicts.set(record.id, { id: record.id, verdict, confidence, justification });
  }
  return verdicts;
}
447
+
448
/**
 * Validate all candidates with a single batched model call.
 *
 * Candidates the validator marks `false-positive` are dropped and counted.
 * Everything else is returned as `real`, including candidates that received no
 * verdict. If the validator call itself fails, the stage fails OPEN: every
 * candidate is surfaced unvalidated with confidence 0.5, so a flaky model
 * never silently drops real bugs.
 *
 * @param basePrompt - Review prompt placed before the candidate list.
 * @param candidates - Findings to validate; an empty list skips the model call.
 * @param plan - Supplies the validator assignment and the labels used for attribution.
 * @param signal - Optional abort signal forwarded to the model call.
 */
export function validateCandidatesEffect(
  basePrompt: string,
  candidates: CandidateFinding[],
  plan: ModelPlan,
  signal?: AbortSignal,
): Effect.Effect<{ findings: ValidatedFinding[]; droppedFalsePositives: number }, never, Reviewer> {
  return Effect.gen(function* () {
    if (candidates.length === 0) return { findings: [], droppedFalsePositives: 0 };
    const reviewer = yield* Reviewer;

    // Promote a candidate to a surfaced finding with the given confidence and rationale.
    const surface = (
      candidate: CandidateFinding,
      confidence: number,
      justification: string | undefined,
    ): ValidatedFinding => ({
      ...candidate,
      verdict: 'real',
      confidence,
      justification,
      models: contributingModels(candidate.passIndices, plan),
    });

    const attempt = yield* reviewer
      .complete({
        modelKey: plan.validator.key,
        reasoning: plan.validator.reasoning,
        system: VALIDATOR_SYSTEM_PROMPT,
        user: buildValidatorUser(basePrompt, candidates),
        temperature: 0,
        stage: 'validate',
        signal,
      })
      .pipe(Effect.either);

    if (attempt._tag === 'Left') {
      // Fail open: surface candidates unvalidated rather than lose them.
      const unvalidated = candidates.map((candidate) =>
        surface(candidate, 0.5, '(validator unavailable — surfaced unvalidated)'),
      );
      return { findings: unvalidated, droppedFalsePositives: 0 };
    }

    const verdicts = parseVerdicts(attempt.right);
    const findings: ValidatedFinding[] = [];
    let droppedFalsePositives = 0;
    candidates.forEach((candidate, index) => {
      const ruling = verdicts.get(index);
      if (ruling?.verdict === 'false-positive') {
        droppedFalsePositives += 1;
        return;
      }
      // No ruling at all also lands here: the candidate is kept, not dropped.
      findings.push(surface(candidate, ruling?.confidence ?? 0.5, ruling?.justification));
    });
    return { findings, droppedFalsePositives };
  });
}
506
+
507
/**
 * Full pipeline: passes → bucket → vote → (validate) → ranked, capped result.
 *
 * When `config.validate` is off, the kept candidates are surfaced directly and
 * their confidence is the share of passes that voted for them (capped at 1).
 * Findings are sorted with `severitySort` and truncated to `config.maxFindings`.
 *
 * @param hooks - Optional `onStage` callback, invoked before the passes run and before validation.
 * @returns The capped findings plus telemetry describing each stage.
 */
export function runPipelineEffect(
  basePrompt: string,
  config: ReviewPipelineConfig,
  plan: ModelPlan,
  hooks: { onStage?: (stage: string) => void } = {},
  signal?: AbortSignal,
): Effect.Effect<PipelineResult, never, Reviewer> {
  return Effect.gen(function* () {
    const announce = (stage: string): void => {
      hooks.onStage?.(stage);
    };

    announce(`running ${config.passes} passes`);
    const { perPass, failedPasses } = yield* runPassesEffect(basePrompt, config, plan, signal);

    const buckets = bucketFindings(perPass);
    const { kept, droppedLowSignal } = selectCandidates(buckets, config);

    let ranked: ValidatedFinding[];
    let droppedFalsePositives = 0;
    if (!config.validate) {
      const totalPasses = Math.max(1, config.passes);
      ranked = kept.map((candidate) => ({
        ...candidate,
        verdict: 'real' as const,
        confidence: Math.min(1, candidate.votes / totalPasses),
        models: contributingModels(candidate.passIndices, plan),
      }));
    } else {
      announce(`validating ${kept.length} candidates`);
      const outcome = yield* validateCandidatesEffect(basePrompt, kept, plan, signal);
      ranked = outcome.findings;
      droppedFalsePositives = outcome.droppedFalsePositives;
    }

    ranked.sort(severitySort);
    const capped = ranked.slice(0, config.maxFindings);

    const telemetry: PipelineTelemetry = {
      passes: config.passes,
      passFindingCounts: perPass.map((findings) => findings.length),
      buckets: buckets.length,
      candidates: kept.length,
      validated: capped.length,
      droppedFalsePositives,
      droppedLowSignal,
      failedPasses,
      passModels: plan.passes.map((assignment) => assignment.label),
      validatorModel: plan.validator.label,
    };
    return { findings: capped, telemetry };
  });
}
556
+
557
/**
 * Promise wrapper around `runPipelineEffect`: supplies a `Reviewer` service
 * built from `resolution` and runs the pipeline to completion.
 */
export function runPipeline(
  resolution: ModelResolution,
  plan: ModelPlan,
  basePrompt: string,
  config: ReviewPipelineConfig,
  hooks: { onStage?: (stage: string) => void } = {},
  signal?: AbortSignal,
): Promise<PipelineResult> {
  const pipeline = runPipelineEffect(basePrompt, config, plan, hooks, signal);
  const withReviewer = pipeline.pipe(
    Effect.provideService(Reviewer, makeReviewerService(resolution)),
  );
  return Effect.runPromise(withReviewer);
}