@dreki-gg/pi-code-reviewer 0.4.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +63 -1
- package/extensions/code-reviewer/commands/review-init.ts +5 -1
- package/extensions/code-reviewer/commands/review-tool.ts +85 -9
- package/extensions/code-reviewer/commands/review.ts +36 -1
- package/extensions/code-reviewer/config.ts +73 -1
- package/extensions/code-reviewer/diff.ts +93 -17
- package/extensions/code-reviewer/effects/model.ts +112 -0
- package/extensions/code-reviewer/errors.ts +10 -1
- package/extensions/code-reviewer/model-plan.ts +84 -0
- package/extensions/code-reviewer/passes.ts +592 -0
- package/extensions/code-reviewer/reviewer.ts +115 -1
- package/extensions/code-reviewer/types.ts +115 -0
- package/package.json +1 -1
- package/skills/code-review/lenses/code-quality.md +16 -2
|
@@ -4,7 +4,7 @@ import { Effect } from 'effect';
|
|
|
4
4
|
|
|
5
5
|
import type { DiffSource } from './diff';
|
|
6
6
|
import { Executor, makeExecutorService } from './effects/exec';
|
|
7
|
-
import type { LensConfig, LensResult } from './types';
|
|
7
|
+
import type { LensConfig, LensResult, PipelineResult, ValidatedFinding } from './types';
|
|
8
8
|
|
|
9
9
|
const isWindows = platform() === 'win32';
|
|
10
10
|
|
|
@@ -97,6 +97,120 @@ export function buildDiffSection(diff: DiffSource): string {
|
|
|
97
97
|
return parts.join('\n');
|
|
98
98
|
}
|
|
99
99
|
|
|
100
|
+
/**
|
|
101
|
+
* Build the shared review body fed to every pipeline pass: the diff (once) plus
|
|
102
|
+
* each lens definition + its tool outputs, WITHOUT the legacy per-lens output
|
|
103
|
+
* instructions (the pipeline supplies its own adversarial instructions). The
|
|
104
|
+
* legacy single-pass fallback appends its instructions separately.
|
|
105
|
+
*/
|
|
106
|
+
export function buildReviewBasePrompt(lensSections: string[], diff: DiffSource): string {
|
|
107
|
+
return [
|
|
108
|
+
'## Changes',
|
|
109
|
+
'```',
|
|
110
|
+
diff.stat.trim() || '(no diffstat)',
|
|
111
|
+
'```',
|
|
112
|
+
'',
|
|
113
|
+
buildDiffSection(diff),
|
|
114
|
+
'',
|
|
115
|
+
'## Review lenses (project invariants to check)',
|
|
116
|
+
'',
|
|
117
|
+
...lensSections,
|
|
118
|
+
].join('\n');
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
const SEVERITY_EMOJI: Record<ValidatedFinding['severity'], string> = {
|
|
122
|
+
blocker: '🔴',
|
|
123
|
+
warning: '🟡',
|
|
124
|
+
note: '🔵',
|
|
125
|
+
};
|
|
126
|
+
|
|
127
|
+
/** A one-line model summary, shown only when a non-default model is in play. */
|
|
128
|
+
function renderModelLine(telemetry: PipelineResult['telemetry']): string[] {
|
|
129
|
+
const passKeys = new Set(telemetry.passModels);
|
|
130
|
+
const allDefault =
|
|
131
|
+
passKeys.size === 1 && passKeys.has('default') && telemetry.validatorModel === 'default';
|
|
132
|
+
if (allDefault) return [];
|
|
133
|
+
|
|
134
|
+
const passCounts = new Map<string, number>();
|
|
135
|
+
for (const key of telemetry.passModels) passCounts.set(key, (passCounts.get(key) ?? 0) + 1);
|
|
136
|
+
const passSummary = [...passCounts.entries()].map(([key, count]) => `${key}×${count}`).join(', ');
|
|
137
|
+
return [`Models — passes: ${passSummary}; validator: ${telemetry.validatorModel}.`];
|
|
138
|
+
}
|
|
139
|
+
|
|
140
|
+
/** Render the validated pipeline findings into a Markdown review report. */
|
|
141
|
+
export function renderPipelineReport(result: PipelineResult, diff: DiffSource): string {
|
|
142
|
+
const { findings, telemetry } = result;
|
|
143
|
+
const counts = {
|
|
144
|
+
blocker: findings.filter((finding) => finding.severity === 'blocker').length,
|
|
145
|
+
warning: findings.filter((finding) => finding.severity === 'warning').length,
|
|
146
|
+
note: findings.filter((finding) => finding.severity === 'note').length,
|
|
147
|
+
};
|
|
148
|
+
|
|
149
|
+
const header = [
|
|
150
|
+
`# Code Review — ${new Date().toISOString().slice(0, 10)}`,
|
|
151
|
+
'',
|
|
152
|
+
`Reviewed ${diff.label} across ${telemetry.passes} adversarial pass(es)` +
|
|
153
|
+
`${telemetry.failedPasses ? ` (${telemetry.failedPasses} failed)` : ''}.`,
|
|
154
|
+
'',
|
|
155
|
+
`**${findings.length} finding(s)** — ${counts.blocker} blocker, ${counts.warning} warning, ${counts.note} note.`,
|
|
156
|
+
`Pipeline: ${telemetry.buckets} buckets → ${telemetry.candidates} candidates → ${telemetry.validated} validated` +
|
|
157
|
+
` (dropped ${telemetry.droppedFalsePositives} false-positive, ${telemetry.droppedLowSignal} low-signal).`,
|
|
158
|
+
...renderModelLine(telemetry),
|
|
159
|
+
'',
|
|
160
|
+
];
|
|
161
|
+
|
|
162
|
+
// A pass fails when its model call errors; failures are swallowed into 0
|
|
163
|
+
// findings, so an all-failed run must NOT masquerade as a clean review.
|
|
164
|
+
const someFailed = telemetry.failedPasses > 0;
|
|
165
|
+
const allFailed = telemetry.passes > 0 && telemetry.failedPasses >= telemetry.passes;
|
|
166
|
+
const errSuffix = telemetry.passErrorSample ? ` — e.g. ${telemetry.passErrorSample}` : '';
|
|
167
|
+
|
|
168
|
+
if (findings.length === 0) {
|
|
169
|
+
if (allFailed) {
|
|
170
|
+
return [
|
|
171
|
+
...header,
|
|
172
|
+
`> ⚠️ **Inconclusive — all ${telemetry.passes} review pass(es) failed${errSuffix}.**`,
|
|
173
|
+
'> No analysis actually ran; this is NOT a clean result. Re-run the review',
|
|
174
|
+
'> (check that the review model / pi-ai is available) before trusting it.',
|
|
175
|
+
].join('\n');
|
|
176
|
+
}
|
|
177
|
+
if (someFailed) {
|
|
178
|
+
return [
|
|
179
|
+
...header,
|
|
180
|
+
`> ⚠️ **Partial review — ${telemetry.failedPasses}/${telemetry.passes} pass(es) failed${errSuffix}.**`,
|
|
181
|
+
`> The ${telemetry.passes - telemetry.failedPasses} surviving pass(es) found nothing, but coverage was reduced.`,
|
|
182
|
+
].join('\n');
|
|
183
|
+
}
|
|
184
|
+
return [...header, 'No bugs found that survived validation. ✅'].join('\n');
|
|
185
|
+
}
|
|
186
|
+
|
|
187
|
+
const partialWarning = someFailed
|
|
188
|
+
? [
|
|
189
|
+
`> ⚠️ **Partial review — ${telemetry.failedPasses}/${telemetry.passes} pass(es) failed${errSuffix}; findings below may be incomplete.**`,
|
|
190
|
+
'',
|
|
191
|
+
]
|
|
192
|
+
: [];
|
|
193
|
+
|
|
194
|
+
// Only attribute models per finding when more than one distinct model ran
|
|
195
|
+
// (a bake-off); with a single model it's noise.
|
|
196
|
+
const multiModel = new Set(telemetry.passModels).size > 1;
|
|
197
|
+
const lines = findings.map((finding) => {
|
|
198
|
+
const where = finding.line ? `\`${finding.file}:${finding.line}\`` : `\`${finding.file}\``;
|
|
199
|
+
const meta = [
|
|
200
|
+
`${finding.votes}/${telemetry.passes} votes`,
|
|
201
|
+
`${Math.round(finding.confidence * 100)}% conf`,
|
|
202
|
+
finding.category,
|
|
203
|
+
multiModel && finding.models.length > 0 ? `models: ${finding.models.join(', ')}` : undefined,
|
|
204
|
+
]
|
|
205
|
+
.filter(Boolean)
|
|
206
|
+
.join(', ');
|
|
207
|
+
const justification = finding.justification ? `\n ↳ ${finding.justification}` : '';
|
|
208
|
+
return `- ${SEVERITY_EMOJI[finding.severity]} **${finding.severity}** ${where} — ${finding.message} _(${meta})_${justification}`;
|
|
209
|
+
});
|
|
210
|
+
|
|
211
|
+
return [...header, ...partialWarning, '## Findings', '', ...lines].join('\n');
|
|
212
|
+
}
|
|
213
|
+
|
|
100
214
|
/** Build the lens-specific section of the review prompt (no diff duplication). */
|
|
101
215
|
export function buildLensSection(
|
|
102
216
|
lens: LensConfig,
|
|
@@ -25,6 +25,119 @@ export type LensResult = {
|
|
|
25
25
|
_lensSection?: string;
|
|
26
26
|
};
|
|
27
27
|
|
|
28
|
+
// ── Self-driving review pipeline (Bugbot-style) ──────────────────────────────
|
|
29
|
+
//
|
|
30
|
+
// The tool can run the review itself by driving the session's model through
|
|
31
|
+
// several parallel adversarial passes, bucketing + majority-voting the
|
|
32
|
+
// findings, then validating each survivor — instead of returning a prompt for
|
|
33
|
+
// a single downstream pass. The types below describe that pipeline's data.
|
|
34
|
+
|
|
35
|
+
/** A finding as emitted by one bug-finding pass (before bucketing). */
|
|
36
|
+
export type RawFinding = {
|
|
37
|
+
file: string;
|
|
38
|
+
line?: number;
|
|
39
|
+
severity: LensSeverity;
|
|
40
|
+
message: string;
|
|
41
|
+
/** Optional bug taxonomy tag the pass assigned (e.g. "boundary-input"). */
|
|
42
|
+
category?: string;
|
|
43
|
+
};
|
|
44
|
+
|
|
45
|
+
/** A merged bucket of near-duplicate raw findings across passes. */
|
|
46
|
+
export type CandidateFinding = RawFinding & {
|
|
47
|
+
/** Number of DISTINCT passes that independently surfaced this bucket. */
|
|
48
|
+
votes: number;
|
|
49
|
+
/** Indices of the passes that contributed (0-based). */
|
|
50
|
+
passIndices: number[];
|
|
51
|
+
};
|
|
52
|
+
|
|
53
|
+
/** A candidate after the validator stage has confirmed or refuted it. */
|
|
54
|
+
export type ValidatedFinding = CandidateFinding & {
|
|
55
|
+
verdict: 'real' | 'false-positive';
|
|
56
|
+
/** Validator confidence in `verdict`, 0..1. */
|
|
57
|
+
confidence: number;
|
|
58
|
+
justification?: string;
|
|
59
|
+
/** Distinct model keys whose passes contributed to this finding (for the
|
|
60
|
+
* model bake-off: "which model caught this"). */
|
|
61
|
+
models: string[];
|
|
62
|
+
};
|
|
63
|
+
|
|
64
|
+
/** Reasoning/thinking effort for a step (mirrors pi-ai's `ThinkingLevel`). */
|
|
65
|
+
export type ReasoningLevel = 'minimal' | 'low' | 'medium' | 'high' | 'xhigh';
|
|
66
|
+
|
|
67
|
+
/** A per-step model choice in config: either a bare spec string
|
|
68
|
+
* ("provider/id", id, or name) or that spec plus a reasoning level. */
|
|
69
|
+
export type ModelSpec = { model: string; reasoning?: ReasoningLevel };
|
|
70
|
+
export type ModelStepConfig = string | ModelSpec;
|
|
71
|
+
|
|
72
|
+
/** A resolved per-step assignment the pipeline runs against. `key` is either
|
|
73
|
+
* {@link DEFAULT_MODEL_KEY} (the session model) or a spec that resolved to a
|
|
74
|
+
* real model; `label` is the human display (key + reasoning). */
|
|
75
|
+
export type ModelAssignment = {
|
|
76
|
+
key: string;
|
|
77
|
+
label: string;
|
|
78
|
+
reasoning?: ReasoningLevel;
|
|
79
|
+
};
|
|
80
|
+
|
|
81
|
+
export type ModelPlan = {
|
|
82
|
+
/** Assignment for each pass, length === `passes` (round-robin from config). */
|
|
83
|
+
passes: ModelAssignment[];
|
|
84
|
+
/** Assignment for the validator stage. */
|
|
85
|
+
validator: ModelAssignment;
|
|
86
|
+
};
|
|
87
|
+
|
|
88
|
+
/** Counts describing what the pipeline did, for transparency in the report. */
|
|
89
|
+
export type PipelineTelemetry = {
|
|
90
|
+
passes: number;
|
|
91
|
+
passFindingCounts: number[];
|
|
92
|
+
buckets: number;
|
|
93
|
+
candidates: number;
|
|
94
|
+
validated: number;
|
|
95
|
+
droppedFalsePositives: number;
|
|
96
|
+
droppedLowSignal: number;
|
|
97
|
+
failedPasses: number;
|
|
98
|
+
/** A representative error message from the first failed pass, surfaced so a
|
|
99
|
+
* fully-failed run reports WHY instead of a misleading "0 findings". */
|
|
100
|
+
passErrorSample?: string;
|
|
101
|
+
/** Model key used for each pass (parallel to pass index). */
|
|
102
|
+
passModels: string[];
|
|
103
|
+
/** Model key used for the validator stage. */
|
|
104
|
+
validatorModel: string;
|
|
105
|
+
};
|
|
106
|
+
|
|
107
|
+
export type PipelineResult = {
|
|
108
|
+
findings: ValidatedFinding[];
|
|
109
|
+
telemetry: PipelineTelemetry;
|
|
110
|
+
};
|
|
111
|
+
|
|
112
|
+
/** Tunables for the self-driving pipeline (all overridable in config). */
|
|
113
|
+
export type ReviewPipelineConfig = {
|
|
114
|
+
/** Parallel adversarial bug-finding passes. 0 disables the pipeline
|
|
115
|
+
* (falls back to returning a single-pass review prompt). */
|
|
116
|
+
passes: number;
|
|
117
|
+
/** Run the validator stage that falsifies each surviving candidate. */
|
|
118
|
+
validate: boolean;
|
|
119
|
+
/** Min distinct passes a NOTE-severity bucket needs to survive pre-validation
|
|
120
|
+
* (blockers/warnings are never dropped for low votes). */
|
|
121
|
+
minVotes: number;
|
|
122
|
+
/** Max passes run concurrently. */
|
|
123
|
+
concurrency: number;
|
|
124
|
+
/** Base sampling temperature; each pass adds a small deterministic jitter so
|
|
125
|
+
* passes diverge instead of collapsing onto identical reasoning. */
|
|
126
|
+
temperature: number;
|
|
127
|
+
/** Hard cap on findings returned (safety valve against runaway output). */
|
|
128
|
+
maxFindings: number;
|
|
129
|
+
/** Model for ALL passes — a spec string or `{ model, reasoning }`. Omitted →
|
|
130
|
+
* session model. Overridden per-pass by {@link passModels}. */
|
|
131
|
+
passModel?: ModelStepConfig;
|
|
132
|
+
/** Models rotated round-robin across passes — run the same diff through
|
|
133
|
+
* several models/reasoning levels in one review (a bake-off). Overrides
|
|
134
|
+
* `passModel`. */
|
|
135
|
+
passModels?: ModelStepConfig[];
|
|
136
|
+
/** Model for the validator stage — a spec string or `{ model, reasoning }`.
|
|
137
|
+
* Omitted → session model. */
|
|
138
|
+
validateModel?: ModelStepConfig;
|
|
139
|
+
};
|
|
140
|
+
|
|
28
141
|
// NOTE: findings + summary on LensResult describe what the agent produces in
|
|
29
142
|
// its follow-up message; the tool/command layer emits a review *task*, it does
|
|
30
143
|
// not parse findings back into a rendered report.
|
|
@@ -38,4 +151,6 @@ export type ReviewConfig = {
|
|
|
38
151
|
/** Max lens tools run in parallel. Tools are deduped across lenses first,
|
|
39
152
|
* so this bounds the distinct command set, not lens count. */
|
|
40
153
|
toolConcurrency: number;
|
|
154
|
+
/** Self-driving pipeline tunables (see {@link ReviewPipelineConfig}). */
|
|
155
|
+
review: ReviewPipelineConfig;
|
|
41
156
|
};
|
package/package.json
CHANGED
(diff content not captured)
package/skills/code-review/lenses/code-quality.md
CHANGED
|
@@ -10,11 +10,25 @@ Evaluates changes for correctness, dead code introduction, and adherence to proj
|
|
|
10
10
|
- Are there any obvious bugs or logic errors?
|
|
11
11
|
- Does the code avoid known anti-patterns for the project's framework?
|
|
12
12
|
|
|
13
|
+
### Adversarial inputs (enumerate, don't assume)
|
|
14
|
+
For each changed function, construct the edge inputs that break it rather than
|
|
15
|
+
trusting the happy path or the surrounding comment:
|
|
16
|
+
- `null` / `undefined` / `NaN` / `Infinity` / `-0` / `""` / `[]` / `{}` / huge /
|
|
17
|
+
negative / duplicate / out-of-order / unicode.
|
|
18
|
+
- Type guards that the wrong value defeats: `typeof NaN === "number"`,
|
|
19
|
+
`typeof null === "object"`, `0`/`""`/`NaN` as falsy, `JSON.parse` of
|
|
20
|
+
attacker input. Prefer `Number.isFinite` / explicit checks.
|
|
21
|
+
- **Claim-vs-code audit:** for every comment or test that asserts an invariant
|
|
22
|
+
("non-numeric falls through", "never empty") — look for an input that violates it
|
|
23
|
+
and confirm the code actually enforces the claim.
|
|
24
|
+
- Off-by-one, boundary indices, wrong id/key space, missing `await`, swallowed
|
|
25
|
+
errors, unhandled rejection, cancellation/abort paths.
|
|
26
|
+
|
|
13
27
|
## Tools
|
|
14
28
|
- `bun run typecheck`
|
|
15
29
|
- `bun run lint`
|
|
16
30
|
|
|
17
31
|
## Severity
|
|
18
|
-
- blocker: Type errors, unresolved imports, obvious bugs, unhandled error paths
|
|
19
|
-
- warning: New lint violations, unused code, inconsistent naming
|
|
32
|
+
- blocker: Type errors, unresolved imports, obvious bugs, unhandled error paths, an edge input (NaN/empty/boundary) that crashes or corrupts data on a path users hit
|
|
33
|
+
- warning: New lint violations, unused code, inconsistent naming, an unguarded edge input on a lower-risk path, a comment/test claim the code does not actually honor
|
|
20
34
|
- note: Style suggestions, minor improvements
|