mstro-app 0.4.13 → 0.4.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77) hide show
  1. package/dist/server/cli/headless/claude-invoker-process.d.ts.map +1 -1
  2. package/dist/server/cli/headless/claude-invoker-process.js +5 -1
  3. package/dist/server/cli/headless/claude-invoker-process.js.map +1 -1
  4. package/dist/server/services/file-explorer-ops.d.ts +1 -1
  5. package/dist/server/services/file-explorer-ops.d.ts.map +1 -1
  6. package/dist/server/services/file-explorer-ops.js +7 -2
  7. package/dist/server/services/file-explorer-ops.js.map +1 -1
  8. package/dist/server/services/plan/composer.d.ts +1 -1
  9. package/dist/server/services/plan/composer.d.ts.map +1 -1
  10. package/dist/server/services/plan/composer.js +3 -2
  11. package/dist/server/services/plan/composer.js.map +1 -1
  12. package/dist/server/services/plan/executor.d.ts +5 -0
  13. package/dist/server/services/plan/executor.d.ts.map +1 -1
  14. package/dist/server/services/plan/executor.js +32 -1
  15. package/dist/server/services/plan/executor.js.map +1 -1
  16. package/dist/server/services/plan/parser-core.d.ts.map +1 -1
  17. package/dist/server/services/plan/parser-core.js +1 -0
  18. package/dist/server/services/plan/parser-core.js.map +1 -1
  19. package/dist/server/services/plan/review-gate.d.ts +2 -0
  20. package/dist/server/services/plan/review-gate.d.ts.map +1 -1
  21. package/dist/server/services/plan/review-gate.js +25 -3
  22. package/dist/server/services/plan/review-gate.js.map +1 -1
  23. package/dist/server/services/plan/types.d.ts +2 -0
  24. package/dist/server/services/plan/types.d.ts.map +1 -1
  25. package/dist/server/services/sandbox-utils.d.ts +3 -1
  26. package/dist/server/services/sandbox-utils.d.ts.map +1 -1
  27. package/dist/server/services/sandbox-utils.js +6 -3
  28. package/dist/server/services/sandbox-utils.js.map +1 -1
  29. package/dist/server/services/websocket/file-explorer-handlers.js +2 -1
  30. package/dist/server/services/websocket/file-explorer-handlers.js.map +1 -1
  31. package/dist/server/services/websocket/git-log-handlers.d.ts.map +1 -1
  32. package/dist/server/services/websocket/git-log-handlers.js +29 -9
  33. package/dist/server/services/websocket/git-log-handlers.js.map +1 -1
  34. package/dist/server/services/websocket/git-worktree-handlers.js +8 -0
  35. package/dist/server/services/websocket/git-worktree-handlers.js.map +1 -1
  36. package/dist/server/services/websocket/handler.d.ts.map +1 -1
  37. package/dist/server/services/websocket/handler.js +5 -3
  38. package/dist/server/services/websocket/handler.js.map +1 -1
  39. package/dist/server/services/websocket/plan-execution-handlers.d.ts.map +1 -1
  40. package/dist/server/services/websocket/plan-execution-handlers.js +4 -1
  41. package/dist/server/services/websocket/plan-execution-handlers.js.map +1 -1
  42. package/dist/server/services/websocket/plan-helpers.js +1 -1
  43. package/dist/server/services/websocket/plan-helpers.js.map +1 -1
  44. package/dist/server/services/websocket/quality-handlers.d.ts +1 -1
  45. package/dist/server/services/websocket/quality-handlers.d.ts.map +1 -1
  46. package/dist/server/services/websocket/quality-handlers.js +67 -14
  47. package/dist/server/services/websocket/quality-handlers.js.map +1 -1
  48. package/dist/server/services/websocket/quality-persistence.d.ts +2 -0
  49. package/dist/server/services/websocket/quality-persistence.d.ts.map +1 -1
  50. package/dist/server/services/websocket/quality-persistence.js +33 -2
  51. package/dist/server/services/websocket/quality-persistence.js.map +1 -1
  52. package/dist/server/services/websocket/quality-review-agent.d.ts +33 -0
  53. package/dist/server/services/websocket/quality-review-agent.d.ts.map +1 -1
  54. package/dist/server/services/websocket/quality-review-agent.js +360 -72
  55. package/dist/server/services/websocket/quality-review-agent.js.map +1 -1
  56. package/dist/server/services/websocket/quality-types.d.ts +3 -0
  57. package/dist/server/services/websocket/quality-types.d.ts.map +1 -1
  58. package/dist/server/services/websocket/quality-types.js.map +1 -1
  59. package/package.json +1 -1
  60. package/server/cli/headless/claude-invoker-process.ts +6 -1
  61. package/server/services/file-explorer-ops.ts +7 -2
  62. package/server/services/plan/composer.ts +3 -1
  63. package/server/services/plan/executor.ts +32 -1
  64. package/server/services/plan/parser-core.ts +1 -0
  65. package/server/services/plan/review-gate.ts +28 -3
  66. package/server/services/plan/types.ts +2 -0
  67. package/server/services/sandbox-utils.ts +7 -3
  68. package/server/services/websocket/file-explorer-handlers.ts +2 -1
  69. package/server/services/websocket/git-log-handlers.ts +30 -9
  70. package/server/services/websocket/git-worktree-handlers.ts +9 -0
  71. package/server/services/websocket/handler.ts +6 -3
  72. package/server/services/websocket/plan-execution-handlers.ts +4 -1
  73. package/server/services/websocket/plan-helpers.ts +1 -1
  74. package/server/services/websocket/quality-handlers.ts +69 -9
  75. package/server/services/websocket/quality-persistence.ts +32 -2
  76. package/server/services/websocket/quality-review-agent.ts +427 -72
  77. package/server/services/websocket/quality-types.ts +3 -0
@@ -7,6 +7,8 @@
7
7
  * Builds the review prompt, runs the agent, parses findings, and persists results.
8
8
  */
9
9
 
10
+ import { existsSync, readFileSync } from 'node:fs';
11
+ import { isAbsolute, join } from 'node:path';
10
12
  import { runWithFileLogger } from '../../cli/headless/headless-logger.js';
11
13
  import { HeadlessRunner } from '../../cli/headless/index.js';
12
14
  import type { ToolUseEvent } from '../../cli/headless/types.js';
@@ -25,6 +27,9 @@ export interface CodeReviewFinding {
25
27
  title: string;
26
28
  description: string;
27
29
  suggestion?: string;
30
+ evidence?: string;
31
+ verified?: boolean;
32
+ verificationNote?: string;
28
33
  }
29
34
 
30
35
  // ── Prompt ────────────────────────────────────────────────────
@@ -66,11 +71,35 @@ ${cliFindingsSection}
66
71
  ### Performance
67
72
  - N+1 queries, unnecessary re-renders, missing memoization, blocking I/O in hot paths, unbounded data structures, missing pagination
68
73
 
74
+ ## CRITICAL — Structured Evidence Requirement
75
+
76
+ For EACH finding, you MUST provide structured evidence that grounds the finding in actual code. This is required to prevent false positives.
77
+
78
+ For each finding, use this reasoning process:
79
+
80
+ 1. **PREMISE**: State the observable fact from the code. Quote the exact code you see.
81
+ 2. **CONTEXT**: What is the surrounding code doing? Are there guards, fixes, or patterns elsewhere that might handle this?
82
+ 3. **COUNTER-CHECK**: Actively look for evidence that CONTRADICTS your finding. Check for:
83
+ - Guards or validation earlier in the call chain
84
+ - Error handling wrapping the code
85
+ - Configuration that changes behavior (e.g., NODE_ENV checks)
86
+ - Comments explaining intentional design choices
87
+ 4. **CONCLUSION**: Only report the finding if you could not find contradicting evidence.
88
+
89
+ ### Common False Positive Patterns to AVOID
90
+
91
+ - Claiming a function uses API X when it actually uses API Y (e.g., claiming Math.random() when code uses crypto.randomInt()) — ALWAYS quote the actual function call
92
+ - Claiming a header/value is leaked when code already deletes/filters it — READ the full function
93
+ - Claiming there's no guard when a condition check exists nearby — READ surrounding lines
94
+ - Claiming N fields/methods when the actual count differs — COUNT explicitly
95
+ - Claiming a resource leaks when cleanup exists in a different handler — SEARCH for the cleanup code
96
+
69
97
  ## Rules
70
98
 
71
- - Only report findings you are >80% confident about. No speculative or low-confidence issues.
99
+ - Only report findings you are >90% confident about after completing the counter-check step.
72
100
  - Focus on architecture, SOLID violations, bugs, and security over style nits.
73
101
  - Each finding MUST reference a specific file and line number. Do not report vague or file-level issues.
102
+ - Each finding MUST include an "evidence" field with the exact code snippet (1-5 lines) proving the issue exists.
74
103
  - Limit to the 25 most important findings, ranked by severity.
75
104
  - Do NOT modify any files. This is a read-only review.
76
105
  - Be HONEST about the overall quality. A codebase with serious issues should score low.
@@ -103,7 +132,8 @@ After your analysis, output EXACTLY one JSON code block with your findings. No o
103
132
  "line": 42,
104
133
  "title": "Short title describing the issue",
105
134
  "description": "What the problem is and why it matters.",
106
- "suggestion": "How to fix it."
135
+ "suggestion": "How to fix it.",
136
+ "evidence": "const token = Math.random().toString(36) // exact code from file proving the issue"
107
137
  }
108
138
  ],
109
139
  "summary": "Brief 1-2 sentence summary of overall code quality."
@@ -126,6 +156,7 @@ function normalizeFinding(f: Record<string, unknown>): CodeReviewFinding | null
126
156
  title: f.title as string,
127
157
  description: typeof f.description === 'string' ? f.description : '',
128
158
  suggestion: typeof f.suggestion === 'string' ? f.suggestion : undefined,
159
+ evidence: typeof f.evidence === 'string' ? f.evidence : undefined,
129
160
  };
130
161
  }
131
162
 
@@ -167,6 +198,276 @@ export function parseCodeReviewResponse(response: string): CodeReviewResult {
167
198
  }
168
199
  }
169
200
 
201
+ // ── Phase 3: Deterministic post-validation ───────────────────
202
+ //
203
+ // Fast grep/file-based checks that catch hallucinated references
204
+ // before the more expensive LLM verification pass.
205
+
206
/** Outcome of the deterministic, file-based checks for one finding. */
interface ValidationResult {
  /** The finding that was checked, passed through unchanged. */
  finding: CodeReviewFinding;
  /** `false` when one of the checks failed. */
  valid: boolean;
  /** Human-readable explanation of the failed check; only set when `valid` is false. */
  reason?: string;
}
211
+
212
/**
 * Turn a finding's file reference into a path that can be opened.
 * An absolute reference is returned untouched; anything else is joined onto `dirPath`.
 */
function resolveFilePath(dirPath: string, filePath: string): string {
  return isAbsolute(filePath) ? filePath : join(dirPath, filePath);
}
216
+
217
/**
 * Extract keywords/identifiers that the finding's text claims exist in the code.
 *
 * Two sources are scanned in `title + ' ' + description`:
 *  1. Anything shaped like a call (`name(` or `Obj.method(`), except
 *     sentence-starting English words and control-flow keywords, which are
 *     followed by "(" in prose or quoted code without naming a function.
 *  2. Backtick-quoted snippets, with brackets, braces, semicolons and commas
 *     replaced by spaces. Snippets of 60 or more characters are ignored.
 *
 * @param description Finding description text.
 * @param title Finding title text.
 * @returns Candidates in discovery order (call-like names first, then backtick
 *          snippets). Duplicates are not removed.
 */
function extractClaimedIdentifiers(description: string, title: string): string[] {
  // Built once per call rather than once per regex match.
  const nonIdentifiers = new Set([
    // Capitalised English words that commonly precede "(" in prose.
    'If', 'When', 'While', 'For', 'This', 'That', 'The', 'Each', 'Uses', 'Has', 'Does', 'Returns', 'Takes', 'Calls',
    // Control-flow keywords: `if (x)` in a quoted snippet is not a claimed function.
    'if', 'for', 'while', 'switch', 'catch',
  ]);

  const combined = `${title} ${description}`;
  const identifiers: string[] = [];

  // Match function calls: functionName(), ClassName.method()
  for (const match of combined.matchAll(/\b([a-zA-Z_$][\w.$]*)\s*\(/g)) {
    const name = match[1];
    if (!nonIdentifiers.has(name)) identifiers.push(name);
  }

  // Match backtick-quoted code: `someCode`
  for (const match of combined.matchAll(/`([^`]+)`/g)) {
    const inner = match[1].replace(/[()[\]{};,]/g, ' ').trim();
    if (inner && inner.length < 60) identifiers.push(inner);
  }

  return identifiers;
}
247
+
248
/**
 * Run the deterministic file-based checks over every finding and partition the results.
 *
 * - `validated`: findings that passed, with `verified` explicitly left undefined
 *   (this phase does not confirm a finding, it only fails to disprove it).
 * - `rejected`: findings that failed, with `verified: false` and the failure
 *   reason stored in `verificationNote`.
 * - `stats`: input size plus the size of each partition.
 *
 * No finding is dropped here; each input appears in exactly one of the two lists.
 */
export function validateFindings(
  findings: CodeReviewFinding[],
  dirPath: string,
): { validated: CodeReviewFinding[]; rejected: CodeReviewFinding[]; stats: { total: number; passed: number; failed: number } } {
  const validated: CodeReviewFinding[] = [];
  const rejected: CodeReviewFinding[] = [];

  for (const finding of findings) {
    const { valid, reason } = validateSingleFinding(finding, dirPath);
    if (!valid) {
      rejected.push({ ...finding, verified: false, verificationNote: reason });
      continue;
    }
    validated.push({ ...finding, verified: undefined });
  }

  const stats = { total: findings.length, passed: validated.length, failed: rejected.length };
  return { validated, rejected, stats };
}
274
+
275
/**
 * Read a file as UTF-8 text.
 *
 * @returns The file's contents, or `null` when the read throws for any reason.
 */
function readFileContent(filePath: string): string | null {
  let text: string | null = null;
  try {
    text = readFileSync(filePath, 'utf-8');
  } catch {
    // Any read failure is reported to the caller as null.
  }
  return text;
}
282
+
283
/**
 * Matches a token that is, in its entirety, one of a fixed list of JavaScript
 * keywords/literals. Used to ignore tokens that say nothing about whether a
 * quoted snippet really comes from a given file.
 */
const COMMON_KEYWORDS = new RegExp(
  `^(${[
    'const', 'let', 'var', 'function', 'return', 'import', 'export', 'from', 'this', 'true',
    'false', 'null', 'undefined', 'new', 'if', 'else', 'for', 'while', 'try', 'catch',
  ].join('|')})$`,
);
284
+
285
/**
 * Check that a cited line number exists in the file.
 *
 * @param content Full text of the cited file.
 * @param line Line number from the finding, counted from 1. `null` and
 *             non-positive values are not checked.
 * @returns An error message when `line` is past the last line, otherwise `null`.
 */
function checkLineInRange(content: string, line: number | null): string | null {
  if (line === null || line <= 0) return null;

  const segments = content.split('\n');
  // A trailing newline terminates the last line rather than starting a new one,
  // so the empty segment after it must not be counted as a line.
  const lineCount = content.endsWith('\n') ? segments.length - 1 : segments.length;

  if (line > lineCount) return `Line ${line} exceeds file length (${lineCount} lines)`;
  return null;
}
291
+
292
/**
 * Sanity-check a finding's quoted evidence against the file text.
 *
 * The evidence is stripped of quotes, brackets, braces and semicolons, split on
 * whitespace, and reduced to tokens longer than three characters that are not
 * COMMON_KEYWORDS.
 *
 * @returns An error message only when at least one such token exists and none
 *          of them occurs anywhere in `content`; otherwise `null` (including
 *          when no evidence was supplied).
 */
function checkEvidenceTokens(content: string, evidence: string | undefined): string | null {
  if (!evidence) return null;

  const significant: string[] = [];
  for (const piece of evidence.replace(/['"`;{}[\]()]/g, ' ').split(/\s+/)) {
    if (piece.length > 3 && !COMMON_KEYWORDS.test(piece)) significant.push(piece);
  }
  if (significant.length === 0) return null;

  const anyPresent = significant.some(token => content.includes(token));
  return anyPresent ? null : `Evidence tokens not found in file: ${significant.slice(0, 3).join(', ')}`;
}
303
+
304
/**
 * Check that at least one identifier named in the finding's title/description
 * actually appears in the cited file.
 *
 * Dotted names (`obj.method`) are matched as plain substrings. Other names must
 * appear as a whole identifier, i.e. not directly preceded or followed by a
 * word character or `$`.
 *
 * @returns An error message when two or more identifiers were claimed and none
 *          is present in `content`; otherwise `null`.
 */
function checkClaimedIdentifiers(content: string, finding: CodeReviewFinding): string | null {
  const claimedIds = extractClaimedIdentifiers(finding.description, finding.title);
  // Fewer than two claimed names is not treated as enough signal to reject on.
  if (claimedIds.length < 2) return null;

  const foundAny = claimedIds.some(id => {
    if (id.includes('.')) return content.includes(id);
    const escaped = id.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    // Identifier-aware boundaries instead of `\b`: `\b` only sits between a word
    // and a non-word character, so it never matches in front of a leading `$`
    // (e.g. `$scope`) unless a word character happens to precede it.
    return new RegExp(`(?<![\\w$])${escaped}(?![\\w$])`).test(content);
  });

  if (!foundAny) return `Claimed identifiers not found in file: ${claimedIds.slice(0, 3).join(', ')}`;
  return null;
}
314
+
315
/**
 * Run the deterministic checks for one finding, stopping at the first failure.
 *
 * Order: the cited file exists, the cited line is within the file, the evidence
 * tokens occur in the file, the claimed identifiers occur in the file.
 *
 * @param finding Finding produced by the review pass.
 * @param dirPath Directory that relative `finding.file` paths are resolved against.
 * @returns `valid: false` with a `reason` for the first failed check, otherwise `valid: true`.
 */
function validateSingleFinding(finding: CodeReviewFinding, dirPath: string): ValidationResult {
  const filePath = resolveFilePath(dirPath, finding.file);

  if (!existsSync(filePath)) {
    return { finding, valid: false, reason: `File does not exist: ${finding.file}` };
  }

  const content = readFileContent(filePath);
  // Only a failed read (null) gets the benefit of the doubt. An empty file is
  // readable and must still go through the checks, otherwise any finding that
  // cites an empty file would pass unexamined.
  if (content === null) return { finding, valid: true };

  const failure =
    checkLineInRange(content, finding.line) ??
    checkEvidenceTokens(content, finding.evidence) ??
    checkClaimedIdentifiers(content, finding);

  return failure === null ? { finding, valid: true } : { finding, valid: false, reason: failure };
}
336
+
337
+ // ── Phase 2: LLM verification pass ──────────────────────────
338
+ //
339
+ // Runs a second headless Claude pass that independently verifies
340
+ // each finding against the actual code.
341
+
342
+ export function buildVerificationPrompt(
343
+ dirPath: string,
344
+ findings: CodeReviewFinding[],
345
+ ): string {
346
+ const findingsJson = findings.map((f, i) => ({
347
+ id: i + 1,
348
+ severity: f.severity,
349
+ category: f.category,
350
+ file: f.file,
351
+ line: f.line,
352
+ title: f.title,
353
+ description: f.description,
354
+ evidence: f.evidence || '(none provided)',
355
+ }));
356
+
357
+ return `You are an independent code review VERIFIER. A separate reviewer produced the findings below. Your job is to VERIFY each finding against the actual code. You are a skeptic — do NOT trust the original reviewer's claims.
358
+
359
+ IMPORTANT: Your current working directory is "${dirPath}". Only read files within this directory.
360
+
361
+ ## Findings to Verify
362
+
363
+ ${JSON.stringify(findingsJson, null, 2)}
364
+
365
+ ## Verification Process
366
+
367
+ For EACH finding:
368
+
369
+ 1. **Read the cited file and line** using the Read tool. Read at least 20 lines around the cited line for context.
370
+ 2. **Check the specific claim** in the description. Does the code actually do what the finding claims?
371
+ 3. **Search for counter-evidence**:
372
+ - If the finding claims something is missing (no validation, no cleanup, no guard): search for it with Grep
373
+ - If the finding claims an API is used: verify the actual API call at that line
374
+ - If the finding claims a value is leaked/exposed: check if it's filtered/deleted elsewhere in the same function
375
+ 4. **Verdict**: Mark as "confirmed" or "rejected" with a brief explanation
376
+
377
+ ## Rules
378
+
379
+ - You MUST actually Read each cited file. Do not rely on memory or assumptions.
380
+ - Use Grep to search for patterns the finding claims exist (or don't exist).
381
+ - A finding is "rejected" if:
382
+ - The code does NOT match what the description claims
383
+ - There IS a guard/fix that the finding claims is missing
384
+ - The line number doesn't contain the relevant code
385
+ - The finding is about a different version of the code than what exists now
386
+ - A finding is "confirmed" if you can independently verify the issue exists in the current code.
387
+ - Be thorough but efficient — focus verification effort on high/critical severity findings.
388
+
389
+ ## Output
390
+
391
+ Output EXACTLY one JSON code block. No other text after the JSON block.
392
+
393
+ \`\`\`json
394
+ {
395
+ "verifications": [
396
+ {
397
+ "id": 1,
398
+ "verdict": "confirmed|rejected",
399
+ "confidence": 0.95,
400
+ "note": "Brief explanation of what you found when checking the code"
401
+ }
402
+ ]
403
+ }
404
+ \`\`\``;
405
+ }
406
+
407
/** One entry of the verifier's `verifications` array after normalisation. */
interface VerificationVerdict {
  /** Identifier of the finding this verdict refers to, as echoed back by the verifier. */
  id: number;
  /** Whether the verifier confirmed or rejected the finding. */
  verdict: 'confirmed' | 'rejected';
  /** Verifier's stated confidence in the verdict. */
  confidence: number;
  /** Verifier's explanation; empty string when none was given. */
  note: string;
}
413
+
414
/**
 * Parse the verifier's response into a list of verdicts.
 *
 * Entries are kept only when they are objects with a numeric `id` and a
 * `verdict` that is, ignoring case and surrounding whitespace, exactly
 * "confirmed" or "rejected". Any other verdict text is dropped rather than
 * counted as a confirmation, so the corresponding finding simply has no verdict.
 * A missing or non-numeric `confidence` defaults to 0.5; a missing or
 * non-string `note` defaults to the empty string.
 *
 * @param response Raw assistant response expected to contain one JSON block.
 * @returns The recognised verdicts, or an empty array when the JSON cannot be
 *          parsed or has no `verifications` array.
 */
export function parseVerificationResponse(response: string): VerificationVerdict[] {
  const jsonStr = extractJson(response);

  let parsed: unknown;
  try {
    parsed = JSON.parse(jsonStr);
  } catch {
    return [];
  }
  if (typeof parsed !== 'object' || parsed === null) return [];

  const raw: unknown = (parsed as { verifications?: unknown }).verifications;
  if (!Array.isArray(raw)) return [];

  const verdicts: VerificationVerdict[] = [];
  for (const entry of raw) {
    // Skip malformed entries individually so one bad element (e.g. null)
    // does not discard every other verdict.
    if (typeof entry !== 'object' || entry === null) continue;
    const v = entry as Record<string, unknown>;
    if (typeof v.id !== 'number' || typeof v.verdict !== 'string') continue;

    const verdict = v.verdict.trim().toLowerCase();
    if (verdict !== 'confirmed' && verdict !== 'rejected') continue;

    verdicts.push({
      id: v.id,
      verdict,
      confidence: typeof v.confidence === 'number' ? v.confidence : 0.5,
      note: typeof v.note === 'string' ? v.note : '',
    });
  }
  return verdicts;
}
432
+
433
/**
 * Apply verification verdicts to findings.
 *
 * A verdict is matched to a finding by position: the finding at index `i` uses
 * the verdict whose `id` is `i + 1`.
 *
 * - No verdict: the finding is kept in `verified` with `verified` left undefined.
 * - Verdict "confirmed" with confidence of at least 0.6: kept in `verified`
 *   with `verified: true`.
 * - Anything else: moved to `rejected` with `verified: false`.
 *
 * In the last two cases the verdict's note (if non-empty) is stored in `verificationNote`.
 */
export function applyVerification(
  findings: CodeReviewFinding[],
  verdicts: VerificationVerdict[],
): { verified: CodeReviewFinding[]; rejected: CodeReviewFinding[] } {
  const byId = new Map<number, VerificationVerdict>();
  for (const v of verdicts) byId.set(v.id, v);

  const verified: CodeReviewFinding[] = [];
  const rejected: CodeReviewFinding[] = [];

  for (const [index, finding] of findings.entries()) {
    const verdict = byId.get(index + 1);
    if (verdict === undefined) {
      verified.push({ ...finding, verified: undefined });
      continue;
    }

    const accepted = verdict.verdict === 'confirmed' && verdict.confidence >= 0.6;
    const annotated: CodeReviewFinding = {
      ...finding,
      verified: accepted,
      verificationNote: verdict.note || undefined,
    };
    (accepted ? verified : rejected).push(annotated);
  }

  return { verified, rejected };
}
470
+
170
471
  // ── Progress tracking ─────────────────────────────────────────
171
472
 
172
473
  const TOOL_START_MESSAGES: Record<string, string> = {
@@ -202,6 +503,99 @@ function createCodeReviewProgressTracker() {
202
503
  };
203
504
  }
204
505
 
506
+ // ── Handler helpers ───────────────────────────────────────────
507
+
508
/** Callback that reports one progress message. */
type ProgressSender = (message: string) => void;

/**
 * Bind `ctx`, `ws` and `reportPath` into a ProgressSender. Each call sends a
 * `qualityCodeReviewProgress` message carrying the report path and the text.
 */
function makeProgressSender(ctx: HandlerContext, ws: WSContext, reportPath: string): ProgressSender {
  return function sendProgress(message: string): void {
    const data = { path: reportPath, message };
    ctx.send(ws, { type: 'qualityCodeReviewProgress', data });
  };
}
515
+
516
/**
 * Build a tool-use callback that turns tool events into progress messages.
 *
 * A fresh progress tracker is created per callback. Events for which the
 * tracker yields no message are ignored; otherwise the message is sent,
 * prefixed with `prefix` when one is given.
 */
function makeToolCallback(send: ProgressSender, prefix?: string): (event: ToolUseEvent) => void {
  const describe = createCodeReviewProgressTracker();
  return (event: ToolUseEvent) => {
    const text = describe(event);
    if (!text) return;
    send(prefix ? `${prefix}${text}` : text);
  };
}
523
+
524
/**
 * Fetch the `findings` of the report stored at `reportPath`, if any.
 *
 * @returns The stored findings, or `undefined` when there is no report or when
 *          obtaining the persistence layer or loading the report throws.
 */
function loadCliFindings(
  getPersistence: (dir: string) => QualityPersistence,
  workingDir: string,
  reportPath: string,
): Array<{ severity: string; category: string; file: string; line: number | null; title: string; description: string }> | undefined {
  try {
    return getPersistence(workingDir).loadReport(reportPath)?.findings;
  } catch {
    // Best effort: the review proceeds without previously stored findings.
    return undefined;
  }
}
537
+
538
/**
 * Run a second headless pass that verifies `findings`, and return the survivors.
 *
 * Progress is reported through `send`. When the verifier response yields no
 * verdicts at all, the input findings are returned unchanged. Otherwise the
 * verdicts are applied and only the `verified` partition is returned.
 * Errors from the runner are not caught here.
 */
async function runVerificationPass(
  dirPath: string,
  findings: CodeReviewFinding[],
  send: ProgressSender,
): Promise<CodeReviewFinding[]> {
  send(`Verifying ${findings.length} findings against actual code...`);

  const prompt = buildVerificationPrompt(dirPath, findings);
  const onToolUse = makeToolCallback(send, 'Verifying: ');
  const runner = new HeadlessRunner({
    workingDir: dirPath,
    directPrompt: prompt,
    stallWarningMs: 120_000,
    stallKillMs: 300_000,
    stallHardCapMs: 600_000,
    toolUseCallback: onToolUse,
  });

  const { assistantResponse } = await runWithFileLogger('code-review-verify', () => runner.run());
  const verdicts = parseVerificationResponse(assistantResponse || '');
  if (verdicts.length === 0) return findings;

  const outcome = applyVerification(findings, verdicts);
  if (outcome.rejected.length > 0) {
    send(`Verification rejected ${outcome.rejected.length} inaccurate finding(s)`);
  }
  return outcome.verified;
}
565
+
566
/**
 * Persist a code review and, when a report already exists, fold the review into it.
 *
 * - No existing report: only the code review (findings + summary) is saved and
 *   `null` is returned.
 * - Existing report and the review carries both a score and a grade: those
 *   replace the report's `overall` and `grade`.
 * - Existing report without a complete score/grade: the report is recomputed
 *   via `recomputeWithAiReview`.
 *
 * In both existing-report cases the findings are stored as `codeReview`, the
 * report is saved, a history entry is appended, and the code review is saved.
 *
 * @returns The updated report, or `null` when there was no report to update.
 */
function persistReviewResults(
  reviewResult: CodeReviewResult,
  reportPath: string,
  getPersistence: (dir: string) => QualityPersistence,
  workingDir: string,
): import('./quality-service.js').QualityResults | null {
  const persistence = getPersistence(workingDir);
  const existingReport = persistence.loadReport(reportPath);
  const { findings, summary, score, grade } = reviewResult;
  const saveReview = (): void => {
    persistence.saveCodeReview(reportPath, findings as unknown as Record<string, unknown>[], summary);
  };

  if (!existingReport) {
    saveReview();
    return null;
  }

  let updatedResults: import('./quality-service.js').QualityResults;
  if (score === null || grade === null) {
    const recomputed: import('./quality-service.js').QualityResults = recomputeWithAiReview(existingReport, findings);
    updatedResults = { ...recomputed, codeReview: findings as unknown as typeof recomputed.codeReview };
  } else {
    updatedResults = {
      ...existingReport,
      overall: score,
      grade,
      codeReview: findings as unknown as typeof existingReport.codeReview,
      scoreRationale: reviewResult.scoreRationale ?? undefined,
    };
  }

  persistence.saveReport(reportPath, updatedResults);
  persistence.appendHistory(updatedResults, reportPath);
  saveReview();
  return updatedResults;
}
598
+
205
599
  // ── Handler ───────────────────────────────────────────────────
206
600
 
207
601
  export async function handleCodeReview(
@@ -214,104 +608,65 @@ export async function handleCodeReview(
214
608
  getPersistence: (dir: string) => QualityPersistence,
215
609
  ): Promise<void> {
216
610
  if (activeReviews.has(dirPath)) {
217
- ctx.send(ws, {
218
- type: 'qualityError',
219
- data: { path: reportPath, error: 'A code review is already running for this directory.' },
220
- });
611
+ ctx.send(ws, { type: 'qualityError', data: { path: reportPath, error: 'A code review is already running for this directory.' } });
221
612
  return;
222
613
  }
223
614
 
224
615
  activeReviews.add(dirPath);
225
- try {
226
- ctx.send(ws, {
227
- type: 'qualityCodeReviewProgress',
228
- data: { path: reportPath, message: 'Starting AI code review...' },
229
- });
616
+ const send = makeProgressSender(ctx, ws, reportPath);
230
617
 
231
- // Load CLI findings from the existing report to pass to the AI reviewer
232
- let cliFindings: Array<{ severity: string; category: string; file: string; line: number | null; title: string; description: string }> | undefined;
233
- try {
234
- const persistence = getPersistence(workingDir);
235
- const existingReport = persistence.loadReport(reportPath);
236
- if (existingReport?.findings) {
237
- cliFindings = existingReport.findings;
238
- }
239
- } catch {
240
- // Continue without CLI findings if persistence fails
241
- }
618
+ try {
619
+ send('Starting AI code review...');
620
+ const cliFindings = loadCliFindings(getPersistence, workingDir, reportPath);
242
621
 
622
+ // ── Pass 1: Initial AI code review ──────────────────────
243
623
  const runner = new HeadlessRunner({
244
624
  workingDir: dirPath,
245
625
  directPrompt: buildCodeReviewPrompt(dirPath, cliFindings),
246
626
  stallWarningMs: 120_000,
247
627
  stallKillMs: 600_000,
248
628
  stallHardCapMs: 900_000,
249
- toolUseCallback: (() => {
250
- const getProgressMessage = createCodeReviewProgressTracker();
251
- return (event: ToolUseEvent) => {
252
- const message = getProgressMessage(event);
253
- if (message) {
254
- ctx.send(ws, {
255
- type: 'qualityCodeReviewProgress',
256
- data: { path: reportPath, message },
257
- });
258
- }
259
- };
260
- })(),
261
- });
262
-
263
- ctx.send(ws, {
264
- type: 'qualityCodeReviewProgress',
265
- data: { path: reportPath, message: 'Claude is analyzing your codebase...' },
629
+ toolUseCallback: makeToolCallback(send),
266
630
  });
267
631
 
632
+ send('Claude is analyzing your codebase...');
268
633
  const result = await runWithFileLogger('code-review', () => runner.run());
634
+ const reviewResult = parseCodeReviewResponse(result.assistantResponse || '');
269
635
 
270
- ctx.send(ws, {
271
- type: 'qualityCodeReviewProgress',
272
- data: { path: reportPath, message: 'Generating review report...' },
273
- });
636
+ // ── Phase 3: Deterministic post-validation ──────────────
637
+ send(`Validating ${reviewResult.findings.length} findings against codebase...`);
638
+ const validation = validateFindings(reviewResult.findings, dirPath);
639
+ if (validation.stats.failed > 0) {
640
+ send(`Filtered ${validation.stats.failed} finding(s) with invalid references`);
641
+ }
274
642
 
275
- const responseText = result.assistantResponse || '';
276
- const reviewResult = parseCodeReviewResponse(responseText);
643
+ // ── Phase 2: LLM verification pass ──────────────────────
644
+ let finalFindings = validation.validated;
645
+ if (finalFindings.length > 0) {
646
+ try {
647
+ finalFindings = await runVerificationPass(dirPath, finalFindings, send);
648
+ } catch {
649
+ send('Verification pass skipped (timeout or error)');
650
+ }
651
+ }
652
+
653
+ // ── Persist and send results ─────────────────────────────
654
+ send('Generating review report...');
655
+ const verifiedReviewResult: CodeReviewResult = { ...reviewResult, findings: finalFindings };
277
656
 
278
- // Use AI-determined score if available, otherwise fall back to recomputation
279
657
  let updatedResults: import('./quality-service.js').QualityResults | null = null;
280
658
  try {
281
- const persistence = getPersistence(workingDir);
282
- const existingReport = persistence.loadReport(reportPath);
283
- if (existingReport) {
284
- if (reviewResult.score !== null && reviewResult.grade !== null) {
285
- // Use the AI-determined score and grade directly
286
- updatedResults = {
287
- ...existingReport,
288
- overall: reviewResult.score,
289
- grade: reviewResult.grade,
290
- codeReview: reviewResult.findings as unknown as typeof existingReport.codeReview,
291
- scoreRationale: reviewResult.scoreRationale ?? undefined,
292
- };
293
- } else {
294
- // Fallback: recompute with weighted formula
295
- updatedResults = recomputeWithAiReview(existingReport, reviewResult.findings);
296
- updatedResults = { ...updatedResults, codeReview: reviewResult.findings as unknown as typeof updatedResults.codeReview };
297
- }
298
- persistence.saveReport(reportPath, updatedResults);
299
- persistence.appendHistory(updatedResults, reportPath);
300
- }
301
- persistence.saveCodeReview(reportPath, reviewResult.findings as unknown as Record<string, unknown>[], reviewResult.summary);
659
+ updatedResults = persistReviewResults(verifiedReviewResult, reportPath, getPersistence, workingDir);
302
660
  } catch {
303
661
  // Persistence failure should not break the review flow
304
662
  }
305
663
 
306
664
  ctx.send(ws, {
307
665
  type: 'qualityCodeReview',
308
- data: { path: reportPath, findings: reviewResult.findings, summary: reviewResult.summary, results: updatedResults },
666
+ data: { path: reportPath, findings: verifiedReviewResult.findings, summary: verifiedReviewResult.summary, results: updatedResults },
309
667
  });
310
668
  } catch (error) {
311
- ctx.send(ws, {
312
- type: 'qualityError',
313
- data: { path: reportPath, error: error instanceof Error ? error.message : String(error) },
314
- });
669
+ ctx.send(ws, { type: 'qualityError', data: { path: reportPath, error: error instanceof Error ? error.message : String(error) } });
315
670
  } finally {
316
671
  activeReviews.delete(dirPath);
317
672
  }
@@ -30,6 +30,9 @@ export interface QualityFinding {
30
30
  title: string;
31
31
  description: string;
32
32
  suggestion?: string;
33
+ evidence?: string;
34
+ verified?: boolean;
35
+ verificationNote?: string;
33
36
  }
34
37
 
35
38
  export interface QualityResults {