@oddessentials/odd-ai-reviewers 1.7.4 → 1.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. package/dist/agents/ai_semantic_review.d.ts.map +1 -1
  2. package/dist/agents/ai_semantic_review.js +7 -1
  3. package/dist/agents/ai_semantic_review.js.map +1 -1
  4. package/dist/agents/control_flow/safe-source-detector.d.ts +48 -0
  5. package/dist/agents/control_flow/safe-source-detector.d.ts.map +1 -0
  6. package/dist/agents/control_flow/safe-source-detector.js +434 -0
  7. package/dist/agents/control_flow/safe-source-detector.js.map +1 -0
  8. package/dist/agents/control_flow/safe-source-patterns.d.ts +61 -0
  9. package/dist/agents/control_flow/safe-source-patterns.d.ts.map +1 -0
  10. package/dist/agents/control_flow/safe-source-patterns.js +137 -0
  11. package/dist/agents/control_flow/safe-source-patterns.js.map +1 -0
  12. package/dist/agents/control_flow/scope-stack.d.ts +167 -0
  13. package/dist/agents/control_flow/scope-stack.d.ts.map +1 -0
  14. package/dist/agents/control_flow/scope-stack.js +448 -0
  15. package/dist/agents/control_flow/scope-stack.js.map +1 -0
  16. package/dist/agents/control_flow/vulnerability-detector.d.ts +13 -0
  17. package/dist/agents/control_flow/vulnerability-detector.d.ts.map +1 -1
  18. package/dist/agents/control_flow/vulnerability-detector.js +630 -35
  19. package/dist/agents/control_flow/vulnerability-detector.js.map +1 -1
  20. package/dist/agents/opencode.d.ts.map +1 -1
  21. package/dist/agents/opencode.js +7 -1
  22. package/dist/agents/opencode.js.map +1 -1
  23. package/dist/agents/pr_agent.d.ts.map +1 -1
  24. package/dist/agents/pr_agent.js +8 -2
  25. package/dist/agents/pr_agent.js.map +1 -1
  26. package/dist/agents/security.d.ts.map +1 -1
  27. package/dist/agents/security.js +1 -0
  28. package/dist/agents/security.js.map +1 -1
  29. package/dist/agents/types.d.ts +6 -0
  30. package/dist/agents/types.d.ts.map +1 -1
  31. package/dist/benchmark/adapter.d.ts +87 -0
  32. package/dist/benchmark/adapter.d.ts.map +1 -0
  33. package/dist/benchmark/adapter.js +298 -0
  34. package/dist/benchmark/adapter.js.map +1 -0
  35. package/dist/benchmark/scoring.d.ts +100 -0
  36. package/dist/benchmark/scoring.d.ts.map +1 -0
  37. package/dist/benchmark/scoring.js +195 -0
  38. package/dist/benchmark/scoring.js.map +1 -0
  39. package/dist/cli/dependencies/schemas.d.ts +3 -3
  40. package/dist/context-loader.d.ts +80 -0
  41. package/dist/context-loader.d.ts.map +1 -0
  42. package/dist/context-loader.js +202 -0
  43. package/dist/context-loader.js.map +1 -0
  44. package/dist/main.d.ts.map +1 -1
  45. package/dist/main.js +131 -4
  46. package/dist/main.js.map +1 -1
  47. package/dist/phases/index.d.ts +1 -1
  48. package/dist/phases/index.d.ts.map +1 -1
  49. package/dist/phases/index.js +1 -1
  50. package/dist/phases/index.js.map +1 -1
  51. package/dist/phases/report.d.ts +8 -1
  52. package/dist/phases/report.d.ts.map +1 -1
  53. package/dist/phases/report.js +52 -5
  54. package/dist/phases/report.js.map +1 -1
  55. package/dist/report/ado.d.ts +2 -0
  56. package/dist/report/ado.d.ts.map +1 -1
  57. package/dist/report/ado.js +9 -23
  58. package/dist/report/ado.js.map +1 -1
  59. package/dist/report/finding-validator.d.ts +130 -0
  60. package/dist/report/finding-validator.d.ts.map +1 -0
  61. package/dist/report/finding-validator.js +347 -0
  62. package/dist/report/finding-validator.js.map +1 -0
  63. package/dist/report/framework-pattern-filter.d.ts +53 -0
  64. package/dist/report/framework-pattern-filter.d.ts.map +1 -0
  65. package/dist/report/framework-pattern-filter.js +189 -0
  66. package/dist/report/framework-pattern-filter.js.map +1 -0
  67. package/dist/report/github.d.ts +2 -0
  68. package/dist/report/github.d.ts.map +1 -1
  69. package/dist/report/github.js +9 -23
  70. package/dist/report/github.js.map +1 -1
  71. package/dist/trust.d.ts +6 -0
  72. package/dist/trust.d.ts.map +1 -1
  73. package/dist/trust.js +2 -0
  74. package/dist/trust.js.map +1 -1
  75. package/package.json +5 -5
@@ -0,0 +1,298 @@
1
+ /**
2
+ * Benchmark Adapter
3
+ *
4
+ * Runs a single benchmark scenario through the control-flow analysis pipeline.
5
+ * This adapter specifically tests the DETERMINISTIC pipeline:
6
+ * - Safe-source detection (Pattern A)
7
+ * - Finding validation / self-contradiction filter (Pattern E)
8
+ * - Vulnerability detection (TP scenarios)
9
+ *
10
+ * LLM-dependent scenarios (Patterns B/C/D) require a different adapter with
11
+ * mock LLM responses. They are marked as .skip in the integration test.
12
+ *
13
+ * This adapter does NOT make network calls - it is purely local AST analysis.
14
+ */
15
+ import ts from 'typescript';
16
+ import { createHash } from 'node:crypto';
17
+ import { readFile, writeFile, mkdir } from 'node:fs/promises';
18
+ import { join } from 'node:path';
19
+ import { VulnerabilityDetector } from '../agents/control_flow/vulnerability-detector.js';
20
+ import { validateFindings } from '../report/finding-validator.js';
21
+ import { createLogger } from '../agents/control_flow/logger.js';
22
+ const DETERMINISTIC_PATTERNS = new Set(['A', 'E']);
23
/**
 * Parse a unified diff into per-file source text.
 *
 * The diff is split on "diff --git " headers. For each file block the path is
 * taken from the "b/" side of the header, and the post-change content is
 * rebuilt from the hunk bodies: added and context lines are kept (with their
 * one-character prefix removed) and removed lines are dropped.
 *
 * @param {string} diff - Unified diff text in `git diff` format.
 * @returns {{ path: string, content: string }[]} One entry per file that has
 *   at least one hunk content line; blocks without hunk content are omitted.
 */
export function parseDiffFiles(diff) {
    const files = [];
    const diffBlocks = diff.split(/^diff --git /m).filter(Boolean);
    for (const block of diffBlocks) {
        // Extract file path from the "a/path b/path" header line
        const headerMatch = block.match(/^a\/(.+?) b\/(.+)/m);
        if (!headerMatch?.[2])
            continue;
        const filePath = headerMatch[2];
        const contentLines = [];
        let inHunk = false;
        for (const line of block.split('\n')) {
            if (line.startsWith('@@')) {
                inHunk = true;
                continue;
            }
            // Everything before the first hunk header is file metadata
            if (!inHunk)
                continue;
            // Removed lines are not part of the post-change source
            if (line.startsWith('-'))
                continue;
            // "\ No newline at end of file" is diff metadata, not source text;
            // real content lines always carry a ' ', '+' or '-' prefix.
            if (line.startsWith('\\'))
                continue;
            if (line.startsWith('+')) {
                // Added line: strip the + prefix
                contentLines.push(line.slice(1));
            }
            else {
                // Context line: strip the single-space prefix when present
                contentLines.push(line.startsWith(' ') ? line.slice(1) : line);
            }
        }
        if (contentLines.length > 0) {
            files.push({ path: filePath, content: contentLines.join('\n') });
        }
    }
    return files;
}
65
+ // =============================================================================
66
+ // Simple Line Resolver for finding-validator
67
+ // =============================================================================
68
/**
 * Build a line resolver for the finding validator from a benchmark diff.
 *
 * A line is accepted when it falls inside one of the new-side hunk ranges
 * recorded for its file in the raw diff. Files that never appear in a
 * "+++ b/" header fall back to a bounds check against the parsed content's
 * line count; files known to neither source are rejected.
 *
 * @param {{ path: string, content: string }[]} diffFiles - Parsed diff files.
 * @param {string} rawDiff - The original unified diff text.
 * @returns {{ validateLine(file: string, line?: number): { valid: boolean } }}
 */
function createBenchmarkLineResolver(diffFiles, rawDiff) {
    // file → number of lines in the reconstructed content
    const lineCountByFile = new Map(diffFiles.map(({ path, content }) => [path, content.split('\n').length]));
    // file → list of { start, count } taken from the "+start,count" side of hunk headers
    const hunkRangesByFile = new Map();
    let activeFile = '';
    for (const rawLine of rawDiff.split('\n')) {
        const targetPath = rawLine.match(/^\+\+\+ b\/(.+)/)?.[1];
        if (targetPath) {
            activeFile = targetPath;
            if (!hunkRangesByFile.has(activeFile)) {
                hunkRangesByFile.set(activeFile, []);
            }
            continue;
        }
        if (!activeFile)
            continue;
        const hunk = rawLine.match(/^@@ -\d+(?:,\d+)? \+(\d+)(?:,(\d+))? @@/);
        if (!hunk?.[1])
            continue;
        const [, startText, countText] = hunk;
        hunkRangesByFile.get(activeFile)?.push({
            start: parseInt(startText, 10),
            // An omitted count in a hunk header means a single line
            count: countText === undefined ? 1 : parseInt(countText, 10),
        });
    }
    return {
        validateLine(file, line) {
            // Findings without a line number have nothing to validate
            if (line === undefined)
                return { valid: true };
            const ranges = hunkRangesByFile.get(file);
            if (ranges) {
                const insideHunk = ranges.some(({ start, count }) => line >= start && line < start + count);
                return { valid: insideHunk };
            }
            const maxLines = lineCountByFile.get(file);
            if (maxLines === undefined)
                return { valid: false };
            return { valid: line >= 1 && line <= maxLines };
        },
    };
}
121
+ // =============================================================================
122
+ // Scenario Runner
123
+ // =============================================================================
124
const DEFAULT_TIMEOUT_MS = 30_000;
/**
 * Explain why a scenario cannot be run by the deterministic adapter.
 *
 * @param {{ pattern: string }} scenario - Scenario whose pattern is inspected.
 * @returns {string | null} A human-readable reason for patterns B, C, D, F
 *   and any pattern outside DETERMINISTIC_PATTERNS; null when supported.
 */
export function getUnsupportedScenarioReason(scenario) {
    switch (scenario.pattern) {
        case 'B':
            return 'framework-convention fixtures require mocked LLM behavior';
        case 'C':
            return 'project-context fixtures require project rules to be consumed by an LLM-backed adapter';
        case 'D':
            return 'PR-description fixtures require PR description context to be consumed by an LLM-backed adapter';
        case 'F':
            return 'mixed fixtures include LLM-dependent behavior that this deterministic adapter does not exercise';
        default:
            break;
    }
    // Anything else must be explicitly listed as deterministic to be runnable
    return DETERMINISTIC_PATTERNS.has(scenario.pattern)
        ? null
        : `pattern ${scenario.pattern} is not supported by the deterministic benchmark adapter`;
}
143
/**
 * Execute one benchmark scenario against the deterministic analysis pipeline.
 *
 * Steps:
 *   1. Reject scenarios the deterministic adapter does not support.
 *   2. Rebuild each file from the scenario diff and parse it with
 *      ts.createSourceFile (script kind chosen from the file extension).
 *   3. Run VulnerabilityDetector.detectInFile and convert every result into a
 *      Finding with severity 'warning' and a `cfa/<type>` rule id.
 *   4. For Pattern E, pass the fixture's synthetic findings (or the detected
 *      ones when none are provided) through validateFindings and return only
 *      the findings it reports as valid.
 *   5. Race the analysis against a timer; on timeout an empty list is returned.
 *
 * @param scenario - Benchmark scenario (id, pattern, diff, optional syntheticFindings).
 * @param {number} [timeout=DEFAULT_TIMEOUT_MS] - Timeout in milliseconds.
 * @returns {Promise<object[]>} Findings produced for the scenario.
 * @throws {Error} When the scenario pattern is unsupported.
 */
export async function runScenario(scenario, timeout = DEFAULT_TIMEOUT_MS) {
    const unsupportedReason = getUnsupportedScenarioReason(scenario);
    if (unsupportedReason) {
        throw new Error(`Scenario ${scenario.id} is unsupported by the deterministic benchmark adapter: ${unsupportedReason}`);
    }
    // Pick the parser mode from the file extension; unknown extensions parse as TS
    const scriptKindFor = (filePath) => {
        switch (filePath.split('.').pop()?.toLowerCase()) {
            case 'tsx':
                return ts.ScriptKind.TSX;
            case 'jsx':
                return ts.ScriptKind.JSX;
            case 'js':
                return ts.ScriptKind.JS;
            default:
                return ts.ScriptKind.TS;
        }
    };
    // NOTE(review): this function contains no await points, so once started it
    // runs to completion before the timer callback below can execute — confirm
    // whether the timeout is expected to interrupt long-running analysis.
    const runAnalysis = async () => {
        const diffFiles = parseDiffFiles(scenario.diff);
        if (diffFiles.length === 0) {
            return [];
        }
        const logger = createLogger({ minLevel: 'warn', consoleOutput: false });
        const detector = new VulnerabilityDetector(logger);
        const detected = [];
        for (const { path, content } of diffFiles) {
            const kind = scriptKindFor(path);
            const sourceFile = ts.createSourceFile(path, content, ts.ScriptTarget.ES2022, true, kind);
            for (const vuln of detector.detectInFile(sourceFile, path)) {
                const { file, line, endLine } = vuln.sinkLocation;
                detected.push({
                    severity: 'warning',
                    file,
                    line,
                    endLine,
                    message: vuln.description,
                    ruleId: `cfa/${vuln.type}`,
                    sourceAgent: 'control_flow',
                });
            }
        }
        if (scenario.pattern !== 'E') {
            return detected;
        }
        // Pattern E exercises the finding validator. The detector does not emit
        // self-contradicting or stale-line findings, so fixtures inject them.
        const candidates = scenario.syntheticFindings ?? detected;
        const lineResolver = createBenchmarkLineResolver(diffFiles, scenario.diff);
        const changedPaths = diffFiles.map((df) => df.path);
        return validateFindings(candidates, lineResolver, changedPaths).validFindings;
    };
    // The timer handle is kept so it can be cleared once the race settles
    let timer;
    const timeoutPromise = new Promise((resolve) => {
        timer = setTimeout(() => {
            console.log(`[benchmark] scenario ${scenario.id} timed out after ${timeout}ms`);
            resolve([]);
        }, timeout);
    });
    try {
        return await Promise.race([runAnalysis(), timeoutPromise]);
    }
    finally {
        if (timer)
            clearTimeout(timer);
    }
}
220
+ // =============================================================================
221
+ // Snapshot Adapter (FR-020, FR-021)
222
+ // =============================================================================
223
/** Format version stamped into snapshot metadata for compatibility checks. */
export const SNAPSHOT_ADAPTER_VERSION = '1.0.0';
/**
 * Hash a string with SHA-256.
 *
 * @param {string} content - Text to hash.
 * @returns {string} Hex-encoded digest.
 */
export function sha256(content) {
    const hasher = createHash('sha256');
    hasher.update(content);
    return hasher.digest('hex');
}
229
/** Snapshot file name for a given scenario ID. */
function snapshotFileName(scenarioId) {
    return `${scenarioId}.snapshot.json`;
}
/**
 * Load a recorded snapshot for a given scenario ID.
 *
 * Only a missing file is treated as "no snapshot". Any other failure (for
 * example a permission error or a file that is not valid JSON) is reported as
 * an error so that it is not mistaken for an absent snapshot.
 *
 * @param {string} scenarioId - Scenario whose snapshot should be loaded.
 * @param {string} snapshotDir - Directory containing snapshot files.
 * @returns {Promise<object | undefined>} The parsed snapshot, or undefined
 *   when no snapshot file exists.
 * @throws {Error} When the file exists but cannot be read or parsed; the
 *   underlying error is attached as `cause`.
 */
export async function loadSnapshot(scenarioId, snapshotDir) {
    const filePath = join(snapshotDir, snapshotFileName(scenarioId));
    let content;
    try {
        content = await readFile(filePath, 'utf-8');
    }
    catch (err) {
        // A missing file is the expected "not recorded yet" case
        if (err?.code === 'ENOENT')
            return undefined;
        throw new Error(`Failed to read snapshot file ${filePath}`, { cause: err });
    }
    try {
        return JSON.parse(content);
    }
    catch (err) {
        throw new Error(`Snapshot file ${filePath} is not valid JSON`, { cause: err });
    }
}
247
/**
 * Compare a snapshot's recorded hashes with the current ones.
 *
 * @param snapshot - Snapshot whose `metadata.promptTemplateHash` and
 *   `metadata.fixtureHash` are checked.
 * @param {string} currentPromptHash - Hash of the current prompt template.
 * @param {string} currentFixtureHash - Hash of the current fixture.
 * @returns {{ valid: boolean, drifted: { field: string, expected: string, actual: string }[] }}
 *   `valid` is true when nothing drifted; each drift entry carries the
 *   current value as `expected` and the snapshot value as `actual`.
 */
export function validateSnapshotMetadata(snapshot, currentPromptHash, currentFixtureHash) {
    // Checked in this order so drift entries are reported prompt-first
    const expectations = [
        ['promptTemplateHash', currentPromptHash],
        ['fixtureHash', currentFixtureHash],
    ];
    const drifted = expectations
        .filter(([field, expected]) => snapshot.metadata[field] !== expected)
        .map(([field, expected]) => ({
            field,
            expected,
            actual: snapshot.metadata[field],
        }));
    return { valid: drifted.length === 0, drifted };
}
269
/**
 * Replay a benchmark scenario from its recorded snapshot.
 *
 * @param {string} scenarioId - Scenario to replay.
 * @param {string} snapshotDir - Directory containing snapshot files.
 * @param {string} currentPromptHash - Hash of the current prompt template.
 * @param {string} currentFixtureHash - Hash of the current fixture.
 * @returns {Promise<object[]>} The findings stored in the snapshot response.
 * @throws {Error} When no snapshot is available, or when the snapshot
 *   metadata no longer matches the supplied hashes.
 */
export async function runWithSnapshot(scenarioId, snapshotDir, currentPromptHash, currentFixtureHash) {
    const snapshot = await loadSnapshot(scenarioId, snapshotDir);
    if (!snapshot) {
        throw new Error(`No snapshot found for scenario "${scenarioId}" in ${snapshotDir}. ` +
            `Run with --record to capture a snapshot.`);
    }
    const { valid, drifted } = validateSnapshotMetadata(snapshot, currentPromptHash, currentFixtureHash);
    if (valid) {
        return snapshot.response.findings;
    }
    // One indented line per drifted field: snapshot value vs. current value
    const driftLines = drifted.map((d) => `  ${d.field}: snapshot="${d.actual}" current="${d.expected}"`);
    const details = driftLines.join('\n');
    throw new Error(`Snapshot drift detected for scenario "${scenarioId}":\n${details}\n` +
        `Re-record with --record to update.`);
}
289
/**
 * Persist a response and its metadata as the snapshot for a scenario.
 *
 * Creates the snapshot directory if needed and writes the snapshot as
 * two-space-indented JSON.
 *
 * @param {string} scenarioId - Scenario the snapshot belongs to.
 * @param response - Response payload to store.
 * @param metadata - Metadata to store alongside the response.
 * @param {string} snapshotDir - Directory to write the snapshot file into.
 * @returns {Promise<void>}
 */
export async function recordSnapshot(scenarioId, response, metadata, snapshotDir) {
    await mkdir(snapshotDir, { recursive: true });
    const targetPath = join(snapshotDir, snapshotFileName(scenarioId));
    const serialized = JSON.stringify({ metadata, response }, null, 2);
    await writeFile(targetPath, serialized, 'utf-8');
}
298
+ //# sourceMappingURL=adapter.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"adapter.js","sourceRoot":"","sources":["../../src/benchmark/adapter.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;GAaG;AAEH,OAAO,EAAE,MAAM,YAAY,CAAC;AAC5B,OAAO,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AACzC,OAAO,EAAE,QAAQ,EAAE,SAAS,EAAE,KAAK,EAAE,MAAM,kBAAkB,CAAC;AAC9D,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACjC,OAAO,EAAE,qBAAqB,EAAE,MAAM,kDAAkD,CAAC;AACzF,OAAO,EAAE,gBAAgB,EAAE,MAAM,gCAAgC,CAAC;AAClE,OAAO,EAAE,YAAY,EAAE,MAAM,kCAAkC,CAAC;AAIhE,MAAM,sBAAsB,GAAG,IAAI,GAAG,CAA+B,CAAC,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC;AAWjF;;;GAGG;AACH,MAAM,UAAU,cAAc,CAAC,IAAY;IACzC,MAAM,KAAK,GAAoB,EAAE,CAAC;IAClC,MAAM,UAAU,GAAG,IAAI,CAAC,KAAK,CAAC,eAAe,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;IAE/D,KAAK,MAAM,KAAK,IAAI,UAAU,EAAE,CAAC;QAC/B,8CAA8C;QAC9C,MAAM,WAAW,GAAG,KAAK,CAAC,KAAK,CAAC,oBAAoB,CAAC,CAAC;QACtD,IAAI,CAAC,WAAW,EAAE,CAAC,CAAC,CAAC;YAAE,SAAS;QAEhC,MAAM,QAAQ,GAAG,WAAW,CAAC,CAAC,CAAC,CAAC;QAChC,MAAM,KAAK,GAAG,KAAK,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;QAEhC,kDAAkD;QAClD,MAAM,YAAY,GAAa,EAAE,CAAC;QAClC,IAAI,MAAM,GAAG,KAAK,CAAC;QAEnB,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;YACzB,IAAI,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,EAAE,CAAC;gBAC1B,MAAM,GAAG,IAAI,CAAC;gBACd,SAAS;YACX,CAAC;YACD,IAAI,CAAC,MAAM;gBAAE,SAAS;YAEtB,qBAAqB;YACrB,IAAI,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC;gBAAE,SAAS;YAEnC,kCAAkC;YAClC,IAAI,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE,CAAC;gBACzB,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;YACnC,CAAC;iBAAM,CAAC;gBACN,4CAA4C;gBAC5C,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;YACjE,CAAC;QACH,CAAC;QAED,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC5B,KAAK,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACnE,CAAC;IACH,CAAC;IAED,OAAO,KAAK,CAAC;AACf,CAAC;AAED,gFAAgF;AAChF,6CAA6C;AAC7C,gFAAgF;AAEhF;;;GAGG;AACH,SAAS,2BAA2B,CAClC,SAA0B,EAC1B,OAAe;IAEf,2DAA2D;IAC3D,MAAM,cAAc,GAAG,IAAI,GAAG,EAAkB,CAAC;IACjD,KAAK,MAAM,EAAE,IAAI,SAAS,EAAE,CAAC;QAC3B,cAAc,CAAC,GAAG,CAAC,EAAE,CAAC,IA
AI,EAAE,EAAE,CAAC,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,MAAM,CAAC,CAAC;IAC7D,CAAC;IAED,kEAAkE;IAClE,MAAM,cAAc,GAAG,IAAI,GAAG,EAA8C,CAAC;IAC7E,IAAI,WAAW,GAAG,EAAE,CAAC;IACrB,KAAK,MAAM,IAAI,IAAI,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC;QACvC,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,iBAAiB,CAAC,CAAC;QAChD,IAAI,SAAS,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;YACnB,WAAW,GAAG,SAAS,CAAC,CAAC,CAAC,CAAC;YAC3B,IAAI,CAAC,cAAc,CAAC,GAAG,CAAC,WAAW,CAAC,EAAE,CAAC;gBACrC,cAAc,CAAC,GAAG,CAAC,WAAW,EAAE,EAAE,CAAC,CAAC;YACtC,CAAC;YACD,SAAS;QACX,CAAC;QACD,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,yCAAyC,CAAC,CAAC;QACxE,IAAI,SAAS,EAAE,CAAC,CAAC,CAAC,IAAI,WAAW,EAAE,CAAC;YAClC,MAAM,KAAK,GAAG,QAAQ,CAAC,SAAS,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;YACzC,MAAM,KAAK,GAAG,SAAS,CAAC,CAAC,CAAC,KAAK,SAAS,CAAC,CAAC,CAAC,QAAQ,CAAC,SAAS,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;YAC1E,cAAc,CAAC,GAAG,CAAC,WAAW,CAAC,EAAE,IAAI,CAAC,EAAE,KAAK,EAAE,KAAK,EAAE,CAAC,CAAC;QAC1D,CAAC;IACH,CAAC;IAED,OAAO;QACL,YAAY,CAAC,IAAY,EAAE,IAAwB;YACjD,IAAI,IAAI,KAAK,SAAS;gBAAE,OAAO,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC;YAE/C,4BAA4B;YAC5B,MAAM,MAAM,GAAG,cAAc,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;YACxC,IAAI,MAAM,EAAE,CAAC;gBACX,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;oBAC3B,IAAI,IAAI,IAAI,KAAK,CAAC,KAAK,IAAI,IAAI,GAAG,KAAK,CAAC,KAAK,GAAG,KAAK,CAAC,KAAK,EAAE,CAAC;wBAC5D,OAAO,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC;oBACzB,CAAC;gBACH,CAAC;gBACD,6BAA6B;gBAC7B,OAAO,EAAE,KAAK,EAAE,KAAK,EAAE,CAAC;YAC1B,CAAC;YAED,0CAA0C;YAC1C,MAAM,QAAQ,GAAG,cAAc,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;YAC1C,IAAI,QAAQ,KAAK,SAAS,EAAE,CAAC;gBAC3B,OAAO,EAAE,KAAK,EAAE,IAAI,IAAI,CAAC,IAAI,IAAI,IAAI,QAAQ,EAAE,CAAC;YAClD,CAAC;YAED,OAAO,EAAE,KAAK,EAAE,KAAK,EAAE,CAAC;QAC1B,CAAC;KACF,CAAC;AACJ,CAAC;AAED,gFAAgF;AAChF,kBAAkB;AAClB,gFAAgF;AAEhF,MAAM,kBAAkB,GAAG,MAAM,CAAC;AAElC,MAAM,UAAU,4BAA4B,CAAC,QAA2B;IACtE,IAAI,QAAQ,CAAC,OAAO,KAAK,GAAG,EAAE,CAAC;QAC7B,OAAO,2DAA2D,CAAC;IACrE,CAAC;IACD,IAAI,QAAQ,CAAC,OAAO,KAAK,GAAG,EAAE,CAAC;QAC7B,OAAO,wFAAwF,CAAC;IAClG,CAAC;IACD,IAAI,QAAQ,CAAC,OAAO,KAAK,GAAG,EAAE,CAAC;QAC7B,OAAO,gGAAgG,CA
AC;IAC1G,CAAC;IACD,IAAI,QAAQ,CAAC,OAAO,KAAK,GAAG,EAAE,CAAC;QAC7B,OAAO,iGAAiG,CAAC;IAC3G,CAAC;IACD,IAAI,CAAC,sBAAsB,CAAC,GAAG,CAAC,QAAQ,CAAC,OAAO,CAAC,EAAE,CAAC;QAClD,OAAO,WAAW,QAAQ,CAAC,OAAO,0DAA0D,CAAC;IAC/F,CAAC;IACD,OAAO,IAAI,CAAC;AACd,CAAC;AAED;;;;;;;;;GASG;AACH,MAAM,CAAC,KAAK,UAAU,WAAW,CAC/B,QAA2B,EAC3B,OAAO,GAAG,kBAAkB;IAE5B,MAAM,iBAAiB,GAAG,4BAA4B,CAAC,QAAQ,CAAC,CAAC;IACjE,IAAI,iBAAiB,EAAE,CAAC;QACtB,MAAM,IAAI,KAAK,CACb,YAAY,QAAQ,CAAC,EAAE,2DAA2D,iBAAiB,EAAE,CACtG,CAAC;IACJ,CAAC;IAED,MAAM,WAAW,GAAG,KAAK,IAAwB,EAAE;QACjD,MAAM,SAAS,GAAG,cAAc,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;QAEhD,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC3B,OAAO,EAAE,CAAC;QACZ,CAAC;QAED,MAAM,MAAM,GAAG,YAAY,CAAC,EAAE,QAAQ,EAAE,MAAM,EAAE,aAAa,EAAE,KAAK,EAAE,CAAC,CAAC;QACxE,MAAM,QAAQ,GAAG,IAAI,qBAAqB,CAAC,MAAM,CAAC,CAAC;QACnD,MAAM,WAAW,GAAc,EAAE,CAAC;QAElC,KAAK,MAAM,EAAE,IAAI,EAAE,OAAO,EAAE,IAAI,SAAS,EAAE,CAAC;YAC1C,4CAA4C;YAC5C,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE,WAAW,EAAE,CAAC;YACjD,MAAM,UAAU,GACd,GAAG,KAAK,KAAK;gBACX,CAAC,CAAC,EAAE,CAAC,UAAU,CAAC,GAAG;gBACnB,CAAC,CAAC,GAAG,KAAK,KAAK;oBACb,CAAC,CAAC,EAAE,CAAC,UAAU,CAAC,GAAG;oBACnB,CAAC,CAAC,GAAG,KAAK,IAAI;wBACZ,CAAC,CAAC,EAAE,CAAC,UAAU,CAAC,EAAE;wBAClB,CAAC,CAAC,EAAE,CAAC,UAAU,CAAC,EAAE,CAAC;YAE3B,MAAM,UAAU,GAAG,EAAE,CAAC,gBAAgB,CACpC,IAAI,EACJ,OAAO,EACP,EAAE,CAAC,YAAY,CAAC,MAAM,EACtB,IAAI,EACJ,UAAU,CACX,CAAC;YAEF,yBAAyB;YACzB,MAAM,eAAe,GAAG,QAAQ,CAAC,YAAY,CAAC,UAAU,EAAE,IAAI,CAAC,CAAC;YAEhE,4CAA4C;YAC5C,KAAK,MAAM,IAAI,IAAI,eAAe,EAAE,CAAC;gBACnC,WAAW,CAAC,IAAI,CAAC;oBACf,QAAQ,EAAE,SAAS;oBACnB,IAAI,EAAE,IAAI,CAAC,YAAY,CAAC,IAAI;oBAC5B,IAAI,EAAE,IAAI,CAAC,YAAY,CAAC,IAAI;oBAC5B,OAAO,EAAE,IAAI,CAAC,YAAY,CAAC,OAAO;oBAClC,OAAO,EAAE,IAAI,CAAC,WAAW;oBACzB,MAAM,EAAE,OAAO,IAAI,CAAC,IAAI,EAAE;oBAC1B,WAAW,EAAE,cAAc;iBAC5B,CAAC,CAAC;YACL,CAAC;QACH,CAAC;QAED,8EAA8E;QAC9E,2EAA2E;QAC3E,8EAA8E;QAC9E,IAAI,QAAQ,CAAC,OAAO,KAAK,GAAG,EAAE,CAAC;YAC7B,MAAM,aAAa,GAAG,QAAQ,CAAC,iBAAiB,IAAI,WAAW,CAAC;YAChE,MAAM,YAAY,GAAG,2BAA2B,CAAC,SAAS,EAAE,QA
AQ,CAAC,IAAI,CAAC,CAAC;YAC3E,MAAM,aAAa,GAAG,SAAS,CAAC,GAAG,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE,CAAC,IAAI,CAAC,CAAC;YACrD,MAAM,OAAO,GAAG,gBAAgB,CAAC,aAAa,EAAE,YAAY,EAAE,aAAa,CAAC,CAAC;YAC7E,OAAO,OAAO,CAAC,aAAa,CAAC;QAC/B,CAAC;QAED,OAAO,WAAW,CAAC;IACrB,CAAC,CAAC;IAEF,sDAAsD;IACtD,IAAI,KAAgD,CAAC;IACrD,MAAM,cAAc,GAAG,IAAI,OAAO,CAAY,CAAC,OAAO,EAAE,EAAE;QACxD,KAAK,GAAG,UAAU,CAAC,GAAG,EAAE;YACtB,OAAO,CAAC,GAAG,CAAC,wBAAwB,QAAQ,CAAC,EAAE,oBAAoB,OAAO,IAAI,CAAC,CAAC;YAChF,OAAO,CAAC,EAAE,CAAC,CAAC;QACd,CAAC,EAAE,OAAO,CAAC,CAAC;IACd,CAAC,CAAC,CAAC;IAEH,IAAI,CAAC;QACH,OAAO,MAAM,OAAO,CAAC,IAAI,CAAC,CAAC,WAAW,EAAE,EAAE,cAAc,CAAC,CAAC,CAAC;IAC7D,CAAC;YAAS,CAAC;QACT,IAAI,KAAK;YAAE,YAAY,CAAC,KAAK,CAAC,CAAC;IACjC,CAAC;AACH,CAAC;AAED,gFAAgF;AAChF,oCAAoC;AACpC,gFAAgF;AAEhF,iFAAiF;AACjF,MAAM,CAAC,MAAM,wBAAwB,GAAG,OAAO,CAAC;AAgChD,yDAAyD;AACzD,MAAM,UAAU,MAAM,CAAC,OAAe;IACpC,OAAO,UAAU,CAAC,QAAQ,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;AAC5D,CAAC;AAED,kDAAkD;AAClD,SAAS,gBAAgB,CAAC,UAAkB;IAC1C,OAAO,GAAG,UAAU,gBAAgB,CAAC;AACvC,CAAC;AAED;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,YAAY,CAChC,UAAkB,EAClB,WAAmB;IAEnB,MAAM,QAAQ,GAAG,IAAI,CAAC,WAAW,EAAE,gBAAgB,CAAC,UAAU,CAAC,CAAC,CAAC;IACjE,IAAI,CAAC;QACH,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;QAClD,OAAO,IAAI,CAAC,KAAK,CAAC,OAAO,CAAqB,CAAC;IACjD,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,SAAS,CAAC;IACnB,CAAC;AACH,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,wBAAwB,CACtC,QAA0B,EAC1B,iBAAyB,EACzB,kBAA0B;IAE1B,MAAM,OAAO,GAAiB,EAAE,CAAC;IAEjC,IAAI,QAAQ,CAAC,QAAQ,CAAC,kBAAkB,KAAK,iBAAiB,EAAE,CAAC;QAC/D,OAAO,CAAC,IAAI,CAAC;YACX,KAAK,EAAE,oBAAoB;YAC3B,QAAQ,EAAE,iBAAiB;YAC3B,MAAM,EAAE,QAAQ,CAAC,QAAQ,CAAC,kBAAkB;SAC7C,CAAC,CAAC;IACL,CAAC;IAED,IAAI,QAAQ,CAAC,QAAQ,CAAC,WAAW,KAAK,kBAAkB,EAAE,CAAC;QACzD,OAAO,CAAC,IAAI,CAAC;YACX,KAAK,EAAE,aAAa;YACpB,QAAQ,EAAE,kBAAkB;YAC5B,MAAM,EAAE,QAAQ,CAAC,QAAQ,CAAC,WAAW;SACtC,CAAC,CAAC;IACL,CAAC;IAED,OAAO,EAAE,KAAK,EAAE,OAAO,CAAC,MAAM,KAAK,CAAC,EAAE,OAAO,EAAE,CAAC;AAClD,CAAC;AAED;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,eAAe,CACnC,UAAkB
,EAClB,WAAmB,EACnB,iBAAyB,EACzB,kBAA0B;IAE1B,MAAM,QAAQ,GAAG,MAAM,YAAY,CAAC,UAAU,EAAE,WAAW,CAAC,CAAC;IAC7D,IAAI,CAAC,QAAQ,EAAE,CAAC;QACd,MAAM,IAAI,KAAK,CACb,mCAAmC,UAAU,QAAQ,WAAW,IAAI;YAClE,0CAA0C,CAC7C,CAAC;IACJ,CAAC;IAED,MAAM,UAAU,GAAG,wBAAwB,CAAC,QAAQ,EAAE,iBAAiB,EAAE,kBAAkB,CAAC,CAAC;IAC7F,IAAI,CAAC,UAAU,CAAC,KAAK,EAAE,CAAC;QACtB,MAAM,OAAO,GAAG,UAAU,CAAC,OAAO;aAC/B,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,KAAK,CAAC,CAAC,KAAK,eAAe,CAAC,CAAC,MAAM,cAAc,CAAC,CAAC,QAAQ,GAAG,CAAC;aAC1E,IAAI,CAAC,IAAI,CAAC,CAAC;QACd,MAAM,IAAI,KAAK,CACb,yCAAyC,UAAU,OAAO,OAAO,IAAI;YACnE,oCAAoC,CACvC,CAAC;IACJ,CAAC;IAED,OAAO,QAAQ,CAAC,QAAQ,CAAC,QAAQ,CAAC;AACpC,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,cAAc,CAClC,UAAkB,EAClB,QAA0B,EAC1B,QAA0B,EAC1B,WAAmB;IAEnB,MAAM,KAAK,CAAC,WAAW,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAC9C,MAAM,QAAQ,GAAqB,EAAE,QAAQ,EAAE,QAAQ,EAAE,CAAC;IAC1D,MAAM,QAAQ,GAAG,IAAI,CAAC,WAAW,EAAE,gBAAgB,CAAC,UAAU,CAAC,CAAC,CAAC;IACjE,MAAM,SAAS,CAAC,QAAQ,EAAE,IAAI,CAAC,SAAS,CAAC,QAAQ,EAAE,IAAI,EAAE,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC;AACxE,CAAC"}
@@ -0,0 +1,100 @@
1
+ /**
2
+ * Benchmark Scoring Module
3
+ *
4
+ * Types and scoring functions for the false-positive regression benchmark.
5
+ * Implements dual-pool scoring (FP suppression rate + TP recall/precision)
6
+ * per benchmark-scenario.md contract.
7
+ */
8
+ import type { Finding } from '../agents/types.js';
9
+ export interface BenchmarkScenario {
10
+ id: string;
11
+ category: string;
12
+ pattern: 'A' | 'B' | 'C' | 'D' | 'E' | 'F';
13
+ description: string;
14
+ sourceIssue: string;
15
+ diff: string;
16
+ config?: Record<string, unknown>;
17
+ prDescription?: string;
18
+ projectRules?: string;
19
+ expectedFindings: ExpectedFinding[];
20
+ truePositive: boolean;
21
+ subcategory?: string;
22
+ source?: string;
23
+ /** Synthetic findings injected before validation (Pattern E scenarios). */
24
+ syntheticFindings?: Finding[];
25
+ }
26
+ export interface ExpectedFinding {
27
+ file: string;
28
+ line?: number;
29
+ severityAtLeast?: string;
30
+ messageContains?: string;
31
+ ruleId?: string;
32
+ }
33
+ export interface FPRegressionPool {
34
+ total: number;
35
+ trueNegatives: number;
36
+ falsePositives: number;
37
+ suppressionRate: number;
38
+ fpRate: number;
39
+ }
40
+ export interface TPPreservationPool {
41
+ total: number;
42
+ truePositives: number;
43
+ falseNegatives: number;
44
+ extraneous: number;
45
+ recall: number;
46
+ precision: number;
47
+ }
48
+ export interface ScenarioResult {
49
+ id: string;
50
+ passed: boolean;
51
+ category: string;
52
+ pattern: string;
53
+ truePositive: boolean;
54
+ actualFindings: Finding[];
55
+ expectedFindings: ExpectedFinding[];
56
+ matchedCount: number;
57
+ unmatchedExpected: ExpectedFinding[];
58
+ extraneousFindings: Finding[];
59
+ timedOut: boolean;
60
+ }
61
+ export interface BenchmarkReport {
62
+ schemaVersion: string;
63
+ timestamp: string;
64
+ totalScenarios: number;
65
+ pool1: FPRegressionPool;
66
+ pool2: TPPreservationPool;
67
+ byCategory: Record<string, {
68
+ total: number;
69
+ passed: number;
70
+ failed: number;
71
+ }>;
72
+ scenarios: ScenarioResult[];
73
+ }
74
+ /**
75
+ * Check if an actual finding matches an expected finding.
76
+ * File match is required; severity, message, and ruleId are optional constraints.
77
+ */
78
+ export declare function matchFinding(expected: ExpectedFinding, actual: Finding): boolean;
79
+ /**
80
+ * 1:1 strict matching of expected findings against actual findings.
81
+ * Sort expected by specificity (most fields first), then consume matched actuals.
82
+ */
83
+ export declare function matchFindings(expected: ExpectedFinding[], actual: Finding[]): {
84
+ matched: number;
85
+ unmatchedExpected: ExpectedFinding[];
86
+ extraneous: Finding[];
87
+ };
88
+ /**
89
+ * Score a single benchmark scenario.
90
+ *
91
+ * - FP scenario (truePositive: false): passed = actualFindings.length === 0
92
+ * - TP scenario (truePositive: true): passed = all expectedFindings matched
93
+ */
94
+ export declare function scoreScenario(scenario: BenchmarkScenario, actualFindings: Finding[], timedOut?: boolean): ScenarioResult;
95
+ /**
96
+ * Compute the aggregate benchmark report from individual scenario results.
97
+ * Pool 1 (FP) and Pool 2 (TP) are scored independently.
98
+ */
99
+ export declare function computeReport(results: ScenarioResult[]): BenchmarkReport;
100
+ //# sourceMappingURL=scoring.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"scoring.d.ts","sourceRoot":"","sources":["../../src/benchmark/scoring.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAEH,OAAO,KAAK,EAAE,OAAO,EAAE,MAAM,oBAAoB,CAAC;AAoBlD,MAAM,WAAW,iBAAiB;IAChC,EAAE,EAAE,MAAM,CAAC;IACX,QAAQ,EAAE,MAAM,CAAC;IACjB,OAAO,EAAE,GAAG,GAAG,GAAG,GAAG,GAAG,GAAG,GAAG,GAAG,GAAG,GAAG,GAAG,CAAC;IAC3C,WAAW,EAAE,MAAM,CAAC;IACpB,WAAW,EAAE,MAAM,CAAC;IACpB,IAAI,EAAE,MAAM,CAAC;IACb,MAAM,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IACjC,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,gBAAgB,EAAE,eAAe,EAAE,CAAC;IACpC,YAAY,EAAE,OAAO,CAAC;IACtB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,2EAA2E;IAC3E,iBAAiB,CAAC,EAAE,OAAO,EAAE,CAAC;CAC/B;AAED,MAAM,WAAW,eAAe;IAC9B,IAAI,EAAE,MAAM,CAAC;IACb,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,MAAM,CAAC,EAAE,MAAM,CAAC;CACjB;AAED,MAAM,WAAW,gBAAgB;IAC/B,KAAK,EAAE,MAAM,CAAC;IACd,aAAa,EAAE,MAAM,CAAC;IACtB,cAAc,EAAE,MAAM,CAAC;IACvB,eAAe,EAAE,MAAM,CAAC;IACxB,MAAM,EAAE,MAAM,CAAC;CAChB;AAED,MAAM,WAAW,kBAAkB;IACjC,KAAK,EAAE,MAAM,CAAC;IACd,aAAa,EAAE,MAAM,CAAC;IACtB,cAAc,EAAE,MAAM,CAAC;IACvB,UAAU,EAAE,MAAM,CAAC;IACnB,MAAM,EAAE,MAAM,CAAC;IACf,SAAS,EAAE,MAAM,CAAC;CACnB;AAED,MAAM,WAAW,cAAc;IAC7B,EAAE,EAAE,MAAM,CAAC;IACX,MAAM,EAAE,OAAO,CAAC;IAChB,QAAQ,EAAE,MAAM,CAAC;IACjB,OAAO,EAAE,MAAM,CAAC;IAChB,YAAY,EAAE,OAAO,CAAC;IACtB,cAAc,EAAE,OAAO,EAAE,CAAC;IAC1B,gBAAgB,EAAE,eAAe,EAAE,CAAC;IACpC,YAAY,EAAE,MAAM,CAAC;IACrB,iBAAiB,EAAE,eAAe,EAAE,CAAC;IACrC,kBAAkB,EAAE,OAAO,EAAE,CAAC;IAC9B,QAAQ,EAAE,OAAO,CAAC;CACnB;AAED,MAAM,WAAW,eAAe;IAC9B,aAAa,EAAE,MAAM,CAAC;IACtB,SAAS,EAAE,MAAM,CAAC;IAClB,cAAc,EAAE,MAAM,CAAC;IACvB,KAAK,EAAE,gBAAgB,CAAC;IACxB,KAAK,EAAE,kBAAkB,CAAC;IAC1B,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE;QAAE,KAAK,EAAE,MAAM,CAAC;QAAC,MAAM,EAAE,MAAM,CAAC;QAAC,MAAM,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IAC9E,SAAS,EAAE,cAAc,EAAE,CAAC;CAC7B;AAMD;;;GAGG;AACH,wBAAgB,YAAY,CAAC,QAAQ,EAAE,eAAe,EAAE,MAAM,EAAE,OAAO,GAAG,OAAO,CAwBhF;AAeD;;;GAGG;AACH,wBAAgB,aAAa,CAC3B,QAAQ,
EAAE,eAAe,EAAE,EAC3B,MAAM,EAAE,OAAO,EAAE,GAChB;IACD,OAAO,EAAE,MAAM,CAAC;IAChB,iBAAiB,EAAE,eAAe,EAAE,CAAC;IACrC,UAAU,EAAE,OAAO,EAAE,CAAC;CACvB,CA4BA;AAMD;;;;;GAKG;AACH,wBAAgB,aAAa,CAC3B,QAAQ,EAAE,iBAAiB,EAC3B,cAAc,EAAE,OAAO,EAAE,EACzB,QAAQ,UAAQ,GACf,cAAc,CAqChB;AAED;;;GAGG;AACH,wBAAgB,aAAa,CAAC,OAAO,EAAE,cAAc,EAAE,GAAG,eAAe,CAuDxE"}
@@ -0,0 +1,195 @@
1
+ /**
2
+ * Benchmark Scoring Module
3
+ *
4
+ * Types and scoring functions for the false-positive regression benchmark.
5
+ * Implements dual-pool scoring (FP suppression rate + TP recall/precision)
6
+ * per benchmark-scenario.md contract.
7
+ */
8
+ // =============================================================================
9
+ // Severity Ranking
10
+ // =============================================================================
11
/**
 * Numeric rank for each recognized severity label; a higher number is more severe.
 *
 * Two vocabularies share the table: info/low/medium/high/critical and
 * warning/error. `error` ranks with `critical` (5) and `warning` ranks
 * with `medium` (3).
 *
 * The table has a null prototype so that lookups with arbitrary strings
 * (e.g. "constructor" or "toString") yield `undefined` instead of an inherited
 * Object.prototype member, which lets callers rely on `SEVERITY_RANK[x] ?? 0`.
 * It is frozen because it is shared, read-only module state.
 */
const SEVERITY_RANK = Object.freeze(Object.assign(Object.create(null), {
    info: 1,
    low: 2,
    medium: 3,
    high: 4,
    critical: 5,
    error: 5,
    warning: 3,
}));
20
+ // =============================================================================
21
+ // Matching Functions
22
+ // =============================================================================
23
/**
 * Check if an actual finding matches an expected finding.
 *
 * The file must match exactly. `line`, `severityAtLeast`, `messageContains`
 * and `ruleId` on `expected` are optional constraints: each one is enforced
 * only when it is not `undefined`.
 *
 * @param expected - Expected finding: `file` plus any of the optional constraints.
 * @param actual - Finding to test; `file`, `line`, `severity`, `message` and `ruleId` are read.
 * @returns `true` when every constraint present on `expected` is satisfied.
 */
export function matchFinding(expected, actual) {
    // File match required
    if (actual.file !== expected.file) {
        return false;
    }
    // Line match (if specified)
    if (expected.line !== undefined && actual.line !== expected.line) {
        return false;
    }
    // Severity match (if specified): actual severity must be >= expected minimum.
    // Labels missing from SEVERITY_RANK fall back to rank 0.
    if (expected.severityAtLeast !== undefined) {
        const actualRank = SEVERITY_RANK[actual.severity] ?? 0;
        const expectedRank = SEVERITY_RANK[expected.severityAtLeast] ?? 0;
        if (actualRank < expectedRank) {
            return false;
        }
    }
    // Message match (if specified): case-insensitive substring test.
    if (expected.messageContains !== undefined) {
        // A finding without a string message cannot contain the expected text;
        // report a non-match instead of throwing on `.toLowerCase()`.
        if (typeof actual.message !== 'string') {
            return false;
        }
        const haystack = actual.message.toLowerCase();
        if (!haystack.includes(expected.messageContains.toLowerCase())) {
            return false;
        }
    }
    // Rule match (if specified)
    if (expected.ruleId !== undefined && actual.ruleId !== expected.ruleId) {
        return false;
    }
    return true;
}
51
/**
 * Specificity of an expected finding: how many of its optional constraint
 * fields (`line`, `severityAtLeast`, `messageContains`, `ruleId`) are not
 * `undefined`. The result is an integer from 0 to 4.
 */
function specificityScore(ef) {
    const optionalConstraints = [ef.line, ef.severityAtLeast, ef.messageContains, ef.ruleId];
    return optionalConstraints.reduce((count, value) => (value !== undefined ? count + 1 : count), 0);
}
67
/**
 * 1:1 strict matching of expected findings against actual findings.
 *
 * Expected findings are processed in descending specificity (most constrained
 * first). Each expected finding claims the first compatible actual finding; if
 * that actual finding is already claimed, the current owner is moved to another
 * compatible actual finding when one exists (augmenting-path search). This
 * finds a maximum 1:1 assignment, so an expectation is reported as unmatched
 * only when no assignment could satisfy it alongside those already matched.
 * Plain first-fit could strand an expectation whose only compatible actual
 * finding had been consumed by an earlier, more flexible one.
 *
 * @param expected - Expected findings (not mutated; a sorted copy is used).
 * @param actual - Actual findings; falsy entries are never matched.
 * @returns `matched`: number of expected findings paired with an actual finding;
 *          `unmatchedExpected`: expected findings left unpaired, in specificity order;
 *          `extraneous`: actual entries not paired with any expected finding, in input order.
 */
export function matchFindings(expected, actual) {
    // Sort expected by specificity descending (most specific matched first)
    const sortedExpected = [...expected].sort((a, b) => specificityScore(b) - specificityScore(a));
    // ownerOfActual[i] is the index (into sortedExpected) of the expected finding
    // currently paired with actual[i], or -1 when actual[i] is unclaimed.
    const ownerOfActual = new Array(actual.length).fill(-1);
    // Try to pair sortedExpected[expIndex], re-homing current owners if needed.
    // `visited` holds the actual indices already explored in this search so each
    // is considered at most once.
    const tryAssign = (expIndex, visited) => {
        const exp = sortedExpected[expIndex];
        for (let i = 0; i < actual.length; i++) {
            if (visited.has(i)) {
                continue;
            }
            const candidate = actual[i];
            if (!candidate || !matchFinding(exp, candidate)) {
                continue;
            }
            visited.add(i);
            const owner = ownerOfActual[i];
            if (owner === -1 || tryAssign(owner, visited)) {
                ownerOfActual[i] = expIndex;
                return true;
            }
        }
        return false;
    };
    const unmatchedExpected = [];
    let matched = 0;
    for (let e = 0; e < sortedExpected.length; e++) {
        if (tryAssign(e, new Set())) {
            matched++;
        }
        else {
            // Re-homing never unpairs an already paired expectation, and a failed
            // one is not retried, so this list is final.
            unmatchedExpected.push(sortedExpected[e]);
        }
    }
    const extraneous = actual.filter((_, i) => ownerOfActual[i] === -1);
    return { matched, unmatchedExpected, extraneous };
}
97
+ // =============================================================================
98
+ // Scoring Functions
99
+ // =============================================================================
100
/**
 * Score a single benchmark scenario.
 *
 * - FP scenario (`scenario.truePositive` falsy): passes only when there are no
 *   actual findings; every actual finding is reported as extraneous and
 *   nothing is matched.
 * - TP scenario (`scenario.truePositive` truthy): the expected findings are
 *   matched via `matchFindings`; passes only when none are left unmatched.
 *
 * @param scenario - Scenario definition; `id`, `category`, `pattern`,
 *                   `truePositive` and `expectedFindings` are read.
 * @param actualFindings - Findings produced for this scenario.
 * @param timedOut - Copied verbatim into the result (defaults to `false`).
 * @returns The per-scenario result record.
 */
export function scoreScenario(scenario, actualFindings, timedOut = false) {
    const isTruePositive = Boolean(scenario.truePositive);
    // FP scenarios skip matching entirely: nothing is expected, so every
    // actual finding counts as extraneous.
    const outcome = isTruePositive
        ? matchFindings(scenario.expectedFindings, actualFindings)
        : { matched: 0, unmatchedExpected: [], extraneous: actualFindings };
    const passed = isTruePositive
        ? outcome.unmatchedExpected.length === 0
        : actualFindings.length === 0;
    return {
        id: scenario.id,
        passed,
        category: scenario.category,
        pattern: scenario.pattern,
        truePositive: isTruePositive,
        actualFindings,
        expectedFindings: scenario.expectedFindings,
        matchedCount: outcome.matched,
        unmatchedExpected: outcome.unmatchedExpected,
        extraneousFindings: outcome.extraneous,
        timedOut,
    };
}
139
/**
 * Compute the aggregate benchmark report from individual scenario results.
 *
 * Results are split by `truePositive` into two independently scored pools:
 * - Pool 1 (FP, `truePositive` falsy): counts passed scenarios as true
 *   negatives; `suppressionRate` is trueNegatives / pool size and `fpRate`
 *   is 1 - suppressionRate.
 * - Pool 2 (TP, `truePositive` truthy): sums expected, matched and extraneous
 *   findings across scenarios; `recall` is matched / expected and
 *   `precision` is matched / (matched + extraneous).
 * Any rate whose denominator is zero is reported as 1.
 *
 * @param results - Per-scenario result records.
 * @returns Report with schema version, ISO timestamp of the call, both pools,
 *          per-category pass/fail tallies, and the input results as `scenarios`.
 */
export function computeReport(results) {
    let fpTotal = 0;
    let trueNegatives = 0;
    let tpTotal = 0;
    let totalTpExpected = 0;
    let totalTpMatched = 0;
    let totalExtraneous = 0;
    const byCategory = {};
    for (const result of results) {
        if (result.truePositive) {
            // Pool 2: TP Preservation
            tpTotal += 1;
            totalTpExpected += result.expectedFindings.length;
            totalTpMatched += result.matchedCount;
            totalExtraneous += result.extraneousFindings.length;
        }
        else {
            // Pool 1: FP Regression
            fpTotal += 1;
            if (result.passed) {
                trueNegatives += 1;
            }
        }
        // By category (both pools share the same tally table)
        let bucket = byCategory[result.category];
        if (!bucket) {
            bucket = { total: 0, passed: 0, failed: 0 };
            byCategory[result.category] = bucket;
        }
        bucket.total++;
        if (result.passed) {
            bucket.passed++;
        }
        else {
            bucket.failed++;
        }
    }
    const suppressionRate = fpTotal > 0 ? trueNegatives / fpTotal : 1;
    const recall = totalTpExpected > 0 ? totalTpMatched / totalTpExpected : 1;
    const reportedByTp = totalTpMatched + totalExtraneous;
    const precision = reportedByTp > 0 ? totalTpMatched / reportedByTp : 1;
    const pool1 = {
        total: fpTotal,
        trueNegatives,
        falsePositives: fpTotal - trueNegatives,
        suppressionRate,
        fpRate: 1 - suppressionRate,
    };
    const pool2 = {
        total: tpTotal,
        truePositives: totalTpMatched,
        falseNegatives: totalTpExpected - totalTpMatched,
        extraneous: totalExtraneous,
        recall,
        precision,
    };
    return {
        schemaVersion: '1.0.0',
        timestamp: new Date().toISOString(),
        totalScenarios: results.length,
        pool1,
        pool2,
        byCategory,
        scenarios: results,
    };
}
195
+ //# sourceMappingURL=scoring.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"scoring.js","sourceRoot":"","sources":["../../src/benchmark/scoring.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAIH,gFAAgF;AAChF,mBAAmB;AACnB,gFAAgF;AAEhF,MAAM,aAAa,GAA2B;IAC5C,IAAI,EAAE,CAAC;IACP,GAAG,EAAE,CAAC;IACN,MAAM,EAAE,CAAC;IACT,IAAI,EAAE,CAAC;IACP,QAAQ,EAAE,CAAC;IACX,KAAK,EAAE,CAAC;IACR,OAAO,EAAE,CAAC;CACX,CAAC;AAyEF,gFAAgF;AAChF,qBAAqB;AACrB,gFAAgF;AAEhF;;;GAGG;AACH,MAAM,UAAU,YAAY,CAAC,QAAyB,EAAE,MAAe;IACrE,sBAAsB;IACtB,IAAI,MAAM,CAAC,IAAI,KAAK,QAAQ,CAAC,IAAI;QAAE,OAAO,KAAK,CAAC;IAEhD,4BAA4B;IAC5B,IAAI,QAAQ,CAAC,IAAI,KAAK,SAAS,IAAI,MAAM,CAAC,IAAI,KAAK,QAAQ,CAAC,IAAI;QAAE,OAAO,KAAK,CAAC;IAE/E,6EAA6E;IAC7E,IAAI,QAAQ,CAAC,eAAe,KAAK,SAAS,EAAE,CAAC;QAC3C,MAAM,UAAU,GAAG,aAAa,CAAC,MAAM,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;QACvD,MAAM,YAAY,GAAG,aAAa,CAAC,QAAQ,CAAC,eAAe,CAAC,IAAI,CAAC,CAAC;QAClE,IAAI,UAAU,GAAG,YAAY;YAAE,OAAO,KAAK,CAAC;IAC9C,CAAC;IAED,+EAA+E;IAC/E,IAAI,QAAQ,CAAC,eAAe,KAAK,SAAS,EAAE,CAAC;QAC3C,IAAI,CAAC,MAAM,CAAC,OAAO,CAAC,WAAW,EAAE,CAAC,QAAQ,CAAC,QAAQ,CAAC,eAAe,CAAC,WAAW,EAAE,CAAC;YAChF,OAAO,KAAK,CAAC;IACjB,CAAC;IAED,4BAA4B;IAC5B,IAAI,QAAQ,CAAC,MAAM,KAAK,SAAS,IAAI,MAAM,CAAC,MAAM,KAAK,QAAQ,CAAC,MAAM;QAAE,OAAO,KAAK,CAAC;IAErF,OAAO,IAAI,CAAC;AACd,CAAC;AAED;;;GAGG;AACH,SAAS,gBAAgB,CAAC,EAAmB;IAC3C,IAAI,KAAK,GAAG,CAAC,CAAC;IACd,IAAI,EAAE,CAAC,IAAI,KAAK,SAAS;QAAE,KAAK,EAAE,CAAC;IACnC,IAAI,EAAE,CAAC,eAAe,KAAK,SAAS;QAAE,KAAK,EAAE,CAAC;IAC9C,IAAI,EAAE,CAAC,eAAe,KAAK,SAAS;QAAE,KAAK,EAAE,CAAC;IAC9C,IAAI,EAAE,CAAC,MAAM,KAAK,SAAS;QAAE,KAAK,EAAE,CAAC;IACrC,OAAO,KAAK,CAAC;AACf,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,aAAa,CAC3B,QAA2B,EAC3B,MAAiB;IAMjB,wEAAwE;IACxE,MAAM,cAAc,GAAG,CAAC,GAAG,QAAQ,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,gBAAgB,CAAC,CAAC,CAAC,GAAG,gBAAgB,CAAC,CAAC,CAAC,CAAC,CAAC;IAE/F,MAAM,QAAQ,GAAG,IAAI,GAAG,EAAU,CAAC;IACnC,MAAM,iBAAiB,GAAsB,EAAE,CAAC;IAChD,IAAI,OAAO,GAAG,CAAC,CAAC;IAEhB,KAAK,MAAM,GAAG,IAAI,cAAc,EAAE,CAAC;QACjC,IAAI,KAAK,GAAG,KAAK,CAAC;QAClB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAA
E,CAAC;YACvC,IAAI,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC;gBAAE,SAAS;YAC9B,MAAM,aAAa,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC;YAChC,IAAI,aAAa,IAAI,YAAY,CAAC,GAAG,EAAE,aAAa,CAAC,EAAE,CAAC;gBACtD,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;gBAChB,OAAO,EAAE,CAAC;gBACV,KAAK,GAAG,IAAI,CAAC;gBACb,MAAM;YACR,CAAC;QACH,CAAC;QACD,IAAI,CAAC,KAAK,EAAE,CAAC;YACX,iBAAiB,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QAC9B,CAAC;IACH,CAAC;IAED,MAAM,UAAU,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;IAE7D,OAAO,EAAE,OAAO,EAAE,iBAAiB,EAAE,UAAU,EAAE,CAAC;AACpD,CAAC;AAED,gFAAgF;AAChF,oBAAoB;AACpB,gFAAgF;AAEhF;;;;;GAKG;AACH,MAAM,UAAU,aAAa,CAC3B,QAA2B,EAC3B,cAAyB,EACzB,QAAQ,GAAG,KAAK;IAEhB,IAAI,CAAC,QAAQ,CAAC,YAAY,EAAE,CAAC;QAC3B,oCAAoC;QACpC,OAAO;YACL,EAAE,EAAE,QAAQ,CAAC,EAAE;YACf,MAAM,EAAE,cAAc,CAAC,MAAM,KAAK,CAAC;YACnC,QAAQ,EAAE,QAAQ,CAAC,QAAQ;YAC3B,OAAO,EAAE,QAAQ,CAAC,OAAO;YACzB,YAAY,EAAE,KAAK;YACnB,cAAc;YACd,gBAAgB,EAAE,QAAQ,CAAC,gBAAgB;YAC3C,YAAY,EAAE,CAAC;YACf,iBAAiB,EAAE,EAAE;YACrB,kBAAkB,EAAE,cAAc;YAClC,QAAQ;SACT,CAAC;IACJ,CAAC;IAED,gDAAgD;IAChD,MAAM,EAAE,OAAO,EAAE,iBAAiB,EAAE,UAAU,EAAE,GAAG,aAAa,CAC9D,QAAQ,CAAC,gBAAgB,EACzB,cAAc,CACf,CAAC;IAEF,OAAO;QACL,EAAE,EAAE,QAAQ,CAAC,EAAE;QACf,MAAM,EAAE,iBAAiB,CAAC,MAAM,KAAK,CAAC;QACtC,QAAQ,EAAE,QAAQ,CAAC,QAAQ;QAC3B,OAAO,EAAE,QAAQ,CAAC,OAAO;QACzB,YAAY,EAAE,IAAI;QAClB,cAAc;QACd,gBAAgB,EAAE,QAAQ,CAAC,gBAAgB;QAC3C,YAAY,EAAE,OAAO;QACrB,iBAAiB;QACjB,kBAAkB,EAAE,UAAU;QAC9B,QAAQ;KACT,CAAC;AACJ,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,aAAa,CAAC,OAAyB;IACrD,MAAM,SAAS,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,YAAY,CAAC,CAAC;IACzD,MAAM,SAAS,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,YAAY,CAAC,CAAC;IAExD,wBAAwB;IACxB,MAAM,aAAa,GAAG,SAAS,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC;IAC/D,MAAM,cAAc,GAAG,SAAS,CAAC,MAAM,GAAG,aAAa,CAAC;IACxD,MAAM,eAAe,GAAG,SAAS,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,aAAa,GAAG,SAAS,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC;IAEpF,0BAA0B;IAC1B,MAAM,eAAe,GAAG,SAAS,C
AAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,gBAAgB,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;IACzF,MAAM,cAAc,GAAG,SAAS,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,YAAY,EAAE,CAAC,CAAC,CAAC;IAC7E,MAAM,SAAS,GAAG,eAAe,GAAG,cAAc,CAAC;IACnD,MAAM,eAAe,GAAG,SAAS,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,kBAAkB,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;IAC3F,MAAM,MAAM,GAAG,eAAe,GAAG,CAAC,CAAC,CAAC,CAAC,cAAc,GAAG,eAAe,CAAC,CAAC,CAAC,CAAC,CAAC;IAC1E,MAAM,SAAS,GACb,cAAc,GAAG,eAAe,GAAG,CAAC,CAAC,CAAC,CAAC,cAAc,GAAG,CAAC,cAAc,GAAG,eAAe,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IAEjG,cAAc;IACd,MAAM,UAAU,GAAsE,EAAE,CAAC;IACzF,KAAK,MAAM,CAAC,IAAI,OAAO,EAAE,CAAC;QACxB,MAAM,GAAG,GAAG,CAAC,CAAC,QAAQ,CAAC;QACvB,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE,CAAC;YACrB,UAAU,CAAC,GAAG,CAAC,GAAG,EAAE,KAAK,EAAE,CAAC,EAAE,MAAM,EAAE,CAAC,EAAE,MAAM,EAAE,CAAC,EAAE,CAAC;QACvD,CAAC;QACD,UAAU,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,CAAC;QACxB,IAAI,CAAC,CAAC,MAAM,EAAE,CAAC;YACb,UAAU,CAAC,GAAG,CAAC,CAAC,MAAM,EAAE,CAAC;QAC3B,CAAC;aAAM,CAAC;YACN,UAAU,CAAC,GAAG,CAAC,CAAC,MAAM,EAAE,CAAC;QAC3B,CAAC;IACH,CAAC;IAED,OAAO;QACL,aAAa,EAAE,OAAO;QACtB,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;QACnC,cAAc,EAAE,OAAO,CAAC,MAAM;QAC9B,KAAK,EAAE;YACL,KAAK,EAAE,SAAS,CAAC,MAAM;YACvB,aAAa;YACb,cAAc;YACd,eAAe;YACf,MAAM,EAAE,CAAC,GAAG,eAAe;SAC5B;QACD,KAAK,EAAE;YACL,KAAK,EAAE,SAAS,CAAC,MAAM;YACvB,aAAa,EAAE,cAAc;YAC7B,cAAc,EAAE,SAAS;YACzB,UAAU,EAAE,eAAe;YAC3B,MAAM;YACN,SAAS;SACV;QACD,UAAU;QACV,SAAS,EAAE,OAAO;KACnB,CAAC;AACJ,CAAC"}
@@ -15,8 +15,8 @@ export declare const PlatformSchema: z.ZodEnum<{
15
15
  * Schema for dependency check status.
16
16
  */
17
17
  export declare const DependencyStatusSchema: z.ZodEnum<{
18
- available: "available";
19
18
  missing: "missing";
19
+ available: "available";
20
20
  unhealthy: "unhealthy";
21
21
  "version-mismatch": "version-mismatch";
22
22
  }>;
@@ -26,8 +26,8 @@ export declare const DependencyStatusSchema: z.ZodEnum<{
26
26
  export declare const DependencyCheckResultSchema: z.ZodObject<{
27
27
  name: z.ZodString;
28
28
  status: z.ZodEnum<{
29
- available: "available";
30
29
  missing: "missing";
30
+ available: "available";
31
31
  unhealthy: "unhealthy";
32
32
  "version-mismatch": "version-mismatch";
33
33
  }>;
@@ -41,8 +41,8 @@ export declare const DependencyCheckSummarySchema: z.ZodObject<{
41
41
  results: z.ZodArray<z.ZodObject<{
42
42
  name: z.ZodString;
43
43
  status: z.ZodEnum<{
44
- available: "available";
45
44
  missing: "missing";
45
+ available: "available";
46
46
  unhealthy: "unhealthy";
47
47
  "version-mismatch": "version-mismatch";
48
48
  }>;