@kevinrabun/judges-cli 3.128.2 → 3.129.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/api.d.ts +1 -0
- package/dist/api.js +2 -0
- package/dist/cli-dispatch.js +2 -0
- package/dist/cli.js +2 -0
- package/dist/commands/external-benchmarks.d.ts +118 -0
- package/dist/commands/external-benchmarks.js +296 -0
- package/dist/commands/martian-code-review-benchmark.d.ts +61 -0
- package/dist/commands/martian-code-review-benchmark.js +516 -0
- package/dist/commands/openssf-cve-benchmark.d.ts +96 -0
- package/dist/commands/openssf-cve-benchmark.js +659 -0
- package/package.json +1 -1
|
@@ -0,0 +1,659 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* OpenSSF CVE Benchmark Integration
|
|
3
|
+
*
|
|
4
|
+
* Runs the Judges evaluation engine against the OpenSSF CVE Benchmark dataset
|
|
5
|
+
* (https://github.com/ossf-cve-benchmark/ossf-cve-benchmark) — 200+ real-world
|
|
6
|
+
* JavaScript/TypeScript CVEs with pre-patch (vulnerable) and post-patch (fixed)
|
|
7
|
+
* git commits.
|
|
8
|
+
*
|
|
9
|
+
* Two modes:
|
|
10
|
+
* 1. Deterministic (L1): Runs Judges' pattern-based evaluators against each CVE.
|
|
11
|
+
* 2. LLM integration: Converts passing CVE cases into BenchmarkCase format
|
|
12
|
+
* for inclusion in the LLM benchmark pipeline.
|
|
13
|
+
*
|
|
14
|
+
* Usage:
|
|
15
|
+
* judges openssf-cve run [--repo <path>] [--cve <id>] [--format json|text|markdown]
|
|
16
|
+
* judges openssf-cve convert [--repo <path>] # Convert to BenchmarkCase[]
|
|
17
|
+
*/
|
|
18
|
+
import { existsSync, readFileSync, readdirSync, writeFileSync, mkdirSync } from "fs";
|
|
19
|
+
import { resolve, join, extname } from "path";
|
|
20
|
+
import { execFileSync, execSync } from "child_process";
|
|
21
|
+
import { evaluateWithTribunal } from "../evaluators/index.js";
|
|
22
|
+
import { EXT_TO_LANG } from "../ext-to-lang.js";
|
|
23
|
+
import { registerBenchmarkAdapter } from "./external-benchmarks.js";
|
|
24
|
+
// ─── CWE → Judge Prefix Mapping ────────────────────────────────────────────
|
|
25
|
+
/**
 * Maps CWE IDs to the judge rule prefixes that are expected to detect them.
 * This is the reverse of the PREFIX_MAP in security-ids.ts, extended with
 * additional CWE coverage from individual judges.
 *
 * Keys are written in the canonical form produced by normalizeCwe()
 * ("CWE-<n>", no leading zeros). Every lookup in this module uses a
 * normalized ID, so a zero-padded key such as "CWE-079" could never match.
 */
const CWE_TO_PREFIXES = {
    // Injection
    "CWE-78": ["CYBER", "SEC"], // OS Command Injection
    "CWE-79": ["CYBER", "SEC", "XSS", "FW"], // XSS
    "CWE-89": ["CYBER", "SEC", "DB"], // SQL Injection
    "CWE-94": ["CYBER", "SEC"], // Code Injection
    "CWE-95": ["CYBER", "SEC"], // Eval Injection
    "CWE-917": ["CYBER", "SEC"], // Expression Language Injection
    "CWE-134": ["CYBER", "SEC"], // Format String
    "CWE-943": ["DB", "SEC"], // NoSQL Injection
    // Path Traversal / File Access
    "CWE-22": ["CYBER", "SEC"], // Path Traversal
    "CWE-73": ["CYBER", "SEC"], // External Control of File Name
    "CWE-434": ["CYBER", "SEC"], // Unrestricted Upload
    // Auth / Crypto
    "CWE-287": ["AUTH", "CYBER"], // Improper Authentication
    "CWE-798": ["AUTH", "CFG", "DATA", "CYBER"], // Hard-coded Credentials
    "CWE-327": ["CRYPTO", "CYBER"], // Use of Broken Crypto Algorithm
    "CWE-328": ["CRYPTO", "CYBER"], // Weak Hash
    "CWE-330": ["CRYPTO", "CYBER", "AICS"], // Insufficient Randomness
    "CWE-916": ["CRYPTO", "AUTH", "CYBER"], // Weak Password Hash
    // Access Control
    "CWE-284": ["CYBER", "AUTH"], // Improper Access Control
    "CWE-269": ["CYBER", "AUTH"], // Improper Privilege Management
    "CWE-862": ["AUTH", "CYBER"], // Missing Authorization
    "CWE-863": ["AUTH", "CYBER"], // Incorrect Authorization
    // Data Exposure
    "CWE-200": ["DATA", "SEC", "LOGPRIV"], // Information Exposure
    "CWE-209": ["ERR", "SEC"], // Error Message Info Exposure
    "CWE-312": ["DATA", "CFG"], // Cleartext Storage of Sensitive Info
    "CWE-319": ["CYBER", "SEC", "DATA"], // Cleartext Transmission
    "CWE-532": ["LOGPRIV"], // Insertion of Sensitive Info into Log
    // Deserialization / Prototype Pollution
    "CWE-502": ["CYBER", "SEC"], // Deserialization of Untrusted Data
    "CWE-915": ["CYBER", "SEC"], // Improperly Controlled Modification (prototype pollution)
    "CWE-471": ["CYBER", "SEC"], // Modification of Assumed-Immutable Data
    // Input / Validation
    "CWE-20": ["SEC", "CYBER"], // Improper Input Validation
    "CWE-400": ["RATE", "CYBER"], // Uncontrolled Resource Consumption (ReDoS, etc.)
    "CWE-770": ["RATE"], // Allocation of Resources Without Limits
    // Race Conditions
    "CWE-362": ["CONC"], // Race Condition
    "CWE-667": ["CONC"], // Improper Locking
    // Configuration
    "CWE-16": ["CFG"], // Configuration
    "CWE-1188": ["CFG"], // Insecure Default Initialization
    // SSRF
    "CWE-918": ["CYBER", "SEC"], // Server-Side Request Forgery
    // Denial of Service
    "CWE-185": ["SEC", "CYBER"], // Incorrect Regular Expression
    "CWE-1333": ["SEC", "CYBER"], // Inefficient Regular Expression (ReDoS)
};
|
|
82
|
+
// ─── Helpers ────────────────────────────────────────────────────────────────
|
|
83
|
+
/**
 * Canonicalize a CWE identifier by dropping zero padding after the
 * "CWE-" prefix (e.g. "CWE-079" becomes "CWE-79").
 * Strings that do not start with an upper-case "CWE-" are returned unchanged.
 */
function normalizeCwe(cwe) {
    const paddedPrefix = /^CWE-0*/;
    const canonicalPrefix = "CWE-";
    return cwe.replace(paddedPrefix, canonicalPrefix);
}
|
|
86
|
+
/**
 * Collect the judge rule prefixes mapped to any of the given CWEs.
 *
 * Each CWE is normalized before the CWE_TO_PREFIXES lookup; prefixes are
 * de-duplicated and kept in first-seen order. When none of the CWEs has a
 * mapping, the generic security prefixes CYBER and SEC are returned instead,
 * so the result is never empty.
 */
function getExpectedPrefixes(cwes) {
    const mappedLists = Array.from(cwes, (id) => CWE_TO_PREFIXES[normalizeCwe(id)]);
    const unique = new Set(mappedLists.flatMap((list) => list || []));
    if (unique.size > 0) {
        return Array.from(unique);
    }
    // No CWE had a mapping: fall back to the generic security prefixes.
    return ["CYBER", "SEC"];
}
|
|
103
|
+
/**
 * Decide whether a finding is relevant to any of the given CWEs.
 *
 * A finding matches when one of its own `cweIds` normalizes to one of the
 * CWEs, or when the prefix of its `ruleId` (the text before the first "-")
 * is listed in CWE_TO_PREFIXES for one of them.
 */
function findingMatchesCwes(finding, cwes) {
    const wanted = new Set(cwes.map(normalizeCwe));
    // Direct hit: the finding itself is tagged with one of the CWEs.
    if (finding.cweIds) {
        const taggedIds = [...finding.cweIds];
        if (taggedIds.some((id) => wanted.has(normalizeCwe(id)))) {
            return true;
        }
    }
    // Indirect hit: the rule family is one expected to catch the CWE.
    const [rulePrefix] = finding.ruleId.split("-");
    return [...wanted].some((cwe) => (CWE_TO_PREFIXES[cwe] ?? []).includes(rulePrefix));
}
|
|
121
|
+
/**
 * Resolve a language name from a file path's extension via EXT_TO_LANG.
 * The extension is lower-cased first; extensions without an entry default
 * to "javascript".
 */
function detectLanguage(filePath) {
    const extension = extname(filePath).toLowerCase();
    const language = EXT_TO_LANG[extension];
    return language ?? "javascript";
}
|
|
125
|
+
// ─── CVE Loading ────────────────────────────────────────────────────────────
|
|
126
|
+
/**
 * Load the CVE descriptors from `<repoPath>/CVEs`.
 *
 * Only `*.json` files are read. A descriptor is kept when its `state` is
 * "PUBLISHED", it lists at least one pre-patch weakness, and it names a
 * post-patch commit.
 *
 * @param {string} repoPath - Root of a local ossf-cve-benchmark clone.
 * @returns {object[]} Parsed CVE descriptors that passed the filter.
 * @throws {Error} If the CVEs directory is missing, or if a descriptor cannot
 *   be read or parsed (the message names the offending file; the underlying
 *   error is attached as `cause`).
 */
export function loadCveFiles(repoPath) {
    const cvesDir = join(repoPath, "CVEs");
    if (!existsSync(cvesDir)) {
        throw new Error(`OpenSSF CVE Benchmark not found at ${cvesDir}. Clone it with:\n git clone https://github.com/ossf-cve-benchmark/ossf-cve-benchmark.git`);
    }
    const cves = [];
    for (const file of readdirSync(cvesDir)) {
        if (!file.endsWith(".json")) {
            continue;
        }
        const filePath = join(cvesDir, file);
        let cve;
        try {
            cve = JSON.parse(readFileSync(filePath, "utf-8"));
        }
        catch (err) {
            // Surface which descriptor is broken instead of a bare SyntaxError.
            throw new Error(`Failed to load CVE file ${filePath}: ${err.message}`, { cause: err });
        }
        // Only include complete, published CVEs. `cve?.` also skips files whose
        // JSON payload is `null` rather than an object.
        if (cve?.state === "PUBLISHED" && cve.prePatch?.weaknesses?.length > 0 && cve.postPatch?.commit) {
            cves.push(cve);
        }
    }
    return cves;
}
|
|
143
|
+
// ─── Git Checkout Helpers ───────────────────────────────────────────────────
|
|
144
|
+
/**
 * Return `<repoPath>/work/sources`, creating it (and any missing parents)
 * when it does not exist yet.
 */
function ensureSourcesDir(repoPath) {
    const target = join(repoPath, "work", "sources");
    const alreadyPresent = existsSync(target);
    if (alreadyPresent) {
        return target;
    }
    mkdirSync(target, { recursive: true });
    return target;
}
|
|
151
|
+
/**
 * Run `git` with an argument vector (no shell) and report success.
 * Output is captured rather than inherited; any failure, including a
 * timeout or a missing git binary, yields `false`.
 */
function runGitQuietly(args, options) {
    try {
        execFileSync("git", args, { stdio: "pipe", ...options });
        return true;
    }
    catch {
        return false;
    }
}
/**
 * Make sure the repository for a CVE is cloned under `sourcesDir` and check
 * out the requested commit in it.
 *
 * An existing `<sourcesDir>/<CVE id>` directory is reused. Otherwise the
 * CVE-specific repository in the ossf-cve-benchmark GitHub org is cloned,
 * falling back to `cve.repository` when that clone fails.
 *
 * The CVE id, repository URL and commit come from dataset JSON, so git is
 * invoked through execFileSync with an argument array: the values are never
 * interpreted by a shell. Commits that look like command-line options are
 * rejected so they cannot be parsed as git flags.
 *
 * @param {{CVE: string, repository?: string}} cve - CVE descriptor.
 * @param {string} commit - Commit to check out.
 * @param {string} sourcesDir - Directory holding the per-CVE clones.
 * @returns {string|undefined} The checkout directory, or `undefined` when the
 *   clone or checkout failed.
 */
function checkoutCommit(cve, commit, sourcesDir) {
    if (typeof commit !== "string" || commit.length === 0 || commit.startsWith("-")) {
        return undefined;
    }
    const cveDir = join(sourcesDir, cve.CVE);
    if (!existsSync(cveDir)) {
        const cloneSources = [
            // Preferred: the CVE-specific repo at the ossf-cve-benchmark org.
            { url: `https://github.com/ossf-cve-benchmark/${cve.CVE}.git`, timeout: 60_000 },
            // Fallback: the original repository URL from the CVE metadata.
            { url: cve.repository, timeout: 120_000 },
        ];
        const cloned = cloneSources.some(({ url, timeout }) => typeof url === "string" &&
            // "--" ends option parsing so the URL cannot be read as a flag.
            runGitQuietly(["clone", "--quiet", "--", url, cveDir], { timeout }));
        if (!cloned) {
            return undefined;
        }
    }
    // The trailing "--" tells git the preceding argument is a revision, not a path.
    const checkedOut = runGitQuietly(["checkout", "--quiet", commit, "--"], {
        cwd: cveDir,
        timeout: 30_000,
    });
    return checkedOut ? cveDir : undefined;
}
|
|
187
|
+
/**
 * Read each distinct file referenced by the weaknesses from a checkout.
 *
 * Files are visited in first-mention order. Paths that are repeated or that
 * do not exist in the checkout are skipped. Each entry carries the
 * repo-relative path, the UTF-8 content and the detected language.
 */
function readWeaknessFiles(checkoutDir, weaknesses) {
    const visited = new Set();
    const collected = [];
    for (const weakness of weaknesses) {
        const relPath = weakness.location.file;
        const firstVisit = !visited.has(relPath);
        if (firstVisit) {
            visited.add(relPath);
            const absPath = join(checkoutDir, relPath);
            if (existsSync(absPath)) {
                collected.push({
                    path: relPath,
                    content: readFileSync(absPath, "utf-8"),
                    language: detectLanguage(relPath),
                });
            }
        }
    }
    return collected;
}
|
|
203
|
+
// ─── Evaluation ─────────────────────────────────────────────────────────────
|
|
204
|
+
/**
 * Run evaluateWithTribunal over every file and sort the CVE's CWEs into
 * matched and missed.
 *
 * A CWE counts as matched when any finding matches it via
 * findingMatchesCwes, or when a finding's rule prefix is among the prefixes
 * expected for that CWE (CYBER/SEC when the CWE has no mapping).
 *
 * Returns `{ findings, matchedCwes, missedCwes }`; the CWE lists hold
 * normalized IDs.
 */
function evaluateCveCode(files, cwes) {
    const findings = files.flatMap((file) => [...evaluateWithTribunal(file.content, file.language).findings]);
    const matchedCwes = [];
    const missedCwes = [];
    for (const cwe of cwes.map(normalizeCwe)) {
        const prefixesForCwe = CWE_TO_PREFIXES[cwe] ?? ["CYBER", "SEC"];
        const isHit = (finding) => findingMatchesCwes(finding, [cwe]) || prefixesForCwe.includes(finding.ruleId.split("-")[0]);
        const bucket = findings.some(isHit) ? matchedCwes : missedCwes;
        bucket.push(cwe);
    }
    return { findings, matchedCwes, missedCwes };
}
|
|
225
|
+
/**
 * Evaluate one CVE: scan the vulnerable (pre-patch) weakness files, then the
 * same files at the fixed (post-patch) commit.
 *
 * `detected` is true when at least one of the CVE's CWEs was matched on the
 * pre-patch code. `cleanOnPatch` is true when the post-patch scan produced no
 * finding whose rule prefix is among the prefixes expected for the CVE.
 *
 * When the pre-patch checkout fails, or none of the weakness files exist in
 * it, a skipped result carrying an `error` message is returned. `cwes` is
 * reported in normalized form for skipped and successful results alike.
 *
 * NOTE(review): `cleanOnPatch` stays true when the post-patch checkout fails
 * or contains none of the weakness files, i.e. "not checked" is reported the
 * same way as "clean" — confirm this is intended.
 *
 * @param {object} cve - CVE descriptor as returned by loadCveFiles.
 * @param {string} sourcesDir - Directory holding the per-CVE clones.
 * @returns {object} Per-CVE result record.
 */
export function evaluateSingleCve(cve, sourcesDir) {
    const language = detectLanguage(cve.prePatch.weaknesses[0]?.location.file ?? "index.js");
    const cwes = cve.CWEs.map(normalizeCwe);
    // Shared shape for CVEs that could not be evaluated at all.
    const skipped = (error) => ({
        cve: cve.CVE,
        cwes,
        language,
        detected: false,
        cleanOnPatch: true,
        prePatchFindings: [],
        postPatchFindings: [],
        matchedCwes: [],
        missedCwes: [...cwes],
        error,
    });
    // Checkout pre-patch (vulnerable) code
    const prePatchDir = checkoutCommit(cve, cve.prePatch.commit, sourcesDir);
    if (!prePatchDir) {
        return skipped("Failed to checkout pre-patch commit");
    }
    const prePatchFiles = readWeaknessFiles(prePatchDir, cve.prePatch.weaknesses);
    if (prePatchFiles.length === 0) {
        return skipped("Weakness files not found in checkout");
    }
    // Evaluate pre-patch
    const prePatchEval = evaluateCveCode(prePatchFiles, cve.CWEs);
    const detected = prePatchEval.matchedCwes.length > 0;
    // Checkout post-patch (fixed) code
    const postPatchDir = checkoutCommit(cve, cve.postPatch.commit, sourcesDir);
    let postPatchFindings = [];
    let cleanOnPatch = true;
    if (postPatchDir) {
        // The pre-patch weakness locations are re-read at the fixed commit.
        const postPatchFiles = readWeaknessFiles(postPatchDir, cve.prePatch.weaknesses);
        if (postPatchFiles.length > 0) {
            const expectedPrefixes = getExpectedPrefixes(cve.CWEs);
            const postEval = evaluateCveCode(postPatchFiles, cve.CWEs);
            // Only count findings with relevant prefixes as FPs on patched code
            postPatchFindings = postEval.findings.filter((f) => expectedPrefixes.includes(f.ruleId.split("-")[0]));
            cleanOnPatch = postPatchFindings.length === 0;
        }
    }
    return {
        cve: cve.CVE,
        cwes,
        language,
        detected,
        cleanOnPatch,
        prePatchFindings: prePatchEval.findings,
        postPatchFindings,
        matchedCwes: prePatchEval.matchedCwes,
        missedCwes: prePatchEval.missedCwes,
    };
}
|
|
287
|
+
// ─── Aggregate Results ──────────────────────────────────────────────────────
|
|
288
|
+
/**
 * Aggregate per-CVE results into benchmark metrics.
 *
 * Results carrying an `error` are counted as skipped and excluded from every
 * rate. For the remaining ("evaluated") results:
 *  - a true positive is a CVE that was detected and is clean on patch,
 *  - a false positive is a CVE that was detected but is not clean on patch,
 *  - a false negative is a CVE that was not detected.
 * Precision and recall default to 1 when their denominator is 0; F1 defaults
 * to 0 when precision + recall is 0. `perCwe` holds total/detected/rate per
 * CWE listed on the evaluated results.
 */
export function computeOpenSSFMetrics(results) {
    const evaluated = results.filter((entry) => !entry.error);
    let detectedCount = 0;
    let cleanCount = 0;
    let truePositives = 0;
    const perCwe = {};
    for (const entry of evaluated) {
        if (entry.detected) {
            detectedCount += 1;
            if (entry.cleanOnPatch) {
                truePositives += 1;
            }
        }
        if (entry.cleanOnPatch) {
            cleanCount += 1;
        }
        for (const cwe of entry.cwes) {
            if (!perCwe[cwe]) {
                perCwe[cwe] = { total: 0, detected: 0, rate: 0 };
            }
            const bucket = perCwe[cwe];
            bucket.total += 1;
            if (entry.matchedCwes.includes(cwe)) {
                bucket.detected += 1;
            }
        }
    }
    for (const bucket of Object.values(perCwe)) {
        bucket.rate = bucket.total > 0 ? bucket.detected / bucket.total : 0;
    }
    const evaluatedCount = evaluated.length;
    const falsePositives = detectedCount - truePositives;
    const falseNegatives = evaluatedCount - detectedCount;
    const detectionRate = evaluatedCount > 0 ? detectedCount / evaluatedCount : 0;
    // Precision: among detected CVEs, how many had no FP on patch
    const precision = truePositives + falsePositives > 0 ? truePositives / (truePositives + falsePositives) : 1;
    const recall = truePositives + falseNegatives > 0 ? truePositives / (truePositives + falseNegatives) : 1;
    const f1Score = precision + recall > 0 ? (2 * precision * recall) / (precision + recall) : 0;
    return {
        timestamp: new Date().toISOString(),
        totalCves: results.length,
        evaluated: evaluatedCount,
        skipped: results.length - evaluatedCount,
        detected: detectedCount,
        missed: falseNegatives,
        cleanOnPatch: cleanCount,
        falsePositiveOnPatch: evaluatedCount - cleanCount,
        detectionRate,
        precision,
        recall,
        f1Score,
        perCwe,
        results,
    };
}
|
|
331
|
+
// ─── BenchmarkCase Conversion ───────────────────────────────────────────────
|
|
332
|
+
/**
 * Turn OpenSSF CVEs into BenchmarkCase objects for the Judges LLM benchmark
 * pipeline. A CVE is skipped when:
 * - its pre-patch commit cannot be checked out,
 * - none of its weakness files exist in the checkout, or
 * - getExpectedPrefixes returns no prefixes for its CWEs.
 *
 * The first weakness file becomes the case's `code`; any further files are
 * attached as `files`. Content longer than 4000 characters is cut there and
 * followed by a truncation marker comment.
 */
export function convertToBenchmarkCases(cves, sourcesDir) {
    const MAX_CHARS = 4000;
    const clip = (text, marker) => (text.length > MAX_CHARS ? text.slice(0, MAX_CHARS) + marker : text);
    const toCase = (cve) => {
        const checkoutDir = checkoutCommit(cve, cve.prePatch.commit, sourcesDir);
        if (!checkoutDir) {
            return undefined;
        }
        const sources = readWeaknessFiles(checkoutDir, cve.prePatch.weaknesses);
        if (sources.length === 0) {
            return undefined;
        }
        const prefixes = getExpectedPrefixes(cve.CWEs);
        if (prefixes.length === 0) {
            return undefined;
        }
        // Expected rule IDs are derived from the CWE→prefix mapping.
        const expectedRuleIds = prefixes.map((prefix) => `${prefix}-001`);
        // The first weakness file is the case's main code.
        const [primary, ...others] = sources;
        const code = clip(primary.content, "\n// ... truncated for benchmark");
        const category = inferCategory(cve.CWEs);
        const built = {
            id: `openssf-${cve.CVE.toLowerCase()}`,
            description: `Real-world CVE: ${cve.CVE} (${cve.CWEs.join(", ")}) — ${cve.prePatch.weaknesses[0]?.explanation ?? "security vulnerability"}`,
            language: primary.language,
            code,
            expectedRuleIds,
            acceptablePrefixes: [...new Set([...prefixes, "CYBER", "SEC"])],
            category,
            difficulty: "hard",
            aiSource: "openssf-cve-benchmark",
        };
        // Multi-file cases carry the remaining files alongside the main code.
        if (others.length > 0) {
            built.files = others.map((extra) => ({
                path: extra.path,
                content: clip(extra.content, "\n// ... truncated"),
                language: extra.language,
            }));
        }
        return built;
    };
    const cases = [];
    for (const cve of cves) {
        const built = toCase(cve);
        if (built !== undefined) {
            cases.push(built);
        }
    }
    return cases;
}
|
|
383
|
+
// Normalized CWE ID → benchmark category. Each CWE appears in one category only.
const CWE_CATEGORY_TABLE = new Map([
    ["CWE-89", "injection"],
    ["CWE-78", "injection"],
    ["CWE-94", "injection"],
    ["CWE-134", "injection"],
    ["CWE-79", "xss"],
    ["CWE-22", "path-traversal"],
    ["CWE-73", "path-traversal"],
    ["CWE-287", "auth"],
    ["CWE-798", "auth"],
    ["CWE-862", "auth"],
    ["CWE-863", "auth"],
    ["CWE-327", "crypto"],
    ["CWE-328", "crypto"],
    ["CWE-916", "crypto"],
    ["CWE-330", "crypto"],
    ["CWE-502", "prototype-pollution"],
    ["CWE-915", "prototype-pollution"],
    ["CWE-471", "prototype-pollution"],
    ["CWE-200", "data-exposure"],
    ["CWE-209", "data-exposure"],
    ["CWE-312", "data-exposure"],
    ["CWE-532", "data-exposure"],
    ["CWE-400", "denial-of-service"],
    ["CWE-770", "denial-of-service"],
    ["CWE-185", "denial-of-service"],
    ["CWE-1333", "denial-of-service"],
    ["CWE-918", "ssrf"],
    ["CWE-362", "concurrency"],
    ["CWE-667", "concurrency"],
]);
/**
 * Pick a benchmark category for a CVE from its CWEs.
 * The first CWE (in list order, after normalization) that has a known
 * category decides; "security" is returned when none does.
 */
function inferCategory(cwes) {
    for (const cwe of cwes.map(normalizeCwe)) {
        const category = CWE_CATEGORY_TABLE.get(cwe);
        if (category !== undefined) {
            return category;
        }
    }
    return "security";
}
|
|
409
|
+
// ─── Report Formatting ──────────────────────────────────────────────────────
|
|
410
|
+
/**
 * Render benchmark metrics as a Markdown report.
 *
 * Sections: header, summary table, per-CWE detection rates (largest total
 * first), up to 20 missed CVEs and up to 10 CVEs with false positives on
 * patched code. The three optional sections are omitted when empty, and
 * results carrying an `error` are left out of the missed / false-positive lists.
 */
export function formatOpenSSFReport(result) {
    const percent = (fraction, digits) => `${(fraction * 100).toFixed(digits)}%`;
    // Render at most `cap` bullets, then a "... and N more" line if needed.
    const bullets = (items, cap, render) => {
        const rows = items.slice(0, cap).map((item) => render(item));
        if (items.length > cap) {
            rows.push(`- ... and ${items.length - cap} more`);
        }
        return rows;
    };
    const report = [
        "# OpenSSF CVE Benchmark Report",
        "",
        `**Date:** ${result.timestamp}`,
        `**CVEs Evaluated:** ${result.evaluated} of ${result.totalCves} (${result.skipped} skipped)`,
        "",
        "## Summary",
        "",
        `| Metric | Value |`,
        `|--------|-------|`,
        `| Detection Rate | ${percent(result.detectionRate, 1)} (${result.detected}/${result.evaluated}) |`,
        `| Precision | ${percent(result.precision, 1)} |`,
        `| Recall | ${percent(result.recall, 1)} |`,
        `| F1 Score | ${percent(result.f1Score, 1)} |`,
        `| Clean on Patch | ${result.cleanOnPatch}/${result.evaluated} |`,
        `| False Positives on Patch | ${result.falsePositiveOnPatch} |`,
        "",
    ];
    // Per-CWE breakdown, most frequent CWE first.
    const byTotalDesc = Object.entries(result.perCwe).sort((left, right) => right[1].total - left[1].total);
    if (byTotalDesc.length > 0) {
        report.push("## Per-CWE Detection Rates", "", "| CWE | Total | Detected | Rate |", "|-----|-------|----------|------|");
        for (const [cwe, stats] of byTotalDesc) {
            report.push(`| ${cwe} | ${stats.total} | ${stats.detected} | ${percent(stats.rate, 0)} |`);
        }
        report.push("");
    }
    const evaluated = result.results.filter((entry) => !entry.error);
    // Missed CVEs
    const missed = evaluated.filter((entry) => !entry.detected);
    if (missed.length > 0) {
        report.push("## Missed CVEs", "");
        report.push(...bullets(missed, 20, (entry) => `- **${entry.cve}** (${entry.cwes.join(", ")}): ${entry.language}`));
        report.push("");
    }
    // False positives on patched code
    const noisy = evaluated.filter((entry) => !entry.cleanOnPatch);
    if (noisy.length > 0) {
        report.push("## False Positives on Patched Code", "");
        report.push(...bullets(noisy, 10, (entry) => {
            const ruleIds = [...new Set(entry.postPatchFindings.map((finding) => finding.ruleId))];
            return `- **${entry.cve}**: ${ruleIds.join(", ")}`;
        }));
        report.push("");
    }
    return report.join("\n");
}
|
|
469
|
+
// ─── CLI Entry Point ────────────────────────────────────────────────────────
|
|
470
|
+
/**
 * CLI entry point for `judges openssf-cve`.
 *
 * `argv[3]` is the optional subcommand and the remaining entries are options.
 * When `argv[3]` is absent or is itself an option (starts with "-"), the
 * subcommand defaults to "run" and option parsing starts at `argv[3]`, so
 * `judges openssf-cve --repo <path>` honors the flag instead of consuming it
 * as the subcommand.
 *
 * - "convert" prints, or writes to `--output`, the CVEs as BenchmarkCase JSON.
 * - Any other subcommand evaluates the selected CVEs and reports metrics as
 *   text (default), json or markdown. An unrecognized `--format` value falls
 *   back to text.
 * - `--help` / `-h` anywhere in the options prints usage and exits with 0.
 *
 * A flag that requires a value but has none, a missing repo, or an empty CVE
 * selection prints an error and exits with 1.
 *
 * NOTE(review): the help text advertises a `report` subcommand, but no branch
 * handles it; it currently behaves like `run`.
 *
 * @param {string[]} argv - Full process argument vector.
 * @returns {void}
 */
export function runOpenSSFCveBenchmark(argv) {
    const printHelp = () => {
        console.log(`
Judges Panel — OpenSSF CVE Benchmark

Evaluates Judges against 200+ real-world CVEs from the OpenSSF CVE Benchmark.
Requires the benchmark repo to be cloned locally.

USAGE:
judges openssf-cve run [options] Run benchmark against all CVEs
judges openssf-cve convert [options] Convert CVEs to BenchmarkCase format
judges openssf-cve report [options] Generate markdown report from results

OPTIONS:
--repo, -r <path> Path to the ossf-cve-benchmark repo (default: ../ossf-cve-benchmark)
--cve <id> Evaluate a single CVE (e.g. CVE-2018-16492)
--output, -o <path> Save results to file
--format <fmt> Output: text, json, markdown (default: text)

SETUP:
git clone https://github.com/ossf-cve-benchmark/ossf-cve-benchmark.git
cd ossf-cve-benchmark && npm i && npm run build
`);
    };
    // The subcommand is optional: a leading option means "run".
    const firstArg = argv[3];
    const hasSubcommand = typeof firstArg === "string" && !firstArg.startsWith("-");
    const subcommand = hasSubcommand ? firstArg : "run";
    let repoPath = resolve("..", "ossf-cve-benchmark");
    let singleCve;
    let outputPath;
    let format = "text";
    for (let i = hasSubcommand ? 4 : 3; i < argv.length; i++) {
        const arg = argv[i];
        // Consume the value following the current flag, failing loudly when it
        // is missing instead of passing `undefined` further down.
        const takeValue = () => {
            i += 1;
            const value = argv[i];
            if (value === undefined) {
                console.error(`Missing value for option ${arg}`);
                process.exit(1);
            }
            return value;
        };
        if (arg === "--help" || arg === "-h") {
            printHelp();
            process.exit(0);
        }
        else if (arg === "--repo" || arg === "-r")
            repoPath = resolve(takeValue());
        else if (arg === "--cve")
            singleCve = takeValue();
        else if (arg === "--output" || arg === "-o")
            outputPath = takeValue();
        else if (arg === "--format")
            format = takeValue();
    }
    if (!existsSync(repoPath)) {
        console.error(`OpenSSF CVE Benchmark repo not found at: ${repoPath}`);
        console.error("Clone it with:");
        console.error(" git clone https://github.com/ossf-cve-benchmark/ossf-cve-benchmark.git");
        process.exit(1);
    }
    // Write to --output when given (and confirm), otherwise print to stdout.
    const emit = (text, savedMessage) => {
        if (outputPath) {
            writeFileSync(outputPath, text, "utf-8");
            console.log(savedMessage);
        }
        else {
            console.log(text);
        }
    };
    const cves = loadCveFiles(repoPath);
    console.log(`Loaded ${cves.length} published CVEs from ${repoPath}`);
    const sourcesDir = ensureSourcesDir(repoPath);
    if (subcommand === "convert") {
        console.log("Converting CVEs to BenchmarkCase format...");
        const cases = convertToBenchmarkCases(cves, sourcesDir);
        emit(JSON.stringify(cases, null, 2), `Wrote ${cases.length} benchmark cases to ${outputPath}`);
        return;
    }
    // Run evaluation
    const targetCves = singleCve ? cves.filter((c) => c.CVE === singleCve) : cves;
    if (targetCves.length === 0) {
        console.error(singleCve ? `CVE ${singleCve} not found in dataset` : "No CVEs to evaluate");
        process.exit(1);
    }
    console.log(`Evaluating ${targetCves.length} CVEs...`);
    const results = [];
    for (let i = 0; i < targetCves.length; i++) {
        const cve = targetCves[i];
        const pct = Math.round(((i + 1) / targetCves.length) * 100);
        // "\r" rewrites the progress line in place once the verdict is known.
        process.stdout.write(`\r[${i + 1}/${targetCves.length}] ${pct}% ${cve.CVE}`);
        const result = evaluateSingleCve(cve, sourcesDir);
        results.push(result);
        const icon = result.error ? "⚠️" : result.detected ? "✅" : "❌";
        process.stdout.write(`\r[${i + 1}/${targetCves.length}] ${pct}% ${icon} ${cve.CVE} \n`);
    }
    const metrics = computeOpenSSFMetrics(results);
    if (format === "json") {
        emit(JSON.stringify(metrics, null, 2), `Results saved to ${outputPath}`);
    }
    else if (format === "markdown") {
        emit(formatOpenSSFReport(metrics), `Report saved to ${outputPath}`);
    }
    else {
        // Text summary
        console.log("\n─── OpenSSF CVE Benchmark Results ───\n");
        console.log(` Evaluated: ${metrics.evaluated}/${metrics.totalCves} CVEs`);
        console.log(` Detected: ${metrics.detected} (${(metrics.detectionRate * 100).toFixed(1)}%)`);
        console.log(` Missed: ${metrics.missed}`);
        console.log(` Clean on Patch: ${metrics.cleanOnPatch}/${metrics.evaluated}`);
        console.log(` FP on Patch: ${metrics.falsePositiveOnPatch}`);
        console.log(` Precision: ${(metrics.precision * 100).toFixed(1)}%`);
        console.log(` Recall: ${(metrics.recall * 100).toFixed(1)}%`);
        console.log(` F1 Score: ${(metrics.f1Score * 100).toFixed(1)}%`);
        if (outputPath) {
            // The text summary goes to the console; the file gets the full JSON.
            writeFileSync(outputPath, JSON.stringify(metrics, null, 2), "utf-8");
            console.log(`\nFull results saved to ${outputPath}`);
        }
    }
}
|
|
588
|
+
// ─── Adapter Registration ───────────────────────────────────────────────────
|
|
589
|
+
/**
 * Read the `version` field of a package.json file.
 * Returns `undefined` when the file is missing, unreadable, not valid JSON,
 * or has no version.
 */
function readManifestVersion(location) {
    try {
        return JSON.parse(readFileSync(location, "utf-8"))?.version ?? undefined;
    }
    catch {
        return undefined;
    }
}
/**
 * Determine the version of the Judges package that produced a result.
 *
 * The manifest is looked up relative to this module (two directories up,
 * matching the `dist/commands/` layout of the published package), so the
 * answer does not depend on the directory the CLI was started from. The
 * manifest in the current working directory is only a fallback, kept for
 * setups where the module-relative file is absent.
 *
 * @returns {string} The version string, or "unknown" when none can be read.
 */
function readJudgesVersion() {
    const ownManifest = new URL("../../package.json", import.meta.url);
    return readManifestVersion(ownManifest) ?? readManifestVersion(resolve("package.json")) ?? "unknown";
}
|
|
598
|
+
/**
 * Map an OpenSSFBenchmarkResult onto the ExternalBenchmarkResult shape used
 * by the benchmark registry. The original metrics object is kept as `rawData`.
 *
 * NOTE(review): `truePositives` / `falsePositives` are filled from the
 * detected and false-positive-on-patch counts, which are not the counts that
 * computeOpenSSFMetrics uses for `precision` — confirm consumers expect this.
 */
export function toExternalResult(metrics) {
    const { timestamp, totalCves, evaluated, skipped, precision, recall, f1Score, detectionRate, detected, falsePositiveOnPatch, missed, perCwe, } = metrics;
    const suiteIdentity = {
        suiteId: "openssf-cve",
        suiteName: "OpenSSF CVE Benchmark",
        suiteUrl: "https://github.com/ossf-cve-benchmark/ossf-cve-benchmark",
    };
    return {
        ...suiteIdentity,
        timestamp,
        judgesVersion: readJudgesVersion(),
        totalItems: totalCves,
        evaluatedItems: evaluated,
        skippedItems: skipped,
        precision,
        recall,
        f1Score,
        detectionRate,
        truePositives: detected,
        falsePositives: falsePositiveOnPatch,
        falseNegatives: missed,
        perCategory: perCwe,
        rawData: metrics,
    };
}
|
|
622
|
+
/**
 * Benchmark adapter for the OpenSSF CVE suite.
 *
 * `validate(repoPath)` returns a human-readable problem description, or
 * `undefined` when the repo and its `CVEs` directory exist.
 * `run(config)` evaluates every CVE (or only `config.singleItem`) found under
 * `config.repoPath`, prints progress, and returns the metrics in the external
 * result shape.
 */
const openSSFAdapter = {
    suiteId: "openssf-cve",
    suiteName: "OpenSSF CVE Benchmark",
    suiteUrl: "https://github.com/ossf-cve-benchmark/ossf-cve-benchmark",
    defaultRepoPath: "../ossf-cve-benchmark",
    description: "200+ real-world JS/TS CVEs with pre-patch and post-patch commits",
    validate(repoPath) {
        const repoPresent = existsSync(repoPath);
        if (!repoPresent) {
            return `Repo not found at ${repoPath}. Clone with: git clone https://github.com/ossf-cve-benchmark/ossf-cve-benchmark.git`;
        }
        const cvesPresent = existsSync(join(repoPath, "CVEs"));
        return cvesPresent ? undefined : `CVEs directory not found at ${repoPath}/CVEs. Is this the correct repo?`;
    },
    run(config) {
        const allCves = loadCveFiles(config.repoPath);
        console.log(` Loaded ${allCves.length} published CVEs`);
        const sourcesDir = ensureSourcesDir(config.repoPath);
        const selected = config.singleItem ? allCves.filter((entry) => entry.CVE === config.singleItem) : allCves;
        const total = selected.length;
        if (total === 0) {
            // Nothing selected: report an empty run.
            return toExternalResult(computeOpenSSFMetrics([]));
        }
        console.log(` Evaluating ${total} CVEs...`);
        const outcomes = [];
        for (const [index, cve] of selected.entries()) {
            const position = index + 1;
            const pct = Math.round((position / total) * 100);
            // "\r" rewrites the progress line in place once the verdict is known.
            process.stdout.write(`\r [${position}/${total}] ${pct}% ${cve.CVE}`);
            const outcome = evaluateSingleCve(cve, sourcesDir);
            outcomes.push(outcome);
            const icon = outcome.error ? "⚠️" : outcome.detected ? "✅" : "❌";
            process.stdout.write(`\r [${position}/${total}] ${pct}% ${icon} ${cve.CVE} \n`);
        }
        return toExternalResult(computeOpenSSFMetrics(outcomes));
    },
};
|
|
659
|
+
// Module-level side effect: importing this file passes the adapter to
// registerBenchmarkAdapter (imported from ./external-benchmarks.js).
registerBenchmarkAdapter(openSSFAdapter);
|