@kevinrabun/judges-cli 3.128.3 → 3.129.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,659 @@
1
+ /**
2
+ * OpenSSF CVE Benchmark Integration
3
+ *
4
+ * Runs the Judges evaluation engine against the OpenSSF CVE Benchmark dataset
5
+ * (https://github.com/ossf-cve-benchmark/ossf-cve-benchmark) — 200+ real-world
6
+ * JavaScript/TypeScript CVEs with pre-patch (vulnerable) and post-patch (fixed)
7
+ * git commits.
8
+ *
9
+ * Two modes:
10
+ * 1. Deterministic (L1): Runs Judges' pattern-based evaluators against each CVE.
11
+ * 2. LLM integration: Converts passing CVE cases into BenchmarkCase format
12
+ * for inclusion in the LLM benchmark pipeline.
13
+ *
14
+ * Usage:
15
+ * judges openssf-cve run [--repo <path>] [--cve <id>] [--format json|text|markdown]
16
+ * judges openssf-cve convert [--repo <path>] # Convert to BenchmarkCase[]
17
+ */
18
import { existsSync, readFileSync, readdirSync, writeFileSync, mkdirSync } from "fs";
import { resolve, join, extname } from "path";
import { execSync, execFileSync } from "child_process";
import { evaluateWithTribunal } from "../evaluators/index.js";
import { EXT_TO_LANG } from "../ext-to-lang.js";
import { registerBenchmarkAdapter } from "./external-benchmarks.js";
24
+ // ─── CWE → Judge Prefix Mapping ────────────────────────────────────────────
25
/**
 * Maps CWE IDs to the judge rule prefixes that are expected to detect them.
 * This is the reverse of the PREFIX_MAP in security-ids.ts, extended with
 * additional CWE coverage from individual judges.
 *
 * Keys MUST be written in normalized form, i.e. without leading zeros
 * ("CWE-79", not "CWE-079"). Every lookup in this file first passes the ID
 * through normalizeCwe(), which strips leading zeros, so a zero-padded key
 * can never be hit and the CWE would silently fall back to the generic
 * CYBER/SEC prefixes.
 */
const CWE_TO_PREFIXES = {
    // Injection
    "CWE-78": ["CYBER", "SEC"], // OS Command Injection
    "CWE-79": ["CYBER", "SEC", "XSS", "FW"], // XSS
    "CWE-89": ["CYBER", "SEC", "DB"], // SQL Injection
    "CWE-94": ["CYBER", "SEC"], // Code Injection
    "CWE-95": ["CYBER", "SEC"], // Eval Injection
    "CWE-917": ["CYBER", "SEC"], // Expression Language Injection
    "CWE-134": ["CYBER", "SEC"], // Format String
    "CWE-943": ["DB", "SEC"], // NoSQL Injection
    // Path Traversal / File Access
    "CWE-22": ["CYBER", "SEC"], // Path Traversal
    "CWE-73": ["CYBER", "SEC"], // External Control of File Name
    "CWE-434": ["CYBER", "SEC"], // Unrestricted Upload
    // Auth / Crypto
    "CWE-287": ["AUTH", "CYBER"], // Improper Authentication
    "CWE-798": ["AUTH", "CFG", "DATA", "CYBER"], // Hard-coded Credentials
    "CWE-327": ["CRYPTO", "CYBER"], // Use of Broken Crypto Algorithm
    "CWE-328": ["CRYPTO", "CYBER"], // Weak Hash
    "CWE-330": ["CRYPTO", "CYBER", "AICS"], // Insufficient Randomness
    "CWE-916": ["CRYPTO", "AUTH", "CYBER"], // Weak Password Hash
    // Access Control
    "CWE-284": ["CYBER", "AUTH"], // Improper Access Control
    "CWE-269": ["CYBER", "AUTH"], // Improper Privilege Management
    "CWE-862": ["AUTH", "CYBER"], // Missing Authorization
    "CWE-863": ["AUTH", "CYBER"], // Incorrect Authorization
    // Data Exposure
    "CWE-200": ["DATA", "SEC", "LOGPRIV"], // Information Exposure
    "CWE-209": ["ERR", "SEC"], // Error Message Info Exposure
    "CWE-312": ["DATA", "CFG"], // Cleartext Storage of Sensitive Info
    "CWE-319": ["CYBER", "SEC", "DATA"], // Cleartext Transmission
    "CWE-532": ["LOGPRIV"], // Insertion of Sensitive Info into Log
    // Deserialization / Prototype Pollution
    "CWE-502": ["CYBER", "SEC"], // Deserialization of Untrusted Data
    "CWE-915": ["CYBER", "SEC"], // Improperly Controlled Modification (prototype pollution)
    "CWE-471": ["CYBER", "SEC"], // Modification of Assumed-Immutable Data
    // Input / Validation
    "CWE-20": ["SEC", "CYBER"], // Improper Input Validation
    "CWE-400": ["RATE", "CYBER"], // Uncontrolled Resource Consumption (ReDoS, etc.)
    "CWE-770": ["RATE"], // Allocation of Resources Without Limits
    // Race Conditions
    "CWE-362": ["CONC"], // Race Condition
    "CWE-667": ["CONC"], // Improper Locking
    // Configuration
    "CWE-16": ["CFG"], // Configuration
    "CWE-1188": ["CFG"], // Insecure Default Initialization
    // SSRF
    "CWE-918": ["CYBER", "SEC"], // Server-Side Request Forgery
    // Denial of Service
    "CWE-185": ["SEC", "CYBER"], // Incorrect Regular Expression
    "CWE-1333": ["SEC", "CYBER"], // Inefficient Regular Expression (ReDoS)
};
82
+ // ─── Helpers ────────────────────────────────────────────────────────────────
83
/**
 * Canonicalize a CWE identifier by dropping any zeros that directly follow
 * the "CWE-" prefix (e.g. "CWE-079" becomes "CWE-79").
 * Strings that do not start with "CWE-" are returned unchanged.
 *
 * @param {string} cwe - CWE identifier as found in benchmark data or findings.
 * @returns {string} The identifier without zero padding.
 */
function normalizeCwe(cwe) {
    const paddedPrefix = /^CWE-0*/;
    const canonical = cwe.replace(paddedPrefix, "CWE-");
    return canonical;
}
86
/**
 * Collect the de-duplicated judge rule prefixes associated with a list of CWEs.
 *
 * Each CWE is normalized and looked up in CWE_TO_PREFIXES. When none of the
 * CWEs has a mapping, the generic "CYBER" and "SEC" prefixes are returned so
 * the result is never empty.
 *
 * @param {string[]} cwes - CWE identifiers of a CVE.
 * @returns {string[]} Unique prefixes in first-seen order.
 */
function getExpectedPrefixes(cwes) {
    const collected = new Set();
    for (const rawId of cwes) {
        const hits = CWE_TO_PREFIXES[normalizeCwe(rawId)];
        if (!hits) {
            continue;
        }
        hits.forEach((prefix) => collected.add(prefix));
    }
    // Nothing mapped: fall back to the two general-purpose security judges.
    if (collected.size === 0) {
        collected.add("CYBER");
        collected.add("SEC");
    }
    return Array.from(collected);
}
103
/**
 * Decide whether a finding is relevant to any of the given CWEs.
 *
 * A finding matches when either
 *  - one of its own `cweIds` (after normalization) is in `cwes`, or
 *  - the prefix of its `ruleId` (text before the first "-") is listed in
 *    CWE_TO_PREFIXES for one of the CWEs.
 *
 * @param {{ ruleId: string, cweIds?: string[] }} finding - Finding to test.
 * @param {string[]} cwes - CWE identifiers to match against.
 * @returns {boolean} True when the finding matches at least one CWE.
 */
function findingMatchesCwes(finding, cwes) {
    const wanted = new Set(cwes.map(normalizeCwe));
    // Direct match on the CWE ids carried by the finding itself.
    const directHit = finding.cweIds
        ? Array.from(finding.cweIds).some((id) => wanted.has(normalizeCwe(id)))
        : false;
    if (directHit) {
        return true;
    }
    // Indirect match through the rule prefix → CWE mapping.
    const [rulePrefix] = finding.ruleId.split("-");
    return [...wanted].some((id) => (CWE_TO_PREFIXES[id] ?? []).includes(rulePrefix));
}
121
/**
 * Map a file path to a language name using its lower-cased extension.
 *
 * @param {string} filePath - Path or file name.
 * @returns {string} The EXT_TO_LANG entry for the extension, or "javascript"
 *   when the extension has no entry.
 */
function detectLanguage(filePath) {
    const extension = extname(filePath).toLowerCase();
    const language = EXT_TO_LANG[extension];
    return language ?? "javascript";
}
125
+ // ─── CVE Loading ────────────────────────────────────────────────────────────
126
/**
 * Load the CVE descriptors from `<repoPath>/CVEs`.
 *
 * Every `*.json` file in that directory is parsed; a descriptor is kept only
 * when its `state` is "PUBLISHED", it lists at least one pre-patch weakness,
 * and it names a post-patch commit.
 *
 * @param {string} repoPath - Root of a local ossf-cve-benchmark checkout.
 * @returns {object[]} Parsed CVE descriptors that passed the filter.
 * @throws {Error} When the `CVEs` directory does not exist. A malformed JSON
 *   file makes JSON.parse throw and aborts the whole load.
 */
export function loadCveFiles(repoPath) {
    const cvesDir = join(repoPath, "CVEs");
    if (!existsSync(cvesDir)) {
        throw new Error(`OpenSSF CVE Benchmark not found at ${cvesDir}. Clone it with:\n git clone https://github.com/ossf-cve-benchmark/ossf-cve-benchmark.git`);
    }
    // Only complete, published CVEs are usable by the benchmark.
    const isUsable = (entry) => entry.state === "PUBLISHED" && entry.prePatch?.weaknesses?.length > 0 && entry.postPatch?.commit;
    return readdirSync(cvesDir)
        .filter((name) => name.endsWith(".json"))
        .map((name) => JSON.parse(readFileSync(join(cvesDir, name), "utf-8")))
        .filter((entry) => isUsable(entry));
}
143
+ // ─── Git Checkout Helpers ───────────────────────────────────────────────────
144
/**
 * Return `<repoPath>/work/sources`, creating it (and missing parents) when
 * it does not exist yet.
 *
 * @param {string} repoPath - Root of the benchmark checkout.
 * @returns {string} Path of the sources directory.
 */
function ensureSourcesDir(repoPath) {
    const target = join(repoPath, "work", "sources");
    const alreadyPresent = existsSync(target);
    if (alreadyPresent) {
        return target;
    }
    mkdirSync(target, { recursive: true });
    return target;
}
151
/**
 * Make sure a working copy for `cve` exists under `sourcesDir` and check out
 * `commit` in it.
 *
 * When `<sourcesDir>/<cve.CVE>` does not exist, the function first tries to
 * clone `https://github.com/ossf-cve-benchmark/<CVE>.git` and, if that fails,
 * falls back to `cve.repository`.
 *
 * git is invoked through execFileSync with an argument vector, so values
 * taken from the benchmark JSON (repository URL, commit, CVE id) are never
 * interpreted by a shell. Values that start with "-" are rejected because
 * git would parse them as options.
 *
 * @param {{ CVE: string, repository?: string }} cve - CVE descriptor.
 * @param {string} commit - Commit to check out.
 * @param {string} sourcesDir - Directory holding one clone per CVE.
 * @returns {string | undefined} Path of the checkout, or undefined when
 *   cloning or checking out failed (failures are deliberately not thrown so
 *   the caller can record the CVE as skipped).
 */
function checkoutCommit(cve, commit, sourcesDir) {
    const cveDir = join(sourcesDir, cve.CVE);
    const isSafeGitArg = (value) => typeof value === "string" && value.length > 0 && !value.startsWith("-");
    if (!isSafeGitArg(commit)) {
        return undefined;
    }
    if (!existsSync(cveDir)) {
        // Preferred source first, original upstream repository as fallback.
        const candidates = [
            { url: `https://github.com/ossf-cve-benchmark/${cve.CVE}.git`, timeout: 60_000 },
            { url: cve.repository, timeout: 120_000 },
        ];
        const cloned = candidates.some(({ url, timeout }) => {
            if (!isSafeGitArg(url)) {
                return false;
            }
            try {
                // "--" ends option parsing so the URL and path are always positional.
                execFileSync("git", ["clone", "--quiet", "--", url, cveDir], {
                    stdio: "pipe",
                    timeout,
                });
                return true;
            }
            catch {
                return false;
            }
        });
        if (!cloned) {
            return undefined;
        }
    }
    try {
        execFileSync("git", ["checkout", "--quiet", commit], {
            cwd: cveDir,
            stdio: "pipe",
            timeout: 30_000,
        });
        return cveDir;
    }
    catch {
        return undefined;
    }
}
187
/**
 * Read the source files referenced by a list of weaknesses from a checkout.
 *
 * Each distinct `location.file` is read at most once; files that do not
 * exist in the checkout are skipped.
 *
 * @param {string} checkoutDir - Root of the checked-out repository.
 * @param {{ location: { file: string } }[]} weaknesses - Weakness records.
 * @returns {{ path: string, content: string, language: string }[]} One entry
 *   per existing file, in order of first appearance.
 */
function readWeaknessFiles(checkoutDir, weaknesses) {
    // A Set keeps first-insertion order, which preserves the weakness order.
    const uniquePaths = [...new Set(weaknesses.map((weakness) => weakness.location.file))];
    return uniquePaths
        .filter((relPath) => existsSync(join(checkoutDir, relPath)))
        .map((relPath) => ({
            path: relPath,
            content: readFileSync(join(checkoutDir, relPath), "utf-8"),
            language: detectLanguage(relPath),
        }));
}
203
+ // ─── Evaluation ─────────────────────────────────────────────────────────────
204
/**
 * Run the tribunal evaluators over a set of files and classify each CWE as
 * matched or missed.
 *
 * A CWE counts as matched when at least one finding either matches it via
 * findingMatchesCwes() or carries a rule prefix expected for that CWE
 * (CYBER/SEC when the CWE has no entry in CWE_TO_PREFIXES).
 *
 * @param {{ content: string, language: string }[]} files - Files to evaluate.
 * @param {string[]} cwes - CWE identifiers of the CVE.
 * @returns {{ findings: object[], matchedCwes: string[], missedCwes: string[] }}
 *   All findings plus the normalized CWEs split by detection outcome.
 */
function evaluateCveCode(files, cwes) {
    const findings = [];
    files.forEach(({ content, language }) => {
        findings.push(...evaluateWithTribunal(content, language).findings);
    });
    const prefixOf = (finding) => finding.ruleId.split("-")[0];
    const matchedCwes = [];
    const missedCwes = [];
    for (const id of cwes.map(normalizeCwe)) {
        const relevantPrefixes = CWE_TO_PREFIXES[id] ?? ["CYBER", "SEC"];
        const hit = findings.some((finding) => findingMatchesCwes(finding, [id]) || relevantPrefixes.includes(prefixOf(finding)));
        (hit ? matchedCwes : missedCwes).push(id);
    }
    return { findings, matchedCwes, missedCwes };
}
225
/**
 * Evaluate one CVE: check out the vulnerable (pre-patch) commit, run the
 * evaluators on the weakness files, then repeat on the fixed (post-patch)
 * commit to look for findings that survive the fix.
 *
 * The `cwes` field of the result is always in normalized form, for error
 * results as well as successful ones, so consumers see one consistent
 * spelling.
 *
 * @param {object} cve - CVE descriptor (uses `CVE`, `CWEs`, `prePatch`, `postPatch`).
 * @param {string} sourcesDir - Directory holding one clone per CVE.
 * @returns {object} Result record. `error` is set when the pre-patch commit
 *   could not be checked out or none of the weakness files exist; such
 *   records have `detected: false` and every CWE listed as missed.
 */
export function evaluateSingleCve(cve, sourcesDir) {
    const language = detectLanguage(cve.prePatch.weaknesses[0]?.location.file ?? "index.js");
    const normalizedCwes = cve.CWEs.map(normalizeCwe);
    // Shared shape for every "could not evaluate" outcome.
    const failure = (error) => ({
        cve: cve.CVE,
        cwes: normalizedCwes,
        language,
        detected: false,
        cleanOnPatch: true,
        prePatchFindings: [],
        postPatchFindings: [],
        matchedCwes: [],
        missedCwes: [...normalizedCwes],
        error,
    });
    // Checkout pre-patch (vulnerable) code
    const prePatchDir = checkoutCommit(cve, cve.prePatch.commit, sourcesDir);
    if (!prePatchDir) {
        return failure("Failed to checkout pre-patch commit");
    }
    const prePatchFiles = readWeaknessFiles(prePatchDir, cve.prePatch.weaknesses);
    if (prePatchFiles.length === 0) {
        return failure("Weakness files not found in checkout");
    }
    // Evaluate pre-patch. File contents are already in memory, so the
    // post-patch checkout below (same working copy) cannot affect this result.
    const prePatchEval = evaluateCveCode(prePatchFiles, cve.CWEs);
    const detected = prePatchEval.matchedCwes.length > 0;
    // Checkout post-patch (fixed) code. If this checkout fails or the files
    // are gone, the CVE is reported as clean on patch.
    const postPatchDir = checkoutCommit(cve, cve.postPatch.commit, sourcesDir);
    let postPatchFindings = [];
    let cleanOnPatch = true;
    if (postPatchDir) {
        // The same files that carried the weakness are re-read after the fix.
        const postPatchFiles = readWeaknessFiles(postPatchDir, cve.prePatch.weaknesses);
        if (postPatchFiles.length > 0) {
            const expectedPrefixes = getExpectedPrefixes(cve.CWEs);
            const postEval = evaluateCveCode(postPatchFiles, cve.CWEs);
            // Only count findings with relevant prefixes as FPs on patched code
            postPatchFindings = postEval.findings.filter((f) => expectedPrefixes.includes(f.ruleId.split("-")[0]));
            cleanOnPatch = postPatchFindings.length === 0;
        }
    }
    return {
        cve: cve.CVE,
        cwes: normalizedCwes,
        language,
        detected,
        cleanOnPatch,
        prePatchFindings: prePatchEval.findings,
        postPatchFindings,
        matchedCwes: prePatchEval.matchedCwes,
        missedCwes: prePatchEval.missedCwes,
    };
}
287
+ // ─── Aggregate Results ──────────────────────────────────────────────────────
288
/**
 * Aggregate per-CVE results into benchmark metrics.
 *
 * Results carrying an `error` are counted as skipped and excluded from every
 * rate. Among evaluated results:
 *  - true positive  = detected and clean on the patched code
 *  - false positive = detected but still flagged on the patched code
 *  - false negative = not detected
 * Precision and recall default to 1 when their denominator is 0.
 *
 * @param {object[]} results - Records produced by evaluateSingleCve().
 * @returns {object} Totals, rates, a per-CWE breakdown and the raw results,
 *   stamped with the current time.
 */
export function computeOpenSSFMetrics(results) {
    const evaluated = results.filter((entry) => !entry.error);
    let detectedCount = 0;
    let cleanCount = 0;
    let truePositives = 0;
    const perCwe = {};
    for (const entry of evaluated) {
        if (entry.detected) {
            detectedCount += 1;
            if (entry.cleanOnPatch) {
                truePositives += 1;
            }
        }
        if (entry.cleanOnPatch) {
            cleanCount += 1;
        }
        // Per-CWE tallies
        for (const cwe of entry.cwes) {
            const bucket = (perCwe[cwe] ||= { total: 0, detected: 0, rate: 0 });
            bucket.total += 1;
            if (entry.matchedCwes.includes(cwe)) {
                bucket.detected += 1;
            }
        }
    }
    Object.values(perCwe).forEach((bucket) => {
        bucket.rate = bucket.total > 0 ? bucket.detected / bucket.total : 0;
    });
    const evaluatedCount = evaluated.length;
    const falsePositives = detectedCount - truePositives;
    const falseNegatives = evaluatedCount - detectedCount;
    const ratio = (numerator, denominator, whenEmpty) => (denominator > 0 ? numerator / denominator : whenEmpty);
    const detectionRate = ratio(detectedCount, evaluatedCount, 0);
    const precision = ratio(truePositives, truePositives + falsePositives, 1);
    const recall = ratio(truePositives, truePositives + falseNegatives, 1);
    const f1Score = ratio(2 * precision * recall, precision + recall, 0);
    return {
        timestamp: new Date().toISOString(),
        totalCves: results.length,
        evaluated: evaluatedCount,
        skipped: results.length - evaluatedCount,
        detected: detectedCount,
        missed: falseNegatives,
        cleanOnPatch: cleanCount,
        falsePositiveOnPatch: evaluatedCount - cleanCount,
        detectionRate,
        precision,
        recall,
        f1Score,
        perCwe,
        results,
    };
}
331
+ // ─── BenchmarkCase Conversion ───────────────────────────────────────────────
332
/**
 * Convert OpenSSF CVEs into BenchmarkCase[] format for use in the Judges
 * LLM benchmark pipeline. A CVE is skipped when:
 *  - its pre-patch (vulnerable) commit cannot be checked out, or
 *  - none of its weakness files exist in the checkout.
 *
 * The first weakness file becomes the case's `code`; any further files are
 * attached as `files`. Contents longer than 4000 characters are truncated
 * and suffixed with a marker comment.
 *
 * @param {object[]} cves - CVE descriptors from loadCveFiles().
 * @param {string} sourcesDir - Directory holding one clone per CVE.
 * @returns {object[]} One benchmark case per usable CVE.
 */
export function convertToBenchmarkCases(cves, sourcesDir) {
    const MAX_CHARS = 4000;
    const clip = (text, marker) => (text.length > MAX_CHARS ? text.slice(0, MAX_CHARS) + marker : text);
    const cases = [];
    for (const cve of cves) {
        const prePatchDir = checkoutCommit(cve, cve.prePatch.commit, sourcesDir);
        if (!prePatchDir) {
            continue;
        }
        const files = readWeaknessFiles(prePatchDir, cve.prePatch.weaknesses);
        if (files.length === 0) {
            continue;
        }
        const expectedPrefixes = getExpectedPrefixes(cve.CWEs);
        if (expectedPrefixes.length === 0) {
            continue;
        }
        // The primary weakness file supplies the main code; the rest ride along.
        const [primaryFile, ...extraFiles] = files;
        const summary = cve.prePatch.weaknesses[0]?.explanation ?? "security vulnerability";
        const benchmarkCase = {
            id: `openssf-${cve.CVE.toLowerCase()}`,
            description: `Real-world CVE: ${cve.CVE} (${cve.CWEs.join(", ")}) — ${summary}`,
            language: primaryFile.language,
            code: clip(primaryFile.content, "\n// ... truncated for benchmark"),
            // Expected rule IDs are synthesized from the CWE→prefix mapping.
            expectedRuleIds: expectedPrefixes.map((prefix) => `${prefix}-001`),
            acceptablePrefixes: [...new Set([...expectedPrefixes, "CYBER", "SEC"])],
            category: inferCategory(cve.CWEs),
            difficulty: "hard",
            aiSource: "openssf-cve-benchmark",
        };
        if (extraFiles.length > 0) {
            benchmarkCase.files = extraFiles.map((extra) => ({
                path: extra.path,
                content: clip(extra.content, "\n// ... truncated"),
                language: extra.language,
            }));
        }
        cases.push(benchmarkCase);
    }
    return cases;
}
383
/**
 * Pick a benchmark category from a CVE's CWE list.
 *
 * CWEs are normalized and inspected in order; the first one with a known
 * category decides the result.
 *
 * @param {string[]} cwes - CWE identifiers of a CVE.
 * @returns {string} Category name, or "security" when no CWE is recognized.
 */
function inferCategory(cwes) {
    const categoryByCwe = new Map([
        ["CWE-89", "injection"],
        ["CWE-78", "injection"],
        ["CWE-94", "injection"],
        ["CWE-134", "injection"],
        ["CWE-79", "xss"],
        ["CWE-22", "path-traversal"],
        ["CWE-73", "path-traversal"],
        ["CWE-287", "auth"],
        ["CWE-798", "auth"],
        ["CWE-862", "auth"],
        ["CWE-863", "auth"],
        ["CWE-327", "crypto"],
        ["CWE-328", "crypto"],
        ["CWE-916", "crypto"],
        ["CWE-330", "crypto"],
        ["CWE-502", "prototype-pollution"],
        ["CWE-915", "prototype-pollution"],
        ["CWE-471", "prototype-pollution"],
        ["CWE-200", "data-exposure"],
        ["CWE-209", "data-exposure"],
        ["CWE-312", "data-exposure"],
        ["CWE-532", "data-exposure"],
        ["CWE-400", "denial-of-service"],
        ["CWE-770", "denial-of-service"],
        ["CWE-185", "denial-of-service"],
        ["CWE-1333", "denial-of-service"],
        ["CWE-918", "ssrf"],
        ["CWE-362", "concurrency"],
        ["CWE-667", "concurrency"],
    ]);
    for (const id of cwes.map(normalizeCwe)) {
        const category = categoryByCwe.get(id);
        if (category !== undefined) {
            return category;
        }
    }
    return "security";
}
409
+ // ─── Report Formatting ──────────────────────────────────────────────────────
410
/**
 * Render benchmark metrics as a Markdown report.
 *
 * Sections: header, summary table, per-CWE table (most frequent CWE first,
 * omitted when empty), up to 20 missed CVEs, and up to 10 CVEs that still
 * produced findings on patched code. Results with an `error` are left out
 * of the last two lists.
 *
 * @param {object} result - Metrics object from computeOpenSSFMetrics().
 * @returns {string} Markdown text ending with a newline.
 */
export function formatOpenSSFReport(result) {
    const percent = (fraction, digits) => `${(fraction * 100).toFixed(digits)}%`;
    // Append a titled bullet section showing at most `limit` rows.
    const bulletSection = (title, rows, limit, render) => {
        if (rows.length === 0) {
            return [];
        }
        const shown = rows.slice(0, limit).map(render);
        const overflow = rows.length > limit ? [`- ... and ${rows.length - limit} more`] : [];
        return [title, "", ...shown, ...overflow, ""];
    };
    const header = [
        "# OpenSSF CVE Benchmark Report",
        "",
        `**Date:** ${result.timestamp}`,
        `**CVEs Evaluated:** ${result.evaluated} of ${result.totalCves} (${result.skipped} skipped)`,
        "",
    ];
    const summary = [
        "## Summary",
        "",
        `| Metric | Value |`,
        `|--------|-------|`,
        `| Detection Rate | ${percent(result.detectionRate, 1)} (${result.detected}/${result.evaluated}) |`,
        `| Precision | ${percent(result.precision, 1)} |`,
        `| Recall | ${percent(result.recall, 1)} |`,
        `| F1 Score | ${percent(result.f1Score, 1)} |`,
        `| Clean on Patch | ${result.cleanOnPatch}/${result.evaluated} |`,
        `| False Positives on Patch | ${result.falsePositiveOnPatch} |`,
        "",
    ];
    // Per-CWE breakdown, largest sample first
    const byTotalDesc = Object.entries(result.perCwe).sort((a, b) => b[1].total - a[1].total);
    const perCweTable = byTotalDesc.length === 0
        ? []
        : [
            "## Per-CWE Detection Rates",
            "",
            "| CWE | Total | Detected | Rate |",
            "|-----|-------|----------|------|",
            ...byTotalDesc.map(([cwe, data]) => `| ${cwe} | ${data.total} | ${data.detected} | ${percent(data.rate, 0)} |`),
            "",
        ];
    const evaluatedOnly = (r) => !r.error;
    const missedSection = bulletSection(
        "## Missed CVEs",
        result.results.filter((r) => evaluatedOnly(r) && !r.detected),
        20,
        (r) => `- **${r.cve}** (${r.cwes.join(", ")}): ${r.language}`,
    );
    const fpSection = bulletSection(
        "## False Positives on Patched Code",
        result.results.filter((r) => evaluatedOnly(r) && !r.cleanOnPatch),
        10,
        (r) => `- **${r.cve}**: ${[...new Set(r.postPatchFindings.map((f) => f.ruleId))].join(", ")}`,
    );
    return [...header, ...summary, ...perCweTable, ...missedSection, ...fpSection].join("\n");
}
469
+ // ─── CLI Entry Point ────────────────────────────────────────────────────────
470
/**
 * CLI entry point for `judges openssf-cve <subcommand> [options]`.
 *
 * `argv[3]` selects the subcommand (default "run"); options are parsed from
 * `argv[4]` onwards. "convert" emits BenchmarkCase JSON; every other
 * subcommand value runs the evaluation and prints or saves the metrics in
 * the requested format.
 *
 * An option flag given without a value is reported with a clear message and
 * exit code 1, instead of failing later with an unrelated TypeError (for
 * `--repo`) or being silently ignored.
 *
 * NOTE(review): the help text advertises a `report` subcommand, but there is
 * no dedicated branch for it here. It currently behaves like `run`.
 *
 * @param {string[]} argv - Full process argument vector.
 * @returns {void} Exits the process on `--help`, on a missing repo, on a
 *   missing option value, or when no CVE matches.
 */
export function runOpenSSFCveBenchmark(argv) {
    const subcommand = argv[3] || "run";
    if (subcommand === "--help" || subcommand === "-h") {
        console.log(`
Judges Panel — OpenSSF CVE Benchmark

Evaluates Judges against 200+ real-world CVEs from the OpenSSF CVE Benchmark.
Requires the benchmark repo to be cloned locally.

USAGE:
judges openssf-cve run [options] Run benchmark against all CVEs
judges openssf-cve convert [options] Convert CVEs to BenchmarkCase format
judges openssf-cve report [options] Generate markdown report from results

OPTIONS:
--repo, -r <path> Path to the ossf-cve-benchmark repo (default: ../ossf-cve-benchmark)
--cve <id> Evaluate a single CVE (e.g. CVE-2018-16492)
--output, -o <path> Save results to file
--format <fmt> Output: text, json, markdown (default: text)

SETUP:
git clone https://github.com/ossf-cve-benchmark/ossf-cve-benchmark.git
cd ossf-cve-benchmark && npm i && npm run build
`);
        process.exit(0);
    }
    let repoPath = resolve("..", "ossf-cve-benchmark");
    let singleCve;
    let outputPath;
    let format = "text";
    // Fetch the value that must follow an option flag, or bail out clearly.
    const requireValue = (flag, value) => {
        if (value === undefined) {
            console.error(`Missing value for option ${flag}`);
            process.exit(1);
        }
        return value;
    };
    for (let i = 4; i < argv.length; i++) {
        const arg = argv[i];
        if (arg === "--repo" || arg === "-r") {
            repoPath = resolve(requireValue(arg, argv[++i]));
        }
        else if (arg === "--cve") {
            singleCve = requireValue(arg, argv[++i]);
        }
        else if (arg === "--output" || arg === "-o") {
            outputPath = requireValue(arg, argv[++i]);
        }
        else if (arg === "--format") {
            format = requireValue(arg, argv[++i]);
        }
    }
    if (!existsSync(repoPath)) {
        console.error(`OpenSSF CVE Benchmark repo not found at: ${repoPath}`);
        console.error("Clone it with:");
        console.error(" git clone https://github.com/ossf-cve-benchmark/ossf-cve-benchmark.git");
        process.exit(1);
    }
    // Write `text` to --output (announcing it with `savedNotice`) or print it.
    const emit = (text, savedNotice) => {
        if (outputPath) {
            writeFileSync(outputPath, text, "utf-8");
            console.log(savedNotice);
        }
        else {
            console.log(text);
        }
    };
    const cves = loadCveFiles(repoPath);
    console.log(`Loaded ${cves.length} published CVEs from ${repoPath}`);
    const sourcesDir = ensureSourcesDir(repoPath);
    if (subcommand === "convert") {
        console.log("Converting CVEs to BenchmarkCase format...");
        const cases = convertToBenchmarkCases(cves, sourcesDir);
        emit(JSON.stringify(cases, null, 2), `Wrote ${cases.length} benchmark cases to ${outputPath}`);
        return;
    }
    // Run evaluation
    const targetCves = singleCve ? cves.filter((c) => c.CVE === singleCve) : cves;
    if (targetCves.length === 0) {
        console.error(singleCve ? `CVE ${singleCve} not found in dataset` : "No CVEs to evaluate");
        process.exit(1);
    }
    console.log(`Evaluating ${targetCves.length} CVEs...`);
    const results = [];
    for (let i = 0; i < targetCves.length; i++) {
        const cve = targetCves[i];
        const pct = Math.round(((i + 1) / targetCves.length) * 100);
        // First write shows progress; the second overwrites it with the outcome.
        process.stdout.write(`\r[${i + 1}/${targetCves.length}] ${pct}% ${cve.CVE}`);
        const result = evaluateSingleCve(cve, sourcesDir);
        results.push(result);
        const icon = result.error ? "⚠️" : result.detected ? "✅" : "❌";
        process.stdout.write(`\r[${i + 1}/${targetCves.length}] ${pct}% ${icon} ${cve.CVE} \n`);
    }
    const metrics = computeOpenSSFMetrics(results);
    if (format === "json") {
        emit(JSON.stringify(metrics, null, 2), `Results saved to ${outputPath}`);
    }
    else if (format === "markdown") {
        emit(formatOpenSSFReport(metrics), `Report saved to ${outputPath}`);
    }
    else {
        // Text summary (also used for any unrecognized --format value)
        console.log("\n─── OpenSSF CVE Benchmark Results ───\n");
        console.log(` Evaluated: ${metrics.evaluated}/${metrics.totalCves} CVEs`);
        console.log(` Detected: ${metrics.detected} (${(metrics.detectionRate * 100).toFixed(1)}%)`);
        console.log(` Missed: ${metrics.missed}`);
        console.log(` Clean on Patch: ${metrics.cleanOnPatch}/${metrics.evaluated}`);
        console.log(` FP on Patch: ${metrics.falsePositiveOnPatch}`);
        console.log(` Precision: ${(metrics.precision * 100).toFixed(1)}%`);
        console.log(` Recall: ${(metrics.recall * 100).toFixed(1)}%`);
        console.log(` F1 Score: ${(metrics.f1Score * 100).toFixed(1)}%`);
        if (outputPath) {
            writeFileSync(outputPath, JSON.stringify(metrics, null, 2), "utf-8");
            console.log(`\nFull results saved to ${outputPath}`);
        }
    }
}
588
+ // ─── Adapter Registration ───────────────────────────────────────────────────
589
/**
 * Read the `version` field of the `package.json` found in the current
 * working directory.
 *
 * NOTE(review): the path is resolved against process.cwd(), not against this
 * module's location. Confirm that this is the intended package.json.
 *
 * @returns {string} The version, or "unknown" when the file is missing,
 *   unreadable, not valid JSON, or has no version.
 */
function readJudgesVersion() {
    const fallback = "unknown";
    try {
        const manifestPath = resolve("package.json");
        const { version } = JSON.parse(readFileSync(manifestPath, "utf-8"));
        return version ?? fallback;
    }
    catch {
        return fallback;
    }
}
598
/**
 * Convert OpenSSFBenchmarkResult → ExternalBenchmarkResult for the registry.
 *
 * Field mapping worth knowing: `truePositives` is the number of detected
 * CVEs, `falsePositives` is the number of evaluated CVEs that still produced
 * findings on patched code, and `falseNegatives` is the number of missed
 * CVEs. The complete metrics object is attached as `rawData`.
 *
 * NOTE(review): these counts are not the same quantities that
 * computeOpenSSFMetrics() uses internally for `precision`/`recall`. Confirm
 * that consumers do not recompute the rates from them.
 *
 * @param {object} metrics - Result of computeOpenSSFMetrics().
 * @returns {object} Record in the shape expected by the benchmark registry.
 */
export function toExternalResult(metrics) {
    const {
        timestamp,
        totalCves,
        evaluated,
        skipped,
        precision,
        recall,
        f1Score,
        detectionRate,
        detected,
        falsePositiveOnPatch,
        missed,
        perCwe,
    } = metrics;
    return {
        suiteId: "openssf-cve",
        suiteName: "OpenSSF CVE Benchmark",
        suiteUrl: "https://github.com/ossf-cve-benchmark/ossf-cve-benchmark",
        timestamp,
        judgesVersion: readJudgesVersion(),
        totalItems: totalCves,
        evaluatedItems: evaluated,
        skippedItems: skipped,
        precision,
        recall,
        f1Score,
        detectionRate,
        truePositives: detected,
        falsePositives: falsePositiveOnPatch,
        falseNegatives: missed,
        perCategory: perCwe,
        rawData: metrics,
    };
}
622
/**
 * Adapter that exposes the OpenSSF CVE benchmark to the external-benchmark
 * registry. It is registered as a side effect of loading this module.
 */
const openSSFAdapter = {
    suiteId: "openssf-cve",
    suiteName: "OpenSSF CVE Benchmark",
    suiteUrl: "https://github.com/ossf-cve-benchmark/ossf-cve-benchmark",
    defaultRepoPath: "../ossf-cve-benchmark",
    description: "200+ real-world JS/TS CVEs with pre-patch and post-patch commits",
    /**
     * Check that `repoPath` looks like a benchmark checkout.
     * @param {string} repoPath - Candidate repository path.
     * @returns {string | undefined} A problem description, or undefined when valid.
     */
    validate(repoPath) {
        const repoExists = existsSync(repoPath);
        if (!repoExists) {
            return `Repo not found at ${repoPath}. Clone with: git clone https://github.com/ossf-cve-benchmark/ossf-cve-benchmark.git`;
        }
        const hasCveDir = existsSync(join(repoPath, "CVEs"));
        if (!hasCveDir) {
            return `CVEs directory not found at ${repoPath}/CVEs. Is this the correct repo?`;
        }
        return undefined;
    },
    /**
     * Evaluate all CVEs (or only `config.singleItem`) and return registry-shaped results.
     * @param {{ repoPath: string, singleItem?: string }} config - Run configuration.
     * @returns {object} Output of toExternalResult(); empty metrics when nothing matched.
     */
    run(config) {
        const cves = loadCveFiles(config.repoPath);
        console.log(` Loaded ${cves.length} published CVEs`);
        const sourcesDir = ensureSourcesDir(config.repoPath);
        const targetCves = config.singleItem ? cves.filter((c) => c.CVE === config.singleItem) : cves;
        const total = targetCves.length;
        if (total === 0) {
            return toExternalResult(computeOpenSSFMetrics([]));
        }
        console.log(` Evaluating ${total} CVEs...`);
        const results = [];
        targetCves.forEach((cve, index) => {
            const position = index + 1;
            const pct = Math.round((position / total) * 100);
            // Progress line first, then the same line rewritten with the outcome icon.
            process.stdout.write(`\r [${position}/${total}] ${pct}% ${cve.CVE}`);
            const result = evaluateSingleCve(cve, sourcesDir);
            results.push(result);
            const icon = result.error ? "⚠️" : result.detected ? "✅" : "❌";
            process.stdout.write(`\r [${position}/${total}] ${pct}% ${icon} ${cve.CVE} \n`);
        });
        return toExternalResult(computeOpenSSFMetrics(results));
    },
};
registerBenchmarkAdapter(openSSFAdapter);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@kevinrabun/judges-cli",
3
- "version": "3.128.3",
3
+ "version": "3.129.0",
4
4
  "description": "CLI wrapper for the Judges code review toolkit.",
5
5
  "type": "module",
6
6
  "main": "dist/cli.js",