@kevinrabun/judges-cli 3.128.3 → 3.129.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/api.d.ts +1 -0
- package/dist/api.js +2 -0
- package/dist/cli-dispatch.js +2 -0
- package/dist/cli.js +2 -0
- package/dist/commands/codify-amendments.js +28 -5
- package/dist/commands/external-benchmarks.d.ts +118 -0
- package/dist/commands/external-benchmarks.js +296 -0
- package/dist/commands/martian-code-review-benchmark.d.ts +61 -0
- package/dist/commands/martian-code-review-benchmark.js +689 -0
- package/dist/commands/openssf-cve-benchmark.d.ts +96 -0
- package/dist/commands/openssf-cve-benchmark.js +659 -0
- package/package.json +1 -1
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* OpenSSF CVE Benchmark Integration
|
|
3
|
+
*
|
|
4
|
+
* Runs the Judges evaluation engine against the OpenSSF CVE Benchmark dataset
|
|
5
|
+
* (https://github.com/ossf-cve-benchmark/ossf-cve-benchmark) — 200+ real-world
|
|
6
|
+
* JavaScript/TypeScript CVEs with pre-patch (vulnerable) and post-patch (fixed)
|
|
7
|
+
* git commits.
|
|
8
|
+
*
|
|
9
|
+
* Two modes:
|
|
10
|
+
* 1. Deterministic (L1): Runs Judges' pattern-based evaluators against each CVE.
|
|
11
|
+
* 2. LLM integration: Converts passing CVE cases into BenchmarkCase format
|
|
12
|
+
* for inclusion in the LLM benchmark pipeline.
|
|
13
|
+
*
|
|
14
|
+
* Usage:
|
|
15
|
+
* judges openssf-cve run [--repo <path>] [--cve <id>] [--format json|text|markdown]
|
|
16
|
+
* judges openssf-cve convert [--repo <path>] # Convert to BenchmarkCase[]
|
|
17
|
+
*/
|
|
18
|
+
import type { Finding } from "../types.js";
|
|
19
|
+
import type { BenchmarkCase } from "./benchmark.js";
|
|
20
|
+
import type { ExternalBenchmarkResult } from "./external-benchmarks.js";
|
|
21
|
+
/**
 * Raw JSON from a CVE file in the OpenSSF benchmark repo.
 *
 * NOTE(review): because this describes raw JSON, the member names (including
 * the upper-case `CVE` and `CWEs`) presumably mirror the keys in the data
 * files and should not be renamed — confirm against the dataset.
 */
export interface OpenSSFCve {
    /** CVE identifier. The exact format is not constrained by this type. */
    CVE: string;
    /** Publication state of the CVE record. */
    state: "PUBLISHED" | "DRAFT" | "RESERVED";
    /** Repository the CVE applies to — presumably a clone URL; confirm against the dataset. */
    repository: string;
    /** The pre-patch (vulnerable) revision. */
    prePatch: {
        /** Git commit of the pre-patch revision. */
        commit: string;
        /** Weaknesses recorded for the pre-patch revision. */
        weaknesses: Array<{
            /** Where the weakness is located. */
            location: {
                /** File containing the weakness — presumably relative to the repository root; confirm. */
                file: string;
                /** Line of the weakness — presumably 1-based; confirm. */
                line: number;
            };
            /** Explanation of the weakness. */
            explanation: string;
        }>;
    };
    /** The post-patch (fixed) revision. */
    postPatch: {
        /** Git commit of the post-patch revision. */
        commit: string;
    };
    /** CWE identifiers associated with this CVE. */
    CWEs: string[];
}
|
|
41
|
+
/**
 * Result of evaluating a single CVE: what was found on the pre-patch
 * (vulnerable) code, what was found on the post-patch (fixed) code, and
 * which of the CVE's CWEs were matched.
 */
export interface CveEvalResult {
    /** Identifier of the evaluated CVE. */
    cve: string;
    /** CWE identifiers of the evaluated CVE. */
    cwes: string[];
    /** Language of the evaluated code. NOTE(review): how it is determined is not visible in this declaration. */
    language: string;
    /** Did Judges detect at least one finding matching a relevant CWE? */
    detected: boolean;
    /** Did Judges produce no false positives on the patched version? */
    cleanOnPatch: boolean;
    /** Relevant findings on the pre-patch (vulnerable) code */
    prePatchFindings: Finding[];
    /** Findings on the post-patch (fixed) code — ideally empty */
    postPatchFindings: Finding[];
    /** Which CWEs from the CVE were matched by findings */
    matchedCwes: string[];
    /** Which CWEs from the CVE were NOT matched */
    missedCwes: string[];
    /** Error message if evaluation failed */
    error?: string;
}
|
|
60
|
+
/**
 * Summary of a benchmark run: counts, rates, a per-CWE breakdown and the
 * individual per-CVE results.
 *
 * NOTE(review): the formulas behind the numeric fields (denominators, whether
 * rates are fractions or percentages) are defined in the implementation and
 * are not visible from this declaration; the notes below are hedged
 * accordingly and should be confirmed there.
 */
export interface OpenSSFBenchmarkResult {
    /** Time associated with the result — presumably an ISO-8601 string; confirm. */
    timestamp: string;
    /** Number of CVEs considered. */
    totalCves: number;
    /** Number of CVEs evaluated — presumably `totalCves - skipped`; confirm. */
    evaluated: number;
    /** Number of CVEs skipped. */
    skipped: number;
    /** Number of CVEs counted as detected. */
    detected: number;
    /** Number of CVEs counted as missed. */
    missed: number;
    /** Number of CVEs counted as clean on the patched version. */
    cleanOnPatch: number;
    /** Number of CVEs counted as having a false positive on the patched version. */
    falsePositiveOnPatch: number;
    /** Detection rate — presumably derived from `detected` and `evaluated`; confirm. */
    detectionRate: number;
    /** Precision of the run. */
    precision: number;
    /** Recall of the run. */
    recall: number;
    /** F1 score of the run. */
    f1Score: number;
    /** Breakdown keyed by string — presumably the CWE identifier; confirm. */
    perCwe: Record<string, {
        /** Total count for this key. */
        total: number;
        /** Detected count for this key. */
        detected: number;
        /** Rate for this key — presumably derived from `detected` and `total`; confirm. */
        rate: number;
    }>;
    /** The per-CVE evaluation results. */
    results: CveEvalResult[];
}
|
|
80
|
+
/**
 * Returns the CVE records for a benchmark repository.
 *
 * NOTE(review): only the signature is visible here; how files are located,
 * parsed or validated must be confirmed against the implementation.
 *
 * @param repoPath - Path to the benchmark repository — presumably a local
 *   checkout of the OpenSSF CVE Benchmark; confirm.
 * @returns The CVE records as `OpenSSFCve` objects.
 */
export declare function loadCveFiles(repoPath: string): OpenSSFCve[];
|
|
81
|
+
/**
 * Evaluates one CVE and returns the outcome as a `CveEvalResult`.
 *
 * NOTE(review): `CveEvalResult` has an optional `error` field for failed
 * evaluations, but whether this function populates it or throws is not
 * visible from this declaration — confirm against the implementation.
 *
 * @param cve - The CVE record to evaluate.
 * @param sourcesDir - Directory for source code — presumably where the CVE
 *   repositories are checked out; confirm.
 * @returns The evaluation result for `cve`.
 */
export declare function evaluateSingleCve(cve: OpenSSFCve, sourcesDir: string): CveEvalResult;
|
|
82
|
+
/**
 * Builds an `OpenSSFBenchmarkResult` from per-CVE evaluation results.
 *
 * NOTE(review): the metric formulas and the handling of an empty `results`
 * array are not visible from this declaration — confirm against the
 * implementation.
 *
 * @param results - Per-CVE evaluation results.
 * @returns The benchmark result computed from `results`.
 */
export declare function computeOpenSSFMetrics(results: CveEvalResult[]): OpenSSFBenchmarkResult;
|
|
83
|
+
/**
 * Convert OpenSSF CVE records into BenchmarkCase[] format for use
 * in the Judges LLM benchmark pipeline. Only includes CVEs where:
 *   - The vulnerable code was successfully checked out
 *   - At least one weakness file was found
 *   - CWEs map to known judge prefixes
 *
 * @param cves - CVE records to convert.
 * @param sourcesDir - Directory for source code — presumably where the
 *   vulnerable revisions are checked out; confirm against the implementation.
 * @returns The benchmark cases for the CVEs that meet the conditions above.
 */
export declare function convertToBenchmarkCases(cves: OpenSSFCve[], sourcesDir: string): BenchmarkCase[];
|
|
91
|
+
/**
 * Formats a benchmark result as a string.
 *
 * NOTE(review): the CLI usage lists `--format json|text|markdown`, but which
 * of those formats this function produces is not visible from the
 * declaration — confirm against the implementation.
 *
 * @param result - The benchmark result to format.
 * @returns The formatted report.
 */
export declare function formatOpenSSFReport(result: OpenSSFBenchmarkResult): string;
|
|
92
|
+
/**
 * Runs the OpenSSF CVE benchmark command.
 *
 * NOTE(review): presumably the handler behind `judges openssf-cve run|convert`;
 * which options are parsed and what is written to stdout or disk is not
 * visible from this declaration — confirm against the implementation.
 *
 * @param argv - Argument vector — presumably the CLI arguments; whether it
 *   includes the command name itself must be confirmed against the caller.
 */
export declare function runOpenSSFCveBenchmark(argv: string[]): void;
|
|
93
|
+
/**
 * Convert OpenSSFBenchmarkResult → ExternalBenchmarkResult for the registry.
 *
 * @param metrics - The OpenSSF benchmark result to convert.
 * @returns The same run expressed as an `ExternalBenchmarkResult`.
 */
export declare function toExternalResult(metrics: OpenSSFBenchmarkResult): ExternalBenchmarkResult;
|