@kevinrabun/judges 3.126.2 → 3.127.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/api.d.ts +1 -0
- package/dist/api.js +1 -0
- package/dist/cli-dispatch.js +1 -0
- package/dist/commands/codify-amendments.d.ts +20 -0
- package/dist/commands/codify-amendments.js +162 -0
- package/dist/commands/llm-benchmark-optimizer.js +30 -27
- package/package.json +1 -1
- package/server.json +2 -2
package/dist/api.d.ts
CHANGED
|
@@ -75,6 +75,7 @@ export { parseLlmRuleIds, constructPerJudgePrompt, constructTribunalPrompt, sele
|
|
|
75
75
|
export type { LlmBenchmarkSnapshot, LlmCaseResult } from "./commands/llm-benchmark.js";
|
|
76
76
|
export type { LlmFinding, ValidationResult } from "./probabilistic/llm-response-validator.js";
|
|
77
77
|
export type { PromptAmendment, OptimizerInsight, OptimizationResult, AmendmentStore, } from "./commands/llm-benchmark-optimizer.js";
|
|
78
|
+
export { optimizeBenchmark, mergeAmendments, createEmptyStore, formatAmendmentSection, } from "./commands/llm-benchmark-optimizer.js";
|
|
78
79
|
export { runReviewAutopilot, dedupeComments, filterAlreadyPostedComments } from "./commands/review.js";
|
|
79
80
|
export { buildContextSnippets } from "./context/context-snippets.js";
|
|
80
81
|
export { EmbeddingCache, FallbackEmbeddingProvider, getOrCreateEmbedding } from "./context/embedding-cache.js";
|
package/dist/api.js
CHANGED
|
@@ -83,6 +83,7 @@ export { compareCapabilities, formatComparisonReport, formatFullComparisonMatrix
|
|
|
83
83
|
export { runBenchmarkSuite, benchmarkGate, formatBenchmarkReport, formatBenchmarkMarkdown, analyzeL2Coverage, formatL2CoverageReport, ingestFindingsAsBenchmarkCases, deduplicateIngestCases, BENCHMARK_CASES, } from "./commands/benchmark.js";
|
|
84
84
|
// ─── LLM Benchmark ──────────────────────────────────────────────────────────
|
|
85
85
|
export { parseLlmRuleIds, constructPerJudgePrompt, constructTribunalPrompt, selectStratifiedSample, selectJudgesForCategory, scoreLlmCase, computeLlmMetrics, formatLlmSnapshotMarkdown, formatLayerComparisonMarkdown, extractValidatedLlmFindings, getValidRulePrefixes, getTribunalValidPrefixes, } from "./commands/llm-benchmark.js";
|
|
86
|
+
export { optimizeBenchmark, mergeAmendments, createEmptyStore, formatAmendmentSection, } from "./commands/llm-benchmark-optimizer.js";
|
|
86
87
|
// Review autopilot (GitHub App / scripts)
|
|
87
88
|
export { runReviewAutopilot, dedupeComments, filterAlreadyPostedComments } from "./commands/review.js";
|
|
88
89
|
export { buildContextSnippets } from "./context/context-snippets.js";
|
package/dist/cli-dispatch.js
CHANGED
|
@@ -7,6 +7,7 @@
|
|
|
7
7
|
export const COMMAND_TABLE = {
|
|
8
8
|
"adoption-report": ["./commands/adoption-report.js", "runAdoptionReport"],
|
|
9
9
|
"adoption-track": ["./commands/adoption-track.js", "runAdoptionTrack"],
|
|
10
|
+
"codify-amendments": ["./commands/codify-amendments.js", "runCodifyAmendments"],
|
|
10
11
|
"ai-gate": ["./commands/ai-gate.js", "runAiGate"],
|
|
11
12
|
"ai-model-trust": ["./commands/ai-model-trust.js", "runAiModelTrust"],
|
|
12
13
|
"ai-output-compare": ["./commands/ai-output-compare.js", "runAiOutputCompare"],
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* `judges codify-amendments` — Bake benchmark-learned amendments into judge source files.
|
|
3
|
+
*
|
|
4
|
+
* Reads amendments from either:
|
|
5
|
+
* - The VS Code global storage (llm-benchmark-amendments.json)
|
|
6
|
+
* - A specified JSON file (--file path)
|
|
7
|
+
*
|
|
8
|
+
* For each amendment, appends a BENCHMARK-LEARNED section to the judge's
|
|
9
|
+
* FALSE POSITIVE AVOIDANCE block in the agent .judge.md file. Afterwards, run
|
|
10
|
+
* generate:agents to sync .ts files.
|
|
11
|
+
*
|
|
12
|
+
* This makes self-teaching improvements part of the distributed package
|
|
13
|
+
* rather than local-only amendment files.
|
|
14
|
+
*
|
|
15
|
+
* Usage:
|
|
16
|
+
* judges codify-amendments # from VS Code global storage
|
|
17
|
+
* judges codify-amendments --file amend.json # from file
|
|
18
|
+
* judges codify-amendments --dry-run # preview without writing
|
|
19
|
+
*/
|
|
20
|
+
export declare function runCodifyAmendments(argv: string[]): void;
|
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* `judges codify-amendments` — Bake benchmark-learned amendments into judge source files.
|
|
3
|
+
*
|
|
4
|
+
* Reads amendments from either:
|
|
5
|
+
* - The VS Code global storage (llm-benchmark-amendments.json)
|
|
6
|
+
* - A specified JSON file (--file path)
|
|
7
|
+
*
|
|
8
|
+
* For each amendment, appends a BENCHMARK-LEARNED section to the judge's
|
|
9
|
+
* FALSE POSITIVE AVOIDANCE block in the agent .judge.md file. Afterwards, run
|
|
10
|
+
* generate:agents to sync .ts files.
|
|
11
|
+
*
|
|
12
|
+
* This makes self-teaching improvements part of the distributed package
|
|
13
|
+
* rather than local-only amendment files.
|
|
14
|
+
*
|
|
15
|
+
* Usage:
|
|
16
|
+
* judges codify-amendments # from VS Code global storage
|
|
17
|
+
* judges codify-amendments --file amend.json # from file
|
|
18
|
+
* judges codify-amendments --dry-run # preview without writing
|
|
19
|
+
*/
|
|
20
|
+
import { readFileSync, writeFileSync, existsSync } from "fs";
|
|
21
|
+
import { resolve, join } from "path";
|
|
22
|
+
import { fileURLToPath } from "url";
|
|
23
|
+
import { dirname } from "path";
|
|
24
|
+
// ES modules do not provide __filename/__dirname, so derive both from this
// module's URL; findAgentsDir() resolves its candidate paths against __dirname.
const __filename = fileURLToPath(import.meta.url);
|
|
25
|
+
const __dirname = dirname(__filename);
|
|
26
|
+
/**
 * Locate the `agents/` directory that holds the `.judge.md` files.
 *
 * Probes, in order: two levels above this module, one level above this
 * module, and `agents/` under the current working directory. The first
 * path that exists wins.
 *
 * @returns {string} Absolute path of the first existing candidate.
 * @throws {Error} When none of the candidate paths exists.
 */
function findAgentsDir() {
    const searchRoots = [
        resolve(__dirname, "..", "..", "agents"),
        resolve(__dirname, "..", "agents"),
        resolve(process.cwd(), "agents"),
    ];
    const located = searchRoots.find((candidate) => existsSync(candidate));
    if (located === undefined) {
        throw new Error("Cannot find agents/ directory. Run from the repo root.");
    }
    return located;
}
|
|
42
|
+
/**
 * Load the amendment list from an explicit JSON file or, when no path is
 * given, from the amendment store in VS Code's global storage.
 *
 * Global-storage lookup order:
 *   1. `%APPDATA%/Code/User/globalStorage/...` when APPDATA is set (Windows).
 *   2. `$HOME/Library/Application Support/Code/...` on macOS, otherwise
 *      `$XDG_CONFIG_HOME/Code/...` or `$HOME/.config/Code/...`.
 *   3. `$HOME/Code/...` — the location earlier versions probed, kept as a
 *      last-resort fallback.
 *
 * @param {string} [filePath] Path to an amendment store JSON file.
 * @returns {Array<object>} The `amendments` array of the store.
 * @throws {Error} If neither APPDATA nor HOME is set, no store file exists,
 *   the file cannot be read or parsed, or it has no `amendments` array.
 */
function loadAmendments(filePath) {
    // Validate the shape here so callers never receive `undefined` and
    // crash later on `.length`.
    const readStore = (storePath) => {
        const store = JSON.parse(readFileSync(storePath, "utf8"));
        if (!store || !Array.isArray(store.amendments)) {
            throw new Error(`Invalid amendment store at ${storePath}: expected an "amendments" array.`);
        }
        return store.amendments;
    };
    if (filePath) {
        return readStore(resolve(filePath));
    }
    // Try VS Code global storage
    const storeTail = ["Code", "User", "globalStorage", "kevinrabun.judges-panel", "llm-benchmark-amendments.json"];
    const { APPDATA, HOME, XDG_CONFIG_HOME } = process.env;
    const candidates = [];
    if (APPDATA) {
        candidates.push(join(APPDATA, ...storeTail));
    }
    else if (HOME) {
        // VS Code does not keep user data directly under $HOME on macOS/Linux.
        if (process.platform === "darwin") {
            candidates.push(join(HOME, "Library", "Application Support", ...storeTail));
        }
        else {
            candidates.push(join(XDG_CONFIG_HOME || join(HOME, ".config"), ...storeTail));
        }
        // Legacy location probed by earlier versions of this command.
        candidates.push(join(HOME, ...storeTail));
    }
    if (candidates.length === 0) {
        throw new Error("Cannot determine global storage path. Use --file to specify.");
    }
    const globalPath = candidates.find((candidate) => existsSync(candidate));
    if (!globalPath) {
        throw new Error(`No amendments found at ${candidates[0]}. Run an LLM benchmark first, or use --file.`);
    }
    return readStore(globalPath);
}
|
|
61
|
+
/**
 * Codify a single amendment into the matching judge's `.judge.md` file.
 *
 * The target is the first `*.judge.md` file in `agentsDir` whose text
 * contains `rulePrefix: <prefix>` (bare or double-quoted). A
 * "BENCHMARK-LEARNED PRECISION GUIDANCE (<prefix>):" block is then either
 *   - replaced in place, if one already exists for this prefix,
 *   - inserted before the first "ADVERSARIAL MANDATE:" marker, or
 *   - appended to the end of the file.
 *
 * @param {string} agentsDir Directory containing the `.judge.md` files.
 * @param {object} amendment Amendment with `judgePrefix`, `amendment`,
 *   `generatedFrom`, `fpRate` (0..1) and `timestamp` (string) fields.
 * @param {boolean} dryRun When true, only log what would change.
 * @returns {boolean} `false` when no agent file matches the prefix,
 *   otherwise `true`.
 */
function codifyAmendment(agentsDir, amendment, dryRun) {
    const prefix = String(amendment.judgePrefix);
    // The prefix is interpolated into regular expressions below; escape it so
    // metacharacters in a prefix can never change what is matched.
    const escapedPrefix = prefix.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    // Match the whole prefix only: "SEC" must not select a judge whose
    // frontmatter says `rulePrefix: SECX`.
    const prefixMatcher = new RegExp(`rulePrefix: (?:"${escapedPrefix}"|${escapedPrefix}(?![A-Za-z0-9_-]))`);
    // Map prefix to judge file
    const files = existsSync(agentsDir) ? readdirSync(agentsDir) : [];
    let targetFile;
    let content;
    for (const file of files) {
        if (!file.endsWith(".judge.md"))
            continue;
        const candidate = readFileSync(join(agentsDir, file), "utf8");
        if (prefixMatcher.test(candidate)) {
            targetFile = file;
            content = candidate; // keep the text; no need to read the file twice
            break;
        }
    }
    if (!targetFile) {
        console.error(` ⚠ No agent file found for prefix ${amendment.judgePrefix} — skipping`);
        return false;
    }
    const filePath = join(agentsDir, targetFile);
    // Build the codified section
    const codifiedBlock = [
        "",
        `BENCHMARK-LEARNED PRECISION GUIDANCE (${amendment.judgePrefix}):`,
        `- ${amendment.amendment}`,
        `- Source: ${amendment.generatedFrom} | FP rate: ${(amendment.fpRate * 100).toFixed(0)}% | Generated: ${amendment.timestamp.slice(0, 10)}`,
    ].join("\n");
    // Check if already codified for this prefix
    if (content.includes(`BENCHMARK-LEARNED PRECISION GUIDANCE (${amendment.judgePrefix})`)) {
        console.log(` ♻ ${targetFile} — already has codified amendment for ${amendment.judgePrefix}, replacing`);
        const headerPattern = `\\nBENCHMARK-LEARNED PRECISION GUIDANCE \\(${escapedPrefix}\\):`;
        // A generated block always ends with its "- Source: ..." line. Amendment
        // text may itself contain newlines followed by capital letters, so the
        // block must be matched through that line rather than cut at the first
        // "\n[A-Z]". The tempered dot keeps the match from running into another
        // codified block or the ADVERSARIAL MANDATE section.
        const throughSourceLine = new RegExp(headerPattern +
            `(?:(?!\\nBENCHMARK-LEARNED PRECISION GUIDANCE \\(|\\nADVERSARIAL MANDATE:)[\\s\\S])*?` +
            `\\n- Source: [^\\n]*`);
        // Fallback for blocks without a Source line: up to the next line
        // that starts with a capital letter, or end of file.
        const untilNextSection = new RegExp(headerPattern + `[\\s\\S]*?(?=\\n[A-Z]|$)`);
        const blockPattern = throughSourceLine.test(content) ? throughSourceLine : untilNextSection;
        // Replacer function: the block is inserted literally, so "$&", "$1"
        // or "$$" inside amendment text are not treated as patterns.
        content = content.replace(blockPattern, () => codifiedBlock);
    }
    else if (content.includes("ADVERSARIAL MANDATE:")) {
        // Insert before ADVERSARIAL MANDATE if it exists
        content = content.replace("ADVERSARIAL MANDATE:", () => codifiedBlock + "\n\nADVERSARIAL MANDATE:");
    }
    else {
        // Otherwise append to the end of the file
        content = content.trimEnd() + "\n" + codifiedBlock + "\n";
    }
    if (dryRun) {
        console.log(` 📝 [DRY RUN] Would update ${targetFile}`);
        // Preview only the header and amendment lines of the block.
        console.log(` ${codifiedBlock.split("\n").slice(1, 3).join("\n ")}`);
    }
    else {
        writeFileSync(filePath, content);
        console.log(` ✅ Updated ${targetFile}`);
    }
    return true;
}
|
|
118
|
+
import { readdirSync } from "fs";
|
|
119
|
+
/**
 * CLI entry point for `judges codify-amendments`.
 *
 * Recognised arguments:
 *   --file <path>   read amendments from this JSON store instead of the
 *                   VS Code global storage
 *   --dry-run       log what would be changed without writing any file
 *
 * Prints a banner, lists the amendments found, codifies each one into its
 * agent file and finally prints the follow-up commands; it does not run
 * them itself. Exits the process with code 1 on bad usage, load errors or
 * a missing agents directory, and with code 0 when there is nothing to
 * codify.
 *
 * @param {string[]} argv Command-line arguments for this command.
 * @returns {void}
 */
export function runCodifyAmendments(argv) {
    const dryRun = argv.includes("--dry-run");
    const fileIdx = argv.indexOf("--file");
    const filePath = fileIdx >= 0 ? argv[fileIdx + 1] : undefined;
    console.log("");
    console.log("╔══════════════════════════════════════════════════════════════╗");
    console.log("║ Judges — Codify Benchmark Amendments ║");
    console.log("╚══════════════════════════════════════════════════════════════╝");
    console.log("");
    let amendments;
    try {
        // `--file` with no value (or followed directly by another flag) used to
        // fall through silently to global storage or treat the flag as a path.
        if (fileIdx >= 0 && (!filePath || filePath.startsWith("--"))) {
            throw new Error("--file requires a path argument.");
        }
        amendments = loadAmendments(filePath);
        if (!Array.isArray(amendments)) {
            throw new Error('Amendment store does not contain an "amendments" array.');
        }
    }
    catch (e) {
        console.error(`Error: ${e instanceof Error ? e.message : String(e)}`);
        process.exit(1);
        return; // only reached if process.exit is stubbed
    }
    if (amendments.length === 0) {
        console.log(" No amendments to codify. Run an LLM benchmark to generate them.");
        process.exit(0);
        return; // only reached if process.exit is stubbed
    }
    console.log(` Found ${amendments.length} amendment(s) to codify${dryRun ? " (dry run)" : ""}:`);
    for (const a of amendments) {
        console.log(` ${a.judgePrefix}: ${a.reason}`);
    }
    console.log("");
    let agentsDir;
    try {
        agentsDir = findAgentsDir();
    }
    catch (e) {
        // Report a missing agents/ directory the same way as a load failure
        // instead of letting the exception escape as a stack trace.
        console.error(`Error: ${e instanceof Error ? e.message : String(e)}`);
        process.exit(1);
        return; // only reached if process.exit is stubbed
    }
    let codified = 0;
    for (const amendment of amendments) {
        if (codifyAmendment(agentsDir, amendment, dryRun)) {
            codified++;
        }
    }
    console.log("");
    console.log(` ${dryRun ? "Would codify" : "Codified"} ${codified}/${amendments.length} amendment(s) into agent files.`);
    if (!dryRun && codified > 0) {
        console.log(" Next steps:");
        console.log(" 1. npm run generate:agents:force — sync .ts files from .judge.md");
        console.log(" 2. npm run build — rebuild");
        console.log(" 3. npm test — verify");
        console.log(" 4. Commit and release");
    }
    console.log("");
}
|
|
@@ -11,9 +11,9 @@
|
|
|
11
11
|
import { JUDGES } from "../judges/index.js";
|
|
12
12
|
// ─── Thresholds ─────────────────────────────────────────────────────────────
|
|
13
13
|
/** Judges below this precision get amendments */
|
|
14
|
-
const AMENDMENT_PRECISION_THRESHOLD = 0.
|
|
14
|
+
const AMENDMENT_PRECISION_THRESHOLD = 0.7;
|
|
15
15
|
/** Minimum findings before generating amendment (avoid noise) */
|
|
16
|
-
const MIN_FINDINGS_FOR_AMENDMENT =
|
|
16
|
+
const MIN_FINDINGS_FOR_AMENDMENT = 3;
|
|
17
17
|
/** Categories below this F1 get flagged */
|
|
18
18
|
const CATEGORY_F1_THRESHOLD = 0.5;
|
|
19
19
|
/** Difficulty detection rate below this gets flagged */
|
|
@@ -109,6 +109,7 @@ function generateAmendment(prefix, precision, fpCount, total, snapshot) {
|
|
|
109
109
|
const domain = judge?.domain ?? "its domain";
|
|
110
110
|
// Analyze what the FPs look like — which categories get falsely flagged
|
|
111
111
|
const fpCategories = new Map();
|
|
112
|
+
const tpCategories = new Map();
|
|
112
113
|
// Collect specific FP case IDs for pattern extraction
|
|
113
114
|
const fpCaseExamples = [];
|
|
114
115
|
for (const c of snapshot.cases) {
|
|
@@ -120,35 +121,36 @@ function generateAmendment(prefix, precision, fpCount, total, snapshot) {
|
|
|
120
121
|
}
|
|
121
122
|
}
|
|
122
123
|
}
|
|
124
|
+
// Also track where this judge produces TRUE positives
|
|
125
|
+
for (const det of c.detectedRuleIds) {
|
|
126
|
+
if (det.startsWith(prefix + "-") && !c.falsePositiveRuleIds.includes(det)) {
|
|
127
|
+
tpCategories.set(c.category, (tpCategories.get(c.category) ?? 0) + 1);
|
|
128
|
+
}
|
|
129
|
+
}
|
|
123
130
|
}
|
|
124
|
-
|
|
131
|
+
// Identify categories that are FP-only (no TPs) — safe to suppress
|
|
132
|
+
const fpOnlyCategories = [...fpCategories.entries()]
|
|
133
|
+
.filter(([cat]) => !tpCategories.has(cat))
|
|
125
134
|
.sort((a, b) => b[1] - a[1])
|
|
126
135
|
.slice(0, 5)
|
|
127
136
|
.map(([cat]) => cat);
|
|
128
|
-
// Build
|
|
129
|
-
const categoryBlocklist = topFpCategories.length > 0
|
|
130
|
-
? `\nDo NOT report ${prefix}- findings on code in these categories: ${topFpCategories.join(", ")}. ` +
|
|
131
|
-
`These categories fall outside ${domain} and historically produce false positives.`
|
|
132
|
-
: "";
|
|
133
|
-
// Extract specific FP patterns for concrete guidance
|
|
134
|
-
const fpRuleIds = new Set(fpCaseExamples.map((e) => e.ruleId));
|
|
135
|
-
const specificRules = [...fpRuleIds].slice(0, 5).join(", ");
|
|
136
|
-
const ruleWarning = specificRules
|
|
137
|
-
? `\nSpecific rule IDs with high FP rates: ${specificRules}. Require >=80% confidence with exact line citations before reporting these.`
|
|
138
|
-
: "";
|
|
139
|
-
// Identify if clean cases are a problem for this judge
|
|
137
|
+
// Build targeted anti-FP instructions — only suppress on clean/FP-only categories
|
|
140
138
|
const cleanFPs = fpCaseExamples.filter((e) => e.category === "clean" || e.category.startsWith("ai-negative")).length;
|
|
139
|
+
const nonCleanFPOnlyWarning = fpOnlyCategories.length > 0
|
|
140
|
+
? `\nHistorically produces false positives on: ${fpOnlyCategories.join(", ")}. Apply extra scrutiny on these categories — require concrete evidence before reporting.`
|
|
141
|
+
: "";
|
|
141
142
|
const cleanWarning = cleanFPs > 0
|
|
142
|
-
? `\nThis judge produced ${cleanFPs} false
|
|
143
|
+
? `\nThis judge produced ${cleanFPs} false positive(s) on CLEAN code. If code uses standard patterns correctly (proper error handling, established libraries, framework conventions), report ZERO ${prefix}- findings. Clean, well-written code exists — do not manufacture findings.`
|
|
143
144
|
: "";
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
`
|
|
148
|
-
|
|
149
|
-
|
|
145
|
+
// IMPORTANT: Do NOT restrict the judge from detecting real issues in vulnerable code.
|
|
146
|
+
// Only add caution for clean-code patterns, not a blanket confidence floor.
|
|
147
|
+
const amendment = `PRECISION CALIBRATION for ${judgeName} (${prefix}-): ` +
|
|
148
|
+
`Empirical precision: ${pct(precision)} in recent benchmarks. ` +
|
|
149
|
+
`IMPORTANT: Continue detecting genuine ${domain} issues in vulnerable code — do NOT reduce sensitivity to real problems. ` +
|
|
150
|
+
`CALIBRATION: The false positives come from flagging well-written code that correctly uses established patterns. ` +
|
|
151
|
+
`Before reporting ${prefix}- findings, verify the code actually has a deficiency — not just a theoretical improvement opportunity.` +
|
|
150
152
|
cleanWarning +
|
|
151
|
-
|
|
153
|
+
nonCleanFPOnlyWarning;
|
|
152
154
|
return {
|
|
153
155
|
judgePrefix: prefix,
|
|
154
156
|
amendment,
|
|
@@ -167,11 +169,12 @@ export function formatAmendmentSection(amendments) {
|
|
|
167
169
|
if (amendments.length === 0)
|
|
168
170
|
return "";
|
|
169
171
|
const lines = [
|
|
170
|
-
"## Precision
|
|
172
|
+
"## Precision Calibration — Based on Empirical Benchmark Data",
|
|
171
173
|
"",
|
|
172
|
-
"The following judges have
|
|
173
|
-
"Apply
|
|
174
|
-
"
|
|
174
|
+
"The following judges have historically produced false positives on clean code. " +
|
|
175
|
+
"Apply the calibration guidance below to avoid repeating these errors. " +
|
|
176
|
+
"IMPORTANT: These calibrations target CLEAN CODE false positives only — " +
|
|
177
|
+
"continue detecting genuine issues in vulnerable code with full sensitivity.",
|
|
175
178
|
"",
|
|
176
179
|
];
|
|
177
180
|
for (const a of amendments) {
|
package/package.json
CHANGED
package/server.json
CHANGED
|
@@ -16,12 +16,12 @@
|
|
|
16
16
|
"mimeType": "image/png"
|
|
17
17
|
}
|
|
18
18
|
],
|
|
19
|
-
"version": "3.
|
|
19
|
+
"version": "3.127.1",
|
|
20
20
|
"packages": [
|
|
21
21
|
{
|
|
22
22
|
"registryType": "npm",
|
|
23
23
|
"identifier": "@kevinrabun/judges",
|
|
24
|
-
"version": "3.
|
|
24
|
+
"version": "3.127.1",
|
|
25
25
|
"transport": {
|
|
26
26
|
"type": "stdio"
|
|
27
27
|
}
|