@kevinrabun/judges 3.126.1 → 3.127.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/api.d.ts CHANGED
@@ -75,6 +75,7 @@ export { parseLlmRuleIds, constructPerJudgePrompt, constructTribunalPrompt, sele
75
75
  export type { LlmBenchmarkSnapshot, LlmCaseResult } from "./commands/llm-benchmark.js";
76
76
  export type { LlmFinding, ValidationResult } from "./probabilistic/llm-response-validator.js";
77
77
  export type { PromptAmendment, OptimizerInsight, OptimizationResult, AmendmentStore, } from "./commands/llm-benchmark-optimizer.js";
78
+ export { optimizeBenchmark, mergeAmendments, createEmptyStore, formatAmendmentSection, } from "./commands/llm-benchmark-optimizer.js";
78
79
  export { runReviewAutopilot, dedupeComments, filterAlreadyPostedComments } from "./commands/review.js";
79
80
  export { buildContextSnippets } from "./context/context-snippets.js";
80
81
  export { EmbeddingCache, FallbackEmbeddingProvider, getOrCreateEmbedding } from "./context/embedding-cache.js";
package/dist/api.js CHANGED
@@ -83,6 +83,7 @@ export { compareCapabilities, formatComparisonReport, formatFullComparisonMatrix
83
83
  export { runBenchmarkSuite, benchmarkGate, formatBenchmarkReport, formatBenchmarkMarkdown, analyzeL2Coverage, formatL2CoverageReport, ingestFindingsAsBenchmarkCases, deduplicateIngestCases, BENCHMARK_CASES, } from "./commands/benchmark.js";
84
84
  // ─── LLM Benchmark ──────────────────────────────────────────────────────────
85
85
  export { parseLlmRuleIds, constructPerJudgePrompt, constructTribunalPrompt, selectStratifiedSample, selectJudgesForCategory, scoreLlmCase, computeLlmMetrics, formatLlmSnapshotMarkdown, formatLayerComparisonMarkdown, extractValidatedLlmFindings, getValidRulePrefixes, getTribunalValidPrefixes, } from "./commands/llm-benchmark.js";
86
+ export { optimizeBenchmark, mergeAmendments, createEmptyStore, formatAmendmentSection, } from "./commands/llm-benchmark-optimizer.js";
86
87
  // Review autopilot (GitHub App / scripts)
87
88
  export { runReviewAutopilot, dedupeComments, filterAlreadyPostedComments } from "./commands/review.js";
88
89
  export { buildContextSnippets } from "./context/context-snippets.js";
@@ -7,6 +7,7 @@
7
7
  export const COMMAND_TABLE = {
8
8
  "adoption-report": ["./commands/adoption-report.js", "runAdoptionReport"],
9
9
  "adoption-track": ["./commands/adoption-track.js", "runAdoptionTrack"],
10
+ "codify-amendments": ["./commands/codify-amendments.js", "runCodifyAmendments"],
10
11
  "ai-gate": ["./commands/ai-gate.js", "runAiGate"],
11
12
  "ai-model-trust": ["./commands/ai-model-trust.js", "runAiModelTrust"],
12
13
  "ai-output-compare": ["./commands/ai-output-compare.js", "runAiOutputCompare"],
@@ -0,0 +1,20 @@
1
+ /**
2
+ * `judges codify-amendments` — Bake benchmark-learned amendments into judge source files.
3
+ *
4
+ * Reads amendments from either:
5
+ * - The VS Code global storage (llm-benchmark-amendments.json)
6
+ * - A specified JSON file (--file path)
7
+ *
8
+ * For each amendment, writes a BENCHMARK-LEARNED PRECISION GUIDANCE section into
9
+ * the judge's agent .judge.md file (before ADVERSARIAL MANDATE when present,
10
+ * otherwise at the end). Run generate:agents afterwards to sync the .ts files.
11
+ *
12
+ * This makes self-teaching improvements part of the distributed package
13
+ * rather than local-only amendment files.
14
+ *
15
+ * Usage:
16
+ * judges codify-amendments # from VS Code global storage
17
+ * judges codify-amendments --file amend.json # from file
18
+ * judges codify-amendments --dry-run # preview without writing
19
+ */
20
+ export declare function runCodifyAmendments(argv: string[]): void;
@@ -0,0 +1,162 @@
1
+ /**
2
+ * `judges codify-amendments` — Bake benchmark-learned amendments into judge source files.
3
+ *
4
+ * Reads amendments from either:
5
+ * - The VS Code global storage (llm-benchmark-amendments.json)
6
+ * - A specified JSON file (--file path)
7
+ *
8
+ * For each amendment, writes a BENCHMARK-LEARNED PRECISION GUIDANCE section into
9
+ * the judge's agent .judge.md file (before ADVERSARIAL MANDATE when present,
10
+ * otherwise at the end). Run generate:agents afterwards to sync the .ts files.
11
+ *
12
+ * This makes self-teaching improvements part of the distributed package
13
+ * rather than local-only amendment files.
14
+ *
15
+ * Usage:
16
+ * judges codify-amendments # from VS Code global storage
17
+ * judges codify-amendments --file amend.json # from file
18
+ * judges codify-amendments --dry-run # preview without writing
19
+ */
20
+ import { readFileSync, writeFileSync, existsSync } from "fs";
21
+ import { resolve, join } from "path";
22
+ import { fileURLToPath } from "url";
23
+ import { dirname } from "path";
24
+ const __filename = fileURLToPath(import.meta.url);
25
+ const __dirname = dirname(__filename);
26
/**
 * Locate the `agents/` directory.
 *
 * Probes, in order: two levels above this module's directory, one level above
 * it, and the current working directory. The first path that exists wins.
 *
 * @returns {string} Absolute path of the first existing candidate.
 * @throws {Error} When none of the candidates exists.
 */
function findAgentsDir() {
    // Each entry is the list of path segments that precede "agents".
    const searchRoots = [
        [__dirname, "..", ".."],
        [__dirname, ".."],
        [process.cwd()],
    ];
    const hit = searchRoots
        .map((segments) => resolve(...segments, "agents"))
        .find((candidate) => existsSync(candidate));
    if (hit === undefined) {
        throw new Error("Cannot find agents/ directory. Run from the repo root.");
    }
    return hit;
}
42
/**
 * Load amendments from a JSON file or from the VS Code global storage location.
 *
 * The JSON document must be an object with an `amendments` array; that array
 * is returned as-is.
 *
 * When no `filePath` is given, the following locations are probed in order and
 * the first existing file is used:
 *   1. `<APPDATA or HOME>/Code/User/globalStorage/...` (the historical lookup,
 *      which is the real location on Windows),
 *   2. `<HOME>/Library/Application Support/Code/User/globalStorage/...` on macOS,
 *   3. `<XDG_CONFIG_HOME or HOME/.config>/Code/User/globalStorage/...` on other
 *      non-Windows platforms.
 *
 * @param {string} [filePath] Optional path to an amendment store JSON file.
 * @returns {Array} The `amendments` array of the store.
 * @throws {Error} When no storage location can be determined, no store file
 *   exists, or the store does not contain an `amendments` array. JSON syntax
 *   errors and file read errors propagate unchanged.
 */
function loadAmendments(filePath) {
    // Parse a store file and make sure callers always receive an array.
    const readStore = (path) => {
        const store = JSON.parse(readFileSync(path, "utf8"));
        if (!store || !Array.isArray(store.amendments)) {
            throw new Error(`Invalid amendment store at ${path}: expected an "amendments" array.`);
        }
        return store.amendments;
    };
    if (filePath) {
        return readStore(resolve(filePath));
    }
    const storeTail = ["Code", "User", "globalStorage", "kevinrabun.judges-panel", "llm-benchmark-amendments.json"];
    const env = process.env;
    const home = env.HOME;
    const candidates = [];
    // Historical lookup first so setups that already worked keep resolving identically.
    const legacyBase = env.APPDATA || home;
    if (legacyBase) {
        candidates.push(join(legacyBase, ...storeTail));
    }
    if (process.platform === "darwin") {
        if (home) {
            candidates.push(join(home, "Library", "Application Support", ...storeTail));
        }
    }
    else if (process.platform !== "win32") {
        const configBase = env.XDG_CONFIG_HOME || (home ? join(home, ".config") : undefined);
        if (configBase) {
            candidates.push(join(configBase, ...storeTail));
        }
    }
    if (candidates.length === 0) {
        throw new Error("Cannot determine global storage path. Use --file to specify.");
    }
    const globalPath = candidates.find((candidate) => existsSync(candidate));
    if (!globalPath) {
        throw new Error(`No amendments found at ${candidates.join(" or ")}. Run an LLM benchmark first, or use --file.`);
    }
    return readStore(globalPath);
}
61
/**
 * Codify a single amendment into the matching judge's `.judge.md` file.
 *
 * The target file is the first `*.judge.md` file in `agentsDir` whose text
 * contains `rulePrefix: <prefix>` (optionally double-quoted) for exactly this
 * prefix. A "BENCHMARK-LEARNED PRECISION GUIDANCE (<prefix>):" block is then
 *   - replaced in place when one already exists for the prefix,
 *   - otherwise inserted before the first "ADVERSARIAL MANDATE:" marker,
 *   - otherwise appended to the end of the file.
 *
 * @param {string} agentsDir Directory containing the `*.judge.md` files.
 * @param {object} amendment Amendment record; reads `judgePrefix`, `amendment`,
 *   `generatedFrom`, `fpRate` (fraction, rendered as a percentage) and
 *   `timestamp` (string; its first 10 characters are written).
 * @param {boolean} dryRun When true, only logs a preview and writes nothing.
 * @returns {boolean} `true` when a target file was found (and written unless
 *   `dryRun`), `false` when no file matches the prefix.
 */
function codifyAmendment(agentsDir, amendment, dryRun) {
    const prefix = amendment.judgePrefix;
    // The prefix is data, not a pattern: escape it before building any RegExp.
    const escapedPrefix = String(prefix).replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    // The negative lookahead keeps prefix "SEC" from matching "rulePrefix: SECRETS".
    const frontmatterPattern = new RegExp(`rulePrefix: "?${escapedPrefix}"?(?![A-Za-z0-9_-])`);
    // Map prefix to judge file
    const files = existsSync(agentsDir) ? readdirSync(agentsDir) : [];
    let targetFile;
    let content;
    for (const file of files) {
        if (!file.endsWith(".judge.md"))
            continue;
        const text = readFileSync(join(agentsDir, file), "utf8");
        if (frontmatterPattern.test(text)) {
            targetFile = file;
            content = text;
            break;
        }
    }
    if (!targetFile) {
        console.error(` ⚠ No agent file found for prefix ${prefix} — skipping`);
        return false;
    }
    const filePath = join(agentsDir, targetFile);
    // Build the codified section
    const codifiedBlock = [
        "",
        `BENCHMARK-LEARNED PRECISION GUIDANCE (${prefix}):`,
        `- ${amendment.amendment}`,
        `- Source: ${amendment.generatedFrom} | FP rate: ${(amendment.fpRate * 100).toFixed(0)}% | Generated: ${amendment.timestamp.slice(0, 10)}`,
    ].join("\n");
    // Check if already codified for this prefix
    if (content.includes(`BENCHMARK-LEARNED PRECISION GUIDANCE (${prefix})`)) {
        console.log(` ♻ ${targetFile} — already has codified amendment for ${prefix}, replacing`);
        const headingSource = `\\nBENCHMARK-LEARNED PRECISION GUIDANCE \\(${escapedPrefix}\\):`;
        // Preferred form: the block runs through its closing "- Source:" line, so
        // multi-line amendment text is removed completely and the blank line after
        // the block survives. The tempered token stops the match from running into
        // another prefix's block.
        const blockWithSourceLine = new RegExp(`${headingSource}(?:(?!\\nBENCHMARK-LEARNED PRECISION GUIDANCE \\()[\\s\\S])*?\\n- Source: [^\\n]*`);
        // Fallback for blocks without a "- Source:" line: stop before the next line
        // that starts with an uppercase letter, or at the end of the file.
        const blockUntilNextHeading = new RegExp(`${headingSource}[\\s\\S]*?(?=\\n[A-Z]|$)`);
        const blockPattern = blockWithSourceLine.test(content) ? blockWithSourceLine : blockUntilNextHeading;
        // A replacer function keeps "$&", "$1", ... in the amendment text literal.
        content = content.replace(blockPattern, () => codifiedBlock);
    }
    else if (content.includes("ADVERSARIAL MANDATE:")) {
        // Insert before ADVERSARIAL MANDATE if it exists
        content = content.replace("ADVERSARIAL MANDATE:", () => codifiedBlock + "\n\nADVERSARIAL MANDATE:");
    }
    else {
        // No marker: append to the end of the file
        content = content.trimEnd() + "\n" + codifiedBlock + "\n";
    }
    if (dryRun) {
        console.log(` 📝 [DRY RUN] Would update ${targetFile}`);
        console.log(` ${codifiedBlock.split("\n").slice(1, 3).join("\n ")}`);
    }
    else {
        writeFileSync(filePath, content);
        console.log(` ✅ Updated ${targetFile}`);
    }
    return true;
}
118
+ import { readdirSync } from "fs";
119
/**
 * CLI entry point for `judges codify-amendments`.
 *
 * Recognised arguments:
 *   --file <path>  read amendments from the given JSON file instead of the
 *                  VS Code global storage
 *   --dry-run      preview the changes without writing any file
 *
 * Prints a banner, loads the amendments, lists them, codifies each one into
 * its agent file, and prints a summary plus follow-up steps. Exits the process
 * with code 1 when the arguments are invalid, the amendments cannot be loaded,
 * or the agents directory cannot be found; exits with code 0 when there is
 * nothing to codify.
 *
 * @param {string[]} argv Command-line arguments for this command.
 * @returns {void}
 */
export function runCodifyAmendments(argv) {
    const dryRun = argv.includes("--dry-run");
    const fileIdx = argv.indexOf("--file");
    const filePath = fileIdx >= 0 ? argv[fileIdx + 1] : undefined;
    console.log("");
    console.log("╔══════════════════════════════════════════════════════════════╗");
    console.log("║ Judges — Codify Benchmark Amendments ║");
    console.log("╚══════════════════════════════════════════════════════════════╝");
    console.log("");
    let amendments;
    try {
        // A bare "--file" must not silently fall back to the global storage.
        if (fileIdx >= 0 && (!filePath || filePath.startsWith("--"))) {
            throw new Error("--file requires a path argument.");
        }
        amendments = loadAmendments(filePath);
        // Guard the `.length` / iteration below against a malformed store.
        if (!Array.isArray(amendments)) {
            throw new Error('Amendment store does not contain an "amendments" array.');
        }
    }
    catch (e) {
        console.error(`Error: ${e instanceof Error ? e.message : String(e)}`);
        process.exit(1);
        return; // only reached when process.exit is stubbed
    }
    if (amendments.length === 0) {
        console.log(" No amendments to codify. Run an LLM benchmark to generate them.");
        process.exit(0);
        return; // only reached when process.exit is stubbed
    }
    console.log(` Found ${amendments.length} amendment(s) to codify${dryRun ? " (dry run)" : ""}:`);
    for (const a of amendments) {
        console.log(` ${a.judgePrefix}: ${a.reason}`);
    }
    console.log("");
    let agentsDir;
    try {
        agentsDir = findAgentsDir();
    }
    catch (e) {
        // Report a missing agents/ directory like any other CLI error instead of
        // letting the exception escape as a stack trace.
        console.error(`Error: ${e instanceof Error ? e.message : String(e)}`);
        process.exit(1);
        return; // only reached when process.exit is stubbed
    }
    let codified = 0;
    for (const amendment of amendments) {
        if (codifyAmendment(agentsDir, amendment, dryRun)) {
            codified++;
        }
    }
    console.log("");
    console.log(` ${dryRun ? "Would codify" : "Codified"} ${codified}/${amendments.length} amendment(s) into agent files.`);
    if (!dryRun && codified > 0) {
        console.log(" Next steps:");
        console.log(" 1. npm run generate:agents:force — sync .ts files from .judge.md");
        console.log(" 2. npm run build — rebuild");
        console.log(" 3. npm test — verify");
        console.log(" 4. Commit and release");
    }
    console.log("");
}
@@ -11,9 +11,9 @@
11
11
  import { JUDGES } from "../judges/index.js";
12
12
  // ─── Thresholds ─────────────────────────────────────────────────────────────
13
13
  /** Judges below this precision get amendments */
14
- const AMENDMENT_PRECISION_THRESHOLD = 0.4;
14
+ const AMENDMENT_PRECISION_THRESHOLD = 0.7;
15
15
  /** Minimum findings before generating amendment (avoid noise) */
16
- const MIN_FINDINGS_FOR_AMENDMENT = 5;
16
+ const MIN_FINDINGS_FOR_AMENDMENT = 3;
17
17
  /** Categories below this F1 get flagged */
18
18
  const CATEGORY_F1_THRESHOLD = 0.5;
19
19
  /** Difficulty detection rate below this gets flagged */
@@ -148,13 +148,15 @@ export function parseLlmRuleIds(response) {
148
148
  const validPrefixes = getValidRulePrefixes();
149
149
  const pattern = /\b([A-Z][A-Z0-9]+)-(\d{1,3})\b/g;
150
150
  const found = new Set();
151
- // Split response into paragraphs/sections and skip sections that explicitly
152
- // declare zero findings rule IDs mentioned in "zero findings" rationale
153
- // are explanatory references, not actual detections.
154
- const sections = response.split(/\n{2,}/);
155
- const zeroFindingsPattern = /\*?\*?(?:ZERO|zero|0|no)\s+findings?\*?\*?|(?:findings?|issues?)[\s:]*\*?\*?(?:none|0|zero)\*?\*?|no\s+(?:issues?|findings?|problems?|concerns?)\s+(?:found|detected|identified|reported)/i;
151
+ // Split response into judge sections (separated by --- dividers or ## headers)
152
+ // and skip entire sections that declare zero/no findings. This prevents rule
153
+ // IDs mentioned in rationale text or findings tables of "clean" judge sections
154
+ // from being counted as detections.
155
+ const sections = response.split(/(?:^|\n)---\s*\n|(?=^## )/m);
156
+ const zeroFindingsPattern = /\*?\*?(?:ZERO|zero|0|no)\s+findings?\*?\*?|(?:findings?|issues?)[\s:]*\*?\*?(?:none|0|zero)\*?\*?|no\s+(?:issues?|findings?|problems?|concerns?)\s+(?:found|detected|identified|reported)|report(?:ing)?\s+zero|Score\s*[|:]\s*\*?\*?100\s*\/?\s*100\*?\*?/i;
156
157
  for (const section of sections) {
157
- // If this section explicitly declares zero/no findings, skip rule ID extraction
158
+ // If this section explicitly declares zero/no findings or a perfect score,
159
+ // skip rule ID extraction — any rule IDs are explanatory references
158
160
  if (zeroFindingsPattern.test(section))
159
161
  continue;
160
162
  let match;
@@ -165,13 +167,17 @@ export function parseLlmRuleIds(response) {
165
167
  }
166
168
  }
167
169
  }
168
- // Secondary pass on full text: extract known prefixes from compound IDs like DEPS-TYPO-001
169
- // These are almost always in findings tables, not rationale
170
- const compoundPattern = /\b([A-Z][A-Z0-9]+)-[A-Z][A-Z0-9]+-(\d{1,3})\b/g;
171
- let match;
172
- while ((match = compoundPattern.exec(response)) !== null) {
173
- if (validPrefixes.has(match[1])) {
174
- found.add(`${match[1]}-${match[2]}`);
170
+ // Secondary pass: extract compound IDs like DEPS-TYPO-001 from sections
171
+ // that did NOT declare zero findings (reuse the filtered sections)
172
+ for (const section of sections) {
173
+ if (zeroFindingsPattern.test(section))
174
+ continue;
175
+ const compoundPattern = /\b([A-Z][A-Z0-9]+)-[A-Z][A-Z0-9]+-(\d{1,3})\b/g;
176
+ let match;
177
+ while ((match = compoundPattern.exec(section)) !== null) {
178
+ if (validPrefixes.has(match[1])) {
179
+ found.add(`${match[1]}-${match[2]}`);
180
+ }
175
181
  }
176
182
  }
177
183
  return [...found];
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@kevinrabun/judges",
3
- "version": "3.126.1",
3
+ "version": "3.127.0",
4
4
  "description": "45 specialized judges that evaluate AI-generated code for security, cost, and quality.",
5
5
  "mcpName": "io.github.KevinRabun/judges",
6
6
  "type": "module",
package/server.json CHANGED
@@ -16,12 +16,12 @@
16
16
  "mimeType": "image/png"
17
17
  }
18
18
  ],
19
- "version": "3.126.1",
19
+ "version": "3.127.0",
20
20
  "packages": [
21
21
  {
22
22
  "registryType": "npm",
23
23
  "identifier": "@kevinrabun/judges",
24
- "version": "3.126.1",
24
+ "version": "3.127.0",
25
25
  "transport": {
26
26
  "type": "stdio"
27
27
  }