@kevinrabun/judges-cli 3.127.0 → 3.127.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # @kevinrabun/judges-cli
2
2
 
3
- Standalone CLI package for Judges.
3
+ Standalone CLI package for the [Judges Panel](https://github.com/KevinRabun/judges) — 45 specialized judges that evaluate code for security, quality, compliance, and 40 more dimensions.
4
4
 
5
5
  ## Install
6
6
 
@@ -11,14 +11,46 @@ npm install -g @kevinrabun/judges-cli
11
11
  ## Usage
12
12
 
13
13
  ```bash
14
+ # Evaluate code
14
15
  judges eval src/app.ts
16
+ judges eval src/ --format sarif --output report.sarif
17
+ judges eval src/app.ts --judge cybersecurity
18
+ judges eval src/app.ts --preset strict --fail-on-findings
19
+
20
+ # List judges and regulatory frameworks
15
21
  judges list
16
- judges hook install
22
+ judges list --frameworks
23
+
24
+ # Auto-fix findings
25
+ judges fix src/app.ts --apply
17
26
 
18
27
  # Agentic skills
19
28
  judges skill ai-code-review --file src/app.ts
20
29
  judges skill security-review --file src/api.ts --format json
21
- judges skills # list available skills
30
+ judges skills
31
+
32
+ # Self-teaching
33
+ judges codify-amendments # bake benchmark amendments into judge files
34
+ judges codify-amendments --dry-run
22
35
  ```
23
36
 
24
- Use `@kevinrabun/judges` when you need the MCP server or programmatic API.
37
+ ## Configuration
38
+
39
+ Create a `.judgesrc.json` in your project root:
40
+
41
+ ```json
42
+ {
43
+ "preset": "strict",
44
+ "regulatoryScope": ["GDPR", "PCI-DSS"],
45
+ "disabledJudges": ["accessibility"],
46
+ "failOnFindings": true
47
+ }
48
+ ```
49
+
50
+ See the [full configuration reference](https://github.com/KevinRabun/judges#configuration) for all options.
51
+
52
+ ## Packages
53
+
54
+ - **`@kevinrabun/judges-cli`** — This package. Binary `judges` for CI/CD pipelines.
55
+ - **`@kevinrabun/judges`** — Programmatic API + MCP server.
56
+ - **VS Code extension** — [`kevinrabun.judges-panel`](https://marketplace.visualstudio.com/items?itemName=kevinrabun.judges-panel).
@@ -109,6 +109,7 @@ function generateAmendment(prefix, precision, fpCount, total, snapshot) {
109
109
  const domain = judge?.domain ?? "its domain";
110
110
  // Analyze what the FPs look like — which categories get falsely flagged
111
111
  const fpCategories = new Map();
112
+ const tpCategories = new Map();
112
113
  // Collect specific FP case IDs for pattern extraction
113
114
  const fpCaseExamples = [];
114
115
  for (const c of snapshot.cases) {
@@ -120,35 +121,36 @@ function generateAmendment(prefix, precision, fpCount, total, snapshot) {
120
121
  }
121
122
  }
122
123
  }
124
+ // Also track where this judge produces TRUE positives
125
+ for (const det of c.detectedRuleIds) {
126
+ if (det.startsWith(prefix + "-") && !c.falsePositiveRuleIds.includes(det)) {
127
+ tpCategories.set(c.category, (tpCategories.get(c.category) ?? 0) + 1);
128
+ }
129
+ }
123
130
  }
124
- const topFpCategories = [...fpCategories.entries()]
131
+ // Identify categories that are FP-only (no TPs) — safe to suppress
132
+ const fpOnlyCategories = [...fpCategories.entries()]
133
+ .filter(([cat]) => !tpCategories.has(cat))
125
134
  .sort((a, b) => b[1] - a[1])
126
135
  .slice(0, 5)
127
136
  .map(([cat]) => cat);
128
- // Build specific anti-FP instructions based on observed patterns
129
- const categoryBlocklist = topFpCategories.length > 0
130
- ? `\nDo NOT report ${prefix}- findings on code in these categories: ${topFpCategories.join(", ")}. ` +
131
- `These categories fall outside ${domain} and historically produce false positives.`
132
- : "";
133
- // Extract specific FP patterns for concrete guidance
134
- const fpRuleIds = new Set(fpCaseExamples.map((e) => e.ruleId));
135
- const specificRules = [...fpRuleIds].slice(0, 5).join(", ");
136
- const ruleWarning = specificRules
137
- ? `\nSpecific rule IDs with high FP rates: ${specificRules}. Require >=80% confidence with exact line citations before reporting these.`
138
- : "";
139
- // Identify if clean cases are a problem for this judge
137
+ // Build targeted anti-FP instructions — only suppress on clean/FP-only categories
140
138
  const cleanFPs = fpCaseExamples.filter((e) => e.category === "clean" || e.category.startsWith("ai-negative")).length;
139
+ const nonCleanFPOnlyWarning = fpOnlyCategories.length > 0
140
+ ? `\nHistorically produces false positives on: ${fpOnlyCategories.join(", ")}. Apply extra scrutiny on these categories — require concrete evidence before reporting.`
141
+ : "";
141
142
  const cleanWarning = cleanFPs > 0
142
- ? `\nThis judge produced ${cleanFPs} false positives on CLEAN code. Well-written code using standard patterns exists. If the code follows established best practices, report ZERO ${prefix}- findings.`
143
+ ? `\nThis judge produced ${cleanFPs} false positive(s) on CLEAN code. If code uses standard patterns correctly (proper error handling, established libraries, framework conventions), report ZERO ${prefix}- findings. Clean, well-written code exists — do not manufacture findings.`
143
144
  : "";
144
- const amendment = `PRECISION OVERRIDE for ${judgeName} (${prefix}-): ` +
145
- `Empirical precision: ${pct(precision)} (${fpCount} FP in ${total} findings). ` +
146
- `SCOPE: Only report ${prefix}- findings for code that specifically involves ${domain}. ` +
147
- `EVIDENCE: Every ${prefix}- finding MUST cite exact line numbers and specific code patterns.` +
148
- categoryBlocklist +
149
- ruleWarning +
145
+ // IMPORTANT: Do NOT restrict the judge from detecting real issues in vulnerable code.
146
+ // Only add caution for clean-code patterns, not a blanket confidence floor.
147
+ const amendment = `PRECISION CALIBRATION for ${judgeName} (${prefix}-): ` +
148
+ `Empirical precision: ${pct(precision)} in recent benchmarks. ` +
149
+ `IMPORTANT: Continue detecting genuine ${domain} issues in vulnerable code — do NOT reduce sensitivity to real problems. ` +
150
+ `CALIBRATION: The false positives come from flagging well-written code that correctly uses established patterns. ` +
151
+ `Before reporting ${prefix}- findings, verify the code actually has a deficiency — not just a theoretical improvement opportunity.` +
150
152
  cleanWarning +
151
- ` When confidence is below 80%, OMIT the ${prefix}- finding.`;
153
+ nonCleanFPOnlyWarning;
152
154
  return {
153
155
  judgePrefix: prefix,
154
156
  amendment,
@@ -167,11 +169,12 @@ export function formatAmendmentSection(amendments) {
167
169
  if (amendments.length === 0)
168
170
  return "";
169
171
  const lines = [
170
- "## Precision Overrides — Based on Empirical Benchmark Data",
172
+ "## Precision Calibration — Based on Empirical Benchmark Data",
171
173
  "",
172
- "The following judges have been identified as having high false positive rates. " +
173
- "Apply EXTRA scrutiny before reporting findings with these prefixes. " +
174
- "False positives erode developer trust more than missed findings.",
174
+ "The following judges have historically produced false positives on clean code. " +
175
+ "Apply the calibration guidance below to avoid repeating these errors. " +
176
+ "IMPORTANT: These calibrations target CLEAN CODE false positives only — " +
177
+ "continue detecting genuine issues in vulnerable code with full sensitivity.",
175
178
  "",
176
179
  ];
177
180
  for (const a of amendments) {
@@ -153,7 +153,7 @@ export function parseLlmRuleIds(response) {
153
153
  // IDs mentioned in rationale text or findings tables of "clean" judge sections
154
154
  // from being counted as detections.
155
155
  const sections = response.split(/(?:^|\n)---\s*\n|(?=^## )/m);
156
- const zeroFindingsPattern = /\*?\*?(?:ZERO|zero|0|no)\s+findings?\*?\*?|(?:findings?|issues?)[\s:]*\*?\*?(?:none|0|zero)\*?\*?|no\s+(?:issues?|findings?|problems?|concerns?)\s+(?:found|detected|identified|reported)|report(?:ing)?\s+zero|Score\s*[|:]\s*\*?\*?100\s*\/?\s*100\*?\*?/i;
156
+ const zeroFindingsPattern = /(?:ZERO|zero|0|no) findings?|findings?[:\s]*(?:none|0|zero)|no (?:issues|findings|problems|concerns) (?:found|detected|identified|reported)|reporting? zero|Score[|: ]*100/i;
157
157
  for (const section of sections) {
158
158
  // If this section explicitly declares zero/no findings or a perfect score,
159
159
  // skip rule ID extraction — any rule IDs are explanatory references
@@ -504,7 +504,7 @@ function synthesizeHumanFocusGuide(findings, code, language) {
504
504
  });
505
505
  }
506
506
  // State machines / workflow
507
- const hasStateMachine = /state\s*[=:]\s*['"][^'"]+['"]|status\s*===?\s*['"]|transition|workflow|step.*next/i.test(code);
507
+ const hasStateMachine = /state\s*[=:]\s*['"][^'"]+['"]|status\s*===?\s*['"]|transition|workflow|step[\w\s]{0,20}next/i.test(code);
508
508
  if (hasStateMachine) {
509
509
  blindSpots.push({
510
510
  area: "State Management / Workflow Logic",
@@ -5,7 +5,10 @@ const SEVERITY_SET = new Set(["critical", "high", "medium", "low", "info"]);
5
5
  * Attempt to parse a JSON payload embedded in LLM output. Supports fenced code blocks and raw JSON.
6
6
  */
7
7
  function parseJsonBlock(text) {
8
- const fenceMatch = text.match(/```(?:json)?[ \t]*\n([\s\S]*?)\n[ \t]*```/i) ?? text.match(/```(?:json)?[ \t]*([\s\S]*?)```/i);
8
+ // Extract JSON from fenced code blocks — limit the search to the first 50,000 characters to prevent ReDoS on large input
9
+ const searchText = text.length > 50_000 ? text.slice(0, 50_000) : text;
10
+ const fenceMatch = searchText.match(/```(?:json)?\s*\n([\s\S]{0,20000}?)\n\s*```/i) ??
11
+ searchText.match(/```(?:json)?\s*([\s\S]{0,20000}?)```/i);
9
12
  if (fenceMatch) {
10
13
  try {
11
14
  return JSON.parse(fenceMatch[1]);
@@ -216,13 +216,9 @@ function compileExcludeRegexes(patterns) {
216
216
  if (!patterns || patterns.length === 0)
217
217
  return [];
218
218
  return patterns.map((pattern) => {
219
- try {
220
- return new RegExp(pattern, "i");
221
- }
222
- catch {
223
- // Invalid regex from user input — treat as literal string match
224
- return new RegExp(pattern.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"), "i");
225
- }
219
+ // Always escape user input to prevent regex injection, then compile
220
+ const escaped = pattern.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
221
+ return new RegExp(escaped, "i");
226
222
  });
227
223
  }
228
224
  function isLikelyNonProductionPath(path) {
@@ -25,7 +25,7 @@ export function parseSkillFrontmatter(raw) {
25
25
  i++;
26
26
  continue;
27
27
  }
28
- const kv = line.match(/^([a-zA-Z_][a-zA-Z0-9_-]*)[ \t]*:[ \t]*(.*)$/);
28
+ const kv = line.match(/^([a-zA-Z_][a-zA-Z0-9_-]*)[ \t]*:[ \t]*(.*?)$/s);
29
29
  if (!kv) {
30
30
  i++;
31
31
  continue;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@kevinrabun/judges-cli",
3
- "version": "3.127.0",
3
+ "version": "3.127.2",
4
4
  "description": "CLI wrapper for the Judges code review toolkit.",
5
5
  "type": "module",
6
6
  "main": "dist/cli.js",