npm - @kevinrabun/judges-cli - Versions diffs - 3.127.0 → 3.127.2 - Mend

@kevinrabun/judges-cli 3.127.0 → 3.127.2

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (8)

package/README.md +36 -4
package/dist/commands/llm-benchmark-optimizer.js +28 -25
package/dist/commands/llm-benchmark.js +1 -1
package/dist/evaluators/index.js +1 -1
package/dist/probabilistic/llm-response-validator.js +4 -1
package/dist/reports/public-repo-report.js +3 -7
package/dist/skill-loader.js +1 -1
package/package.json +1 -1

package/README.md CHANGED Viewed

@@ -1,6 +1,6 @@
 # @kevinrabun/judges-cli
-Standalone CLI package for Judges.
+Standalone CLI package for the [Judges Panel](https://github.com/KevinRabun/judges) — 45 specialized judges that evaluate code for security, quality, compliance, and 40 more dimensions.
 ## Install
@@ -11,14 +11,46 @@ npm install -g @kevinrabun/judges-cli
 ## Usage
 ```bash
+# Evaluate code
 judges eval src/app.ts
+judges eval src/ --format sarif --output report.sarif
+judges eval src/app.ts --judge cybersecurity
+judges eval src/app.ts --preset strict --fail-on-findings
+# List judges and regulatory frameworks
 judges list
-judges hook install
+judges list --frameworks
+# Auto-fix findings
+judges fix src/app.ts --apply
 # Agentic skills
 judges skill ai-code-review --file src/app.ts
 judges skill security-review --file src/api.ts --format json
-judges skills   # list available skills
+judges skills
+# Self-teaching
+judges codify-amendments          # bake benchmark amendments into judge files
+judges codify-amendments --dry-run
 ```
-Use `@kevinrabun/judges` when you need the MCP server or programmatic API.
+## Configuration
+Create a `.judgesrc.json` in your project root:
+```json
+{
+  "preset": "strict",
+  "regulatoryScope": ["GDPR", "PCI-DSS"],
+  "disabledJudges": ["accessibility"],
+  "failOnFindings": true
+}
+```
+See the [full configuration reference](https://github.com/KevinRabun/judges#configuration) for all options.
+## Packages
+- **`@kevinrabun/judges-cli`** — This package. Binary `judges` for CI/CD pipelines.
+- **`@kevinrabun/judges`** — Programmatic API + MCP server.
+- **VS Code extension** — [`kevinrabun.judges-panel`](https://marketplace.visualstudio.com/items?itemName=kevinrabun.judges-panel).

package/dist/commands/llm-benchmark-optimizer.js CHANGED Viewed

@@ -109,6 +109,7 @@ function generateAmendment(prefix, precision, fpCount, total, snapshot) {
     const domain = judge?.domain ?? "its domain";
     // Analyze what the FPs look like — which categories get falsely flagged
     const fpCategories = new Map();
+    const tpCategories = new Map();
     // Collect specific FP case IDs for pattern extraction
     const fpCaseExamples = [];
     for (const c of snapshot.cases) {
@@ -120,35 +121,36 @@ function generateAmendment(prefix, precision, fpCount, total, snapshot) {
                 }
             }
         }
+        // Also track where this judge produces TRUE positives
+        for (const det of c.detectedRuleIds) {
+            if (det.startsWith(prefix + "-") && !c.falsePositiveRuleIds.includes(det)) {
+                tpCategories.set(c.category, (tpCategories.get(c.category) ?? 0) + 1);
+            }
+        }
     }
-    const topFpCategories = [...fpCategories.entries()]
+    // Identify categories that are FP-only (no TPs) — safe to suppress
+    const fpOnlyCategories = [...fpCategories.entries()]
+        .filter(([cat]) => !tpCategories.has(cat))
         .sort((a, b) => b[1] - a[1])
         .slice(0, 5)
         .map(([cat]) => cat);
-    // Build specific anti-FP instructions based on observed patterns
-    const categoryBlocklist = topFpCategories.length > 0
-        ? `\nDo NOT report ${prefix}- findings on code in these categories: ${topFpCategories.join(", ")}. ` +
-            `These categories fall outside ${domain} and historically produce false positives.`
-        : "";
-    // Extract specific FP patterns for concrete guidance
-    const fpRuleIds = new Set(fpCaseExamples.map((e) => e.ruleId));
-    const specificRules = [...fpRuleIds].slice(0, 5).join(", ");
-    const ruleWarning = specificRules
-        ? `\nSpecific rule IDs with high FP rates: ${specificRules}. Require >=80% confidence with exact line citations before reporting these.`
-        : "";
-    // Identify if clean cases are a problem for this judge
+    // Build targeted anti-FP instructions — only suppress on clean/FP-only categories
     const cleanFPs = fpCaseExamples.filter((e) => e.category === "clean" || e.category.startsWith("ai-negative")).length;
+    const nonCleanFPOnlyWarning = fpOnlyCategories.length > 0
+        ? `\nHistorically produces false positives on: ${fpOnlyCategories.join(", ")}. Apply extra scrutiny on these categories — require concrete evidence before reporting.`
+        : "";
     const cleanWarning = cleanFPs > 0
-        ? `\nThis judge produced ${cleanFPs} false positives on CLEAN code. Well-written code using standard patterns exists. If the code follows established best practices, report ZERO ${prefix}- findings.`
+        ? `\nThis judge produced ${cleanFPs} false positive(s) on CLEAN code. If code uses standard patterns correctly (proper error handling, established libraries, framework conventions), report ZERO ${prefix}- findings. Clean, well-written code exists — do not manufacture findings.`
         : "";
-    const amendment = `PRECISION OVERRIDE for ${judgeName} (${prefix}-): ` +
-        `Empirical precision: ${pct(precision)} (${fpCount} FP in ${total} findings). ` +
-        `SCOPE: Only report ${prefix}- findings for code that specifically involves ${domain}. ` +
-        `EVIDENCE: Every ${prefix}- finding MUST cite exact line numbers and specific code patterns.` +
-        categoryBlocklist +
-        ruleWarning +
+    // IMPORTANT: Do NOT restrict the judge from detecting real issues in vulnerable code.
+    // Only add caution for clean-code patterns, not a blanket confidence floor.
+    const amendment = `PRECISION CALIBRATION for ${judgeName} (${prefix}-): ` +
+        `Empirical precision: ${pct(precision)} in recent benchmarks. ` +
+        `IMPORTANT: Continue detecting genuine ${domain} issues in vulnerable code — do NOT reduce sensitivity to real problems. ` +
+        `CALIBRATION: The false positives come from flagging well-written code that correctly uses established patterns. ` +
+        `Before reporting ${prefix}- findings, verify the code actually has a deficiency — not just a theoretical improvement opportunity.` +
         cleanWarning +
-        ` When confidence is below 80%, OMIT the ${prefix}- finding.`;
+        nonCleanFPOnlyWarning;
     return {
         judgePrefix: prefix,
         amendment,
@@ -167,11 +169,12 @@ export function formatAmendmentSection(amendments) {
     if (amendments.length === 0)
         return "";
     const lines = [
-        "## Precision Overrides — Based on Empirical Benchmark Data",
+        "## Precision Calibration — Based on Empirical Benchmark Data",
         "",
-        "The following judges have been identified as having high false positive rates. " +
-            "Apply EXTRA scrutiny before reporting findings with these prefixes. " +
-            "False positives erode developer trust more than missed findings.",
+        "The following judges have historically produced false positives on clean code. " +
+            "Apply the calibration guidance below to avoid repeating these errors. " +
+            "IMPORTANT: These calibrations target CLEAN CODE false positives only — " +
+            "continue detecting genuine issues in vulnerable code with full sensitivity.",
         "",
     ];
     for (const a of amendments) {

package/dist/commands/llm-benchmark.js CHANGED Viewed

@@ -153,7 +153,7 @@ export function parseLlmRuleIds(response) {
     // IDs mentioned in rationale text or findings tables of "clean" judge sections
     // from being counted as detections.
     const sections = response.split(/(?:^|\n)---\s*\n|(?=^## )/m);
-    const zeroFindingsPattern = /\*?\*?(?:ZERO|zero|0|no)\s+findings?\*?\*?|(?:findings?|issues?)[\s:]*\*?\*?(?:none|0|zero)\*?\*?|no\s+(?:issues?|findings?|problems?|concerns?)\s+(?:found|detected|identified|reported)|report(?:ing)?\s+zero|Score\s*[|:]\s*\*?\*?100\s*\/?\s*100\*?\*?/i;
+    const zeroFindingsPattern = /(?:ZERO|zero|0|no) findings?|findings?[:\s]*(?:none|0|zero)|no (?:issues|findings|problems|concerns) (?:found|detected|identified|reported)|reporting? zero|Score[|: ]*100/i;
     for (const section of sections) {
         // If this section explicitly declares zero/no findings or a perfect score,
         // skip rule ID extraction — any rule IDs are explanatory references

package/dist/evaluators/index.js CHANGED Viewed

@@ -504,7 +504,7 @@ function synthesizeHumanFocusGuide(findings, code, language) {
             });
         }
         // State machines / workflow
-        const hasStateMachine = /state\s*[=:]\s*['"][^'"]+['"]|status\s*===?\s*['"]|transition|workflow|step.*next/i.test(code);
+        const hasStateMachine = /state\s*[=:]\s*['"][^'"]+['"]|status\s*===?\s*['"]|transition|workflow|step[\w\s]{0,20}next/i.test(code);
         if (hasStateMachine) {
             blindSpots.push({
                 area: "State Management / Workflow Logic",

package/dist/probabilistic/llm-response-validator.js CHANGED Viewed

@@ -5,7 +5,10 @@ const SEVERITY_SET = new Set(["critical", "high", "medium", "low", "info"]);
  * Attempt to parse a JSON payload embedded in LLM output. Supports fenced code blocks and raw JSON.
  */
 function parseJsonBlock(text) {
-    const fenceMatch = text.match(/```(?:json)?[ \t]*\n([\s\S]*?)\n[ \t]*```/i) ?? text.match(/```(?:json)?[ \t]*([\s\S]*?)```/i);
+    // Extract JSON from fenced code blocks — limit search to first 50KB to prevent ReDoS on large input
+    const searchText = text.length > 50_000 ? text.slice(0, 50_000) : text;
+    const fenceMatch = searchText.match(/```(?:json)?\s*\n([\s\S]{0,20000}?)\n\s*```/i) ??
+        searchText.match(/```(?:json)?\s*([\s\S]{0,20000}?)```/i);
     if (fenceMatch) {
         try {
             return JSON.parse(fenceMatch[1]);

package/dist/reports/public-repo-report.js CHANGED Viewed

@@ -216,13 +216,9 @@ function compileExcludeRegexes(patterns) {
     if (!patterns || patterns.length === 0)
         return [];
     return patterns.map((pattern) => {
-        try {
-            return new RegExp(pattern, "i");
-        }
-        catch {
-            // Invalid regex from user input — treat as literal string match
-            return new RegExp(pattern.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"), "i");
-        }
+        // Always escape user input to prevent regex injection, then compile
+        const escaped = pattern.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
+        return new RegExp(escaped, "i");
     });
 }
 function isLikelyNonProductionPath(path) {

package/dist/skill-loader.js CHANGED Viewed

@@ -25,7 +25,7 @@ export function parseSkillFrontmatter(raw) {
             i++;
             continue;
         }
-        const kv = line.match(/^([a-zA-Z_][a-zA-Z0-9_-]*)[ \t]*:[ \t]*(.*)$/);
+        const kv = line.match(/^([a-zA-Z_][a-zA-Z0-9_-]*)[ \t]*:[ \t]*(.*?)$/s);
         if (!kv) {
             i++;
             continue;

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@kevinrabun/judges-cli",
-  "version": "3.127.0",
+  "version": "3.127.2",
   "description": "CLI wrapper for the Judges code review toolkit.",
   "type": "module",
   "main": "dist/cli.js",