@kevinrabun/judges-cli 3.124.5 → 3.126.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93) hide show
  1. package/agents/accessibility.judge.md +1 -1
  2. package/agents/agent-instructions.judge.md +1 -1
  3. package/agents/ai-code-safety.judge.md +10 -1
  4. package/agents/api-design.judge.md +1 -1
  5. package/agents/authentication.judge.md +1 -1
  6. package/agents/backwards-compatibility.judge.md +1 -1
  7. package/agents/caching.judge.md +1 -1
  8. package/agents/ci-cd.judge.md +1 -1
  9. package/agents/cloud-readiness.judge.md +1 -1
  10. package/agents/code-structure.judge.md +1 -1
  11. package/agents/compliance.judge.md +1 -1
  12. package/agents/concurrency.judge.md +1 -1
  13. package/agents/configuration-management.judge.md +1 -1
  14. package/agents/cost-effectiveness.judge.md +9 -1
  15. package/agents/cybersecurity.judge.md +1 -1
  16. package/agents/data-security.judge.md +1 -1
  17. package/agents/data-sovereignty.judge.md +1 -1
  18. package/agents/database.judge.md +1 -1
  19. package/agents/dependency-health.judge.md +1 -1
  20. package/agents/documentation.judge.md +1 -1
  21. package/agents/error-handling.judge.md +1 -1
  22. package/agents/ethics-bias.judge.md +1 -1
  23. package/agents/framework-safety.judge.md +9 -1
  24. package/agents/hallucination-detection.judge.md +1 -1
  25. package/agents/iac-security.judge.md +1 -1
  26. package/agents/intent-alignment.judge.md +1 -1
  27. package/agents/internationalization.judge.md +1 -1
  28. package/agents/logging-privacy.judge.md +1 -1
  29. package/agents/logic-review.judge.md +8 -0
  30. package/agents/maintainability.judge.md +10 -1
  31. package/agents/observability.judge.md +1 -1
  32. package/agents/performance.judge.md +1 -1
  33. package/agents/portability.judge.md +1 -1
  34. package/agents/rate-limiting.judge.md +1 -1
  35. package/agents/reliability.judge.md +1 -1
  36. package/agents/scalability.judge.md +1 -1
  37. package/agents/security.judge.md +1 -1
  38. package/agents/software-practices.judge.md +1 -1
  39. package/agents/testing.judge.md +1 -1
  40. package/agents/ux.judge.md +1 -1
  41. package/dist/api.d.ts +2 -1
  42. package/dist/api.js +2 -0
  43. package/dist/cli-formatters.js +38 -0
  44. package/dist/cli.js +27 -1
  45. package/dist/commands/llm-benchmark.js +18 -5
  46. package/dist/evaluators/index.js +163 -1
  47. package/dist/evaluators/shared.js +33 -0
  48. package/dist/judges/accessibility.js +1 -1
  49. package/dist/judges/agent-instructions.js +1 -1
  50. package/dist/judges/ai-code-safety.js +10 -1
  51. package/dist/judges/api-design.js +1 -1
  52. package/dist/judges/authentication.js +1 -1
  53. package/dist/judges/backwards-compatibility.js +1 -1
  54. package/dist/judges/caching.js +1 -1
  55. package/dist/judges/ci-cd.js +1 -1
  56. package/dist/judges/cloud-readiness.js +1 -1
  57. package/dist/judges/code-structure.js +1 -1
  58. package/dist/judges/compliance.js +1 -1
  59. package/dist/judges/concurrency.js +1 -1
  60. package/dist/judges/configuration-management.js +1 -1
  61. package/dist/judges/cost-effectiveness.js +9 -1
  62. package/dist/judges/cybersecurity.js +1 -1
  63. package/dist/judges/data-security.js +1 -1
  64. package/dist/judges/data-sovereignty.js +1 -1
  65. package/dist/judges/database.js +1 -1
  66. package/dist/judges/dependency-health.js +1 -1
  67. package/dist/judges/documentation.js +1 -1
  68. package/dist/judges/error-handling.js +1 -1
  69. package/dist/judges/ethics-bias.js +1 -1
  70. package/dist/judges/framework-safety.js +9 -1
  71. package/dist/judges/hallucination-detection.js +1 -1
  72. package/dist/judges/iac-security.js +1 -1
  73. package/dist/judges/intent-alignment.js +1 -1
  74. package/dist/judges/internationalization.js +1 -1
  75. package/dist/judges/logging-privacy.js +1 -1
  76. package/dist/judges/logic-review.js +9 -1
  77. package/dist/judges/maintainability.js +10 -1
  78. package/dist/judges/observability.js +1 -1
  79. package/dist/judges/performance.js +1 -1
  80. package/dist/judges/portability.js +1 -1
  81. package/dist/judges/rate-limiting.js +1 -1
  82. package/dist/judges/reliability.js +1 -1
  83. package/dist/judges/scalability.js +1 -1
  84. package/dist/judges/security.js +1 -1
  85. package/dist/judges/software-practices.js +1 -1
  86. package/dist/judges/testing.js +1 -1
  87. package/dist/judges/ux.js +1 -1
  88. package/dist/regulatory-scope.d.ts +27 -0
  89. package/dist/regulatory-scope.js +181 -0
  90. package/dist/tools/prompts.d.ts +1 -1
  91. package/dist/tools/prompts.js +3 -1
  92. package/dist/types.d.ts +87 -0
  93. package/package.json +1 -1
@@ -36,11 +36,20 @@ FALSE POSITIVE AVOIDANCE:
36
36
  - Do NOT flag configuration files, data files, or build scripts for code maintainability issues.
37
37
  - Only flag maintainability issues when you can cite specific code patterns (deep nesting, excessive coupling, duplicated logic) with exact line numbers.
38
38
 
39
+ CLEAN CODE RECOGNITION (if ALL of the following are true, report ZERO findings):
40
+ - Functions/methods have clear single responsibilities and reasonable length
41
+ - Naming is consistent and self-documenting
42
+ - No deep nesting (>3 levels) or excessive cyclomatic complexity
43
+ - No copy-pasted logic blocks
44
+ - No magic numbers in business logic (configuration constants are fine)
45
+ - Standard library and framework patterns used idiomatically
46
+ - Code reads top-to-bottom without requiring cross-referencing
47
+
39
48
  ADVERSARIAL MANDATE:
40
49
  - Your role is adversarial: assume the code is unmaintainable and actively hunt for problems. Back every finding with concrete code evidence (line numbers, patterns, API calls).
41
50
  - Never praise or compliment the code. Report only problems, risks, and deficiencies.
42
51
  - If you are uncertain whether something is an issue, flag it only when you can cite specific code evidence (line numbers, patterns, API calls). Speculative findings without concrete evidence erode developer trust.
43
- - Absence of findings does not mean the code is maintainable. It means your analysis reached its limits. State this explicitly.`,
52
+ - If no concrete issues are found after thorough analysis, report ZERO findings. An empty findings list is the correct output for well-written code \u2014 do not manufacture findings to fill the report.`,
44
53
  analyze: analyzeMaintainability,
45
54
  };
46
55
  defaultRegistry.register(maintainabilityJudge);
@@ -48,7 +48,7 @@ ADVERSARIAL MANDATE:
48
48
  - Your role is adversarial: assume the code is unobservable and will be impossible to debug in production. Actively hunt for monitoring gaps. Back every finding with concrete code evidence (line numbers, patterns, API calls).
49
49
  - Never praise or compliment the code. Report only problems, risks, and deficiencies.
50
50
  - If you are uncertain whether something is an issue, flag it only when you can cite specific code evidence (line numbers, patterns, API calls). Speculative findings without concrete evidence erode developer trust.
51
- - Absence of findings does not mean the code is observable. It means your analysis reached its limits. State this explicitly.`,
51
+ - If no concrete issues are found after thorough analysis, report ZERO findings. An empty findings list is the correct output for well-written code \u2014 do not manufacture findings to fill the report.`,
52
52
  analyze: analyzeObservability,
53
53
  };
54
54
  defaultRegistry.register(observabilityJudge);
@@ -40,7 +40,7 @@ ADVERSARIAL MANDATE:
40
40
  - Your role is adversarial: assume the code has performance problems and actively hunt for bottlenecks. Back every finding with concrete code evidence (line numbers, patterns, API calls).
41
41
  - Never praise or compliment the code. Report only problems, risks, and deficiencies.
42
42
  - If you are uncertain whether something is an issue, flag it only when you can cite specific code evidence (line numbers, patterns, API calls). Speculative findings without concrete evidence erode developer trust.
43
- - Absence of findings does not mean the code is performant. It means your analysis reached its limits. State this explicitly.`,
43
+ - If no concrete issues are found after thorough analysis, report ZERO findings. An empty findings list is the correct output for well-written code \u2014 do not manufacture findings to fill the report.`,
44
44
  analyze: analyzePerformance,
45
45
  };
46
46
  defaultRegistry.register(performanceJudge);
@@ -40,7 +40,7 @@ ADVERSARIAL MANDATE:
40
40
  - Your role is adversarial: assume the code is not portable and actively hunt for platform dependencies. Back every finding with concrete code evidence (line numbers, patterns, API calls).
41
41
  - Never praise or compliment the code. Report only problems, risks, and deficiencies.
42
42
  - If you are uncertain whether something is an issue, flag it only when you can cite specific code evidence (line numbers, patterns, API calls). Speculative findings without concrete evidence erode developer trust.
43
- - Absence of findings does not mean the code is portable. It means your analysis reached its limits. State this explicitly.`,
43
+ - If no concrete issues are found after thorough analysis, report ZERO findings. An empty findings list is the correct output for well-written code \u2014 do not manufacture findings to fill the report.`,
44
44
  analyze: analyzePortability,
45
45
  };
46
46
  defaultRegistry.register(portabilityJudge);
@@ -49,7 +49,7 @@ ADVERSARIAL MANDATE:
49
49
  - Your role is adversarial: assume rate limiting is absent or insufficient and actively hunt for problems. Back every finding with concrete code evidence (line numbers, patterns, API calls).
50
50
  - Never praise or compliment the code. Report only problems, risks, and deficiencies.
51
51
  - If you are uncertain whether something is an issue, flag it only when you can cite specific code evidence (line numbers, patterns, API calls). Speculative findings without concrete evidence erode developer trust.
52
- - Absence of findings does not mean rate limiting is adequate. It means your analysis reached its limits. State this explicitly.`,
52
+ - If no concrete issues are found after thorough analysis, report ZERO findings. An empty findings list is the correct output for well-written code \u2014 do not manufacture findings to fill the report.`,
53
53
  analyze: analyzeRateLimiting,
54
54
  };
55
55
  defaultRegistry.register(rateLimitingJudge);
@@ -51,7 +51,7 @@ ADVERSARIAL MANDATE:
51
51
  - Your role is adversarial: assume the code will fail in production and actively hunt for reliability gaps. Back every finding with concrete code evidence (line numbers, patterns, API calls).
52
52
  - Never praise or compliment the code. Report only problems, risks, and deficiencies.
53
53
  - If you are uncertain whether something is an issue, flag it only when you can cite specific code evidence (line numbers, patterns, API calls). Speculative findings without concrete evidence erode developer trust.
54
- - Absence of findings does not mean the code is reliable. It means your analysis reached its limits. State this explicitly.`,
54
+ - If no concrete issues are found after thorough analysis, report ZERO findings. An empty findings list is the correct output for well-written code \u2014 do not manufacture findings to fill the report.`,
55
55
  analyze: analyzeReliability,
56
56
  };
57
57
  defaultRegistry.register(reliabilityJudge);
@@ -46,7 +46,7 @@ ADVERSARIAL MANDATE:
46
46
  - Your role is adversarial: assume the code will not scale and actively hunt for bottlenecks. Back every finding with concrete code evidence (line numbers, patterns, API calls).
47
47
  - Never praise or compliment the code. Report only problems, risks, and deficiencies.
48
48
  - If you are uncertain whether something is an issue, flag it only when you can cite specific code evidence (line numbers, patterns, API calls). Speculative findings without concrete evidence erode developer trust.
49
- - Absence of findings does not mean the code will scale. It means your analysis reached its limits. State this explicitly.`,
49
+ - If no concrete issues are found after thorough analysis, report ZERO findings. An empty findings list is the correct output for well-written code \u2014 do not manufacture findings to fill the report.`,
50
50
  analyze: analyzeScalability,
51
51
  };
52
52
  defaultRegistry.register(scalabilityJudge);
@@ -58,7 +58,7 @@ ADVERSARIAL MANDATE:
58
58
  - Your role is adversarial: assume the code has security vulnerabilities and actively hunt for them. Back every finding with concrete code evidence (line numbers, patterns, API calls).
59
59
  - Never praise or compliment the code. Report only problems, risks, and deficiencies.
60
60
  - If you are uncertain whether something is an issue, flag it only when you can cite specific code evidence (line numbers, patterns, API calls). Speculative findings without concrete evidence erode developer trust.
61
- - Absence of findings does not mean the code is secure. It means your analysis reached its limits. State this explicitly.`,
61
+ - If no concrete issues are found after thorough analysis, report ZERO findings. An empty findings list is the correct output for well-written code \u2014 do not manufacture findings to fill the report.`,
62
62
  analyze: analyzeSecurity,
63
63
  };
64
64
  defaultRegistry.register(securityJudge);
@@ -50,7 +50,7 @@ ADVERSARIAL MANDATE:
50
50
  - Your role is adversarial: assume the code has engineering quality problems and actively hunt for them. Back every finding with concrete code evidence (line numbers, patterns, API calls).
51
51
  - Never praise or compliment the code. Report only problems, risks, and deficiencies.
52
52
  - If you are uncertain whether something is an issue, flag it only when you can cite specific code evidence (line numbers, patterns, API calls). Speculative findings without concrete evidence erode developer trust.
53
- - Absence of findings does not mean the code follows best practices. It means your analysis reached its limits. State this explicitly.`,
53
+ - If no concrete issues are found after thorough analysis, report ZERO findings. An empty findings list is the correct output for well-written code \u2014 do not manufacture findings to fill the report.`,
54
54
  analyze: analyzeSoftwarePractices,
55
55
  };
56
56
  defaultRegistry.register(softwarePracticesJudge);
@@ -48,7 +48,7 @@ ADVERSARIAL MANDATE:
48
48
  - Your role is adversarial: assume the test coverage is insufficient and actively hunt for gaps. Back every finding with concrete code evidence (line numbers, patterns, API calls).
49
49
  - Never praise or compliment the code. Report only problems, risks, and deficiencies.
50
50
  - If you are uncertain whether something is an issue, flag it only when you can cite specific code evidence (line numbers, patterns, API calls). Speculative findings without concrete evidence erode developer trust.
51
- - Absence of findings does not mean the code is well-tested. It means your analysis reached its limits. State this explicitly.`,
51
+ - If no concrete issues are found after thorough analysis, report ZERO findings. An empty findings list is the correct output for well-written code \u2014 do not manufacture findings to fill the report.`,
52
52
  analyze: analyzeTesting,
53
53
  };
54
54
  defaultRegistry.register(testingJudge);
package/dist/judges/ux.js CHANGED
@@ -40,7 +40,7 @@ ADVERSARIAL MANDATE:
40
40
  - Your role is adversarial: assume the user experience is poor and actively hunt for problems. Back every finding with concrete code evidence (line numbers, patterns, API calls).
41
41
  - Never praise or compliment the code. Report only problems, risks, and deficiencies.
42
42
  - If you are uncertain whether something is an issue, flag it only when you can cite specific code evidence (line numbers, patterns, API calls). Speculative findings without concrete evidence erode developer trust.
43
- - Absence of findings does not mean the UX is good. It means your analysis reached its limits. State this explicitly.`,
43
+ - If no concrete issues are found after thorough analysis, report ZERO findings. An empty findings list is the correct output for well-written code \u2014 do not manufacture findings to fill the report.`,
44
44
  analyze: analyzeUx,
45
45
  };
46
46
  defaultRegistry.register(uxJudge);
@@ -0,0 +1,27 @@
1
+ /**
2
+ * Regulatory Scope — Framework-aware finding filtering.
3
+ *
4
+ * When `regulatoryScope` is set in `.judgesrc`, findings whose `reference`
5
+ * or `description` text cites ONLY out-of-scope frameworks are suppressed. Findings that
6
+ * cite at least one in-scope framework (or have no regulatory reference)
7
+ * are kept.
8
+ */
9
+ import type { Finding } from "./types.js";
10
+ /** Look up supported framework IDs for listing/validation. */
11
+ export declare function getSupportedFrameworks(): Array<{
12
+ id: string;
13
+ description: string;
14
+ }>;
15
+ /**
16
+ * Filter findings based on `regulatoryScope`. Findings that cite ONLY
17
+ * out-of-scope frameworks are suppressed. Findings with no regulatory
18
+ * reference or with at least one in-scope framework are kept.
19
+ *
20
+ * @param findings - All findings from the tribunal
21
+ * @param scope - Array of framework IDs (e.g. ["GDPR", "PCI-DSS"])
22
+ * @returns Object with kept findings and count of suppressed findings
23
+ */
24
+ export declare function filterByRegulatoryScope(findings: Finding[], scope: string[]): {
25
+ findings: Finding[];
26
+ suppressed: number;
27
+ };
@@ -0,0 +1,181 @@
/**
 * Regulatory Scope — Framework-aware finding filtering.
 *
 * When `regulatoryScope` is set in `.judgesrc`, findings whose `reference`
 * or `description` text cites ONLY out-of-scope frameworks are suppressed.
 * Findings that cite at least one in-scope framework (or cite no framework
 * at all) are kept.
 *
 * Alias matching rules (see `buildAliasPattern`):
 * - Every alias is interpreted as a regular expression and tested against the
 *   lower-cased finding text, so entries such as "children.*online privacy"
 *   behave as patterns rather than as literal text.
 * - An alias that is a single alphanumeric token (an acronym such as "phi" or
 *   "pci") must appear as a whole token, so it does not fire inside unrelated
 *   words like "sophisticated".
 */
const FRAMEWORKS = [
    {
        id: "GDPR",
        aliases: [
            "gdpr",
            "general data protection",
            "article 5",
            "article 6",
            "article 8",
            "article 17",
            "article 22",
            "article 32",
            "chapter v",
            "data protection regulation",
        ],
        description: "EU General Data Protection Regulation",
    },
    {
        id: "CCPA",
        aliases: ["ccpa", "california consumer privacy", "cpra", "right to delete"],
        description: "California Consumer Privacy Act",
    },
    {
        id: "HIPAA",
        aliases: [
            "hipaa",
            "health insurance portability",
            "phi",
            "protected health information",
            "45 cfr",
            "security rule",
            "minimum necessary",
        ],
        description: "Health Insurance Portability and Accountability Act",
    },
    {
        id: "PCI-DSS",
        aliases: ["pci", "pci dss", "pci-dss", "payment card", "cardholder data", "requirement 3"],
        description: "Payment Card Industry Data Security Standard",
    },
    {
        id: "SOC2",
        aliases: ["soc 2", "soc2", "trust service", "cc6", "cc7"],
        description: "SOC 2 Trust Service Criteria",
    },
    {
        id: "SOX",
        aliases: ["sox", "sarbanes-oxley", "sarbanes oxley"],
        description: "Sarbanes-Oxley Act",
    },
    {
        id: "COPPA",
        aliases: ["coppa", "children.*online privacy", "age appropriate design"],
        description: "Children's Online Privacy Protection Act",
    },
    {
        id: "FERPA",
        aliases: ["ferpa", "family educational rights"],
        description: "Family Educational Rights and Privacy Act",
    },
    {
        id: "FedRAMP",
        aliases: ["fedramp", "fed ramp", "federal risk"],
        description: "Federal Risk and Authorization Management Program",
    },
    {
        id: "NIST",
        aliases: ["nist", "sp 800", "800-53", "800-63", "800-131", "800-122", "ssdf"],
        description: "NIST Cybersecurity Framework & Special Publications",
    },
    {
        id: "ISO27001",
        aliases: ["iso 27001", "iso27001", "iso/iec 27001"],
        description: "ISO/IEC 27001 Information Security Management",
    },
    {
        id: "ePrivacy",
        aliases: ["eprivacy", "e-privacy", "cookie.*directive", "eprivacy directive"],
        description: "EU ePrivacy Directive",
    },
    {
        id: "DORA",
        aliases: ["dora", "digital operational resilience"],
        description: "Digital Operational Resilience Act",
    },
    {
        id: "NIS2",
        aliases: ["nis2", "nis 2", "network.*information.*security"],
        description: "Network and Information Security Directive 2",
    },
    {
        id: "EU-AI-Act",
        aliases: ["eu ai act", "ai act", "artificial intelligence act"],
        description: "EU Artificial Intelligence Act",
    },
    {
        id: "LGPD",
        aliases: ["lgpd", "lei geral.*prote"],
        description: "Brazil General Data Protection Law",
    },
    {
        id: "PIPEDA",
        aliases: ["pipeda", "personal information protection.*electronic"],
        description: "Canada Personal Information Protection and Electronic Documents Act",
    },
];
// ─── Alias Compilation ──────────────────────────────────────────────────────
/** Recognises aliases made of one alphanumeric token, e.g. "phi" or "soc2". */
const SINGLE_TOKEN_ALIAS = /^[a-z0-9]+$/;
/**
 * Compile one alias into the pattern used against lower-cased finding text.
 *
 * Single-token aliases are wrapped in look-arounds so they only match when not
 * glued to other letters or digits. All other aliases are used as regular
 * expression source unchanged, which keeps plain phrases behaving like a
 * substring search while letting ".*" act as a wildcard within a line.
 */
function buildAliasPattern(alias) {
    const source = alias.toLowerCase();
    if (SINGLE_TOKEN_ALIAS.test(source)) {
        return new RegExp(`(?<![a-z0-9])${source}(?![a-z0-9])`);
    }
    return new RegExp(source);
}
/** Alias patterns per framework, compiled once at module load instead of per finding. */
const FRAMEWORK_MATCHERS = FRAMEWORKS.map((fw) => ({
    id: fw.id,
    patterns: fw.aliases.map(buildAliasPattern),
}));
/**
 * Reduce a framework ID to a comparison key: upper-case, letters and digits only.
 * "pci-dss", "PCI DSS" and "pci_dss" all become "PCIDSS".
 */
function toScopeKey(value) {
    return value.toUpperCase().replace(/[^A-Z0-9]/g, "");
}
/** Comparison key → canonical framework ID (e.g. "PCIDSS" → "PCI-DSS"). */
const FRAMEWORK_ID_BY_KEY = new Map();
for (const fw of FRAMEWORKS) {
    FRAMEWORK_ID_BY_KEY.set(toScopeKey(fw.id), fw.id);
}
/** Look up supported framework IDs for listing/validation. */
export function getSupportedFrameworks() {
    return FRAMEWORKS.map((f) => ({ id: f.id, description: f.description }));
}
// ─── Framework Detection in Finding References ──────────────────────────────
/**
 * Detect which regulatory frameworks a finding references.
 * Tests the combined, lower-cased `reference` and `description` fields against
 * each framework's alias patterns.
 *
 * @param finding - Finding whose `reference` / `description` are inspected
 * @returns Set of canonical framework IDs cited by the finding (possibly empty)
 */
function detectFrameworks(finding) {
    const detected = new Set();
    const text = `${finding.reference ?? ""} ${finding.description ?? ""}`.toLowerCase();
    if (!text.trim())
        return detected;
    for (const matcher of FRAMEWORK_MATCHERS) {
        if (matcher.patterns.some((pattern) => pattern.test(text))) {
            detected.add(matcher.id);
        }
    }
    return detected;
}
// ─── Regulatory Scope Filter ────────────────────────────────────────────────
/**
 * Filter findings based on `regulatoryScope`. Findings that cite ONLY
 * out-of-scope frameworks are suppressed. Findings with no regulatory
 * reference or with at least one in-scope framework are kept.
 *
 * Scope entries are matched to supported framework IDs ignoring case,
 * spaces and punctuation. Entries that are not strings or do not name a
 * supported framework are ignored. If no entry resolves to a supported
 * framework, nothing is filtered: an unusable scope must not silently hide
 * every regulatory finding.
 *
 * @param findings - All findings from the tribunal
 * @param scope - Array of framework IDs (e.g. ["GDPR", "PCI-DSS"])
 * @returns Object with kept findings and count of suppressed findings
 */
export function filterByRegulatoryScope(findings, scope) {
    if (!Array.isArray(scope) || scope.length === 0) {
        return { findings, suppressed: 0 };
    }
    // Resolve user-supplied IDs to canonical IDs (e.g. "pci-dss" → "PCI-DSS")
    const inScope = new Set();
    for (const entry of scope) {
        if (typeof entry !== "string")
            continue;
        const canonicalId = FRAMEWORK_ID_BY_KEY.get(toScopeKey(entry));
        if (canonicalId !== undefined)
            inScope.add(canonicalId);
    }
    if (inScope.size === 0) {
        // Nothing recognisable in scope — treat as "no scope configured"
        return { findings, suppressed: 0 };
    }
    let suppressed = 0;
    const kept = [];
    for (const finding of findings) {
        const cited = detectFrameworks(finding);
        // Keep general findings (no framework cited) and findings citing any in-scope framework
        if (cited.size === 0 || [...cited].some((id) => inScope.has(id))) {
            kept.push(finding);
        }
        else {
            suppressed++;
        }
    }
    return { findings: kept, suppressed };
}
@@ -4,7 +4,7 @@ export declare const SHARED_ADVERSARIAL_MANDATE = "ADVERSARIAL MANDATE (applies
4
4
  /** Precision override — ensures evidence-based findings. */
5
5
  export declare const PRECISION_MANDATE = "PRECISION MANDATE (this section OVERRIDES the adversarial mandate whenever they conflict):\n- Every finding MUST cite specific code evidence: exact line numbers, API calls, variable names, or patterns. Findings without concrete evidence MUST be discarded \u2014 no exceptions.\n- Do NOT flag the absence of a feature or pattern unless you can identify the specific code location where it SHOULD have been implemented and explain WHY it is required for THIS code.\n- Speculative, hypothetical, or \"just in case\" findings erode developer trust. Only flag issues you are confident exist in the actual code.\n- Prefer fewer, high-confidence findings over many uncertain ones. Quality of findings matters more than quantity.\n- If the code is genuinely well-written with no real issues, reporting ZERO findings is the correct and expected behavior. Do not manufacture findings to avoid an empty report.\n- Clean, well-structured code exists. Acknowledge it by not forcing false issues.\n- RECOGNIZE SECURE PATTERNS: Code using established security libraries and patterns (e.g. helmet, bcrypt/argon2, parameterized queries, input validation, CSRF tokens, rate limiters, proper TLS) is correctly implementing security. Do NOT flag these as insufficient or suggest alternatives unless a concrete vulnerability exists.\n- SCOPE LIMITATION: Only evaluate code that is actually present. Do NOT flag missing features, tests, logging, documentation, error handling, or infrastructure that may exist in other files. Evaluate what IS provided, not what COULD be elsewhere.\n- CONFIDENCE THRESHOLD: Only report findings where you are highly confident (\u226580%) that a real, exploitable issue or concrete deficiency exists in the provided code. When in doubt, do NOT report.\n- FALSE POSITIVE COST: A false positive is MORE harmful than a missed finding. False positives erode developer trust and cause real issues to be ignored. 
When uncertain, silence is better than a questionable finding.\n\nCOMMON FALSE POSITIVE PATTERNS (do NOT report these):\n- ERR: Do not flag error handling as inadequate when try/catch blocks, validation, or error middleware are present. Missing error handling in a utility function that is clearly called within a guarded context is NOT a finding.\n- LOGIC: Do not flag logic issues for standard patterns (early returns, guard clauses, switch/case with default). Only flag logic errors when you can demonstrate a concrete input that produces an incorrect output.\n- MAINT: Do not flag maintainability concerns for code that follows the language's established idioms. Complexity or length alone is NOT a finding unless it introduces a concrete maintenance burden.\n- SEC: Do not flag security issues when established security libraries (helmet, cors, bcrypt, parameterized queries) are correctly used. \"Could be stronger\" is NOT a vulnerability.\n- STRUCT: Do not flag code structure preferences (file organization, naming conventions) unless they create a concrete deficiency like circular dependencies or unreachable code.";
6
6
  /** Clean code gate — explicit instructions when code quality is high. */
7
- export declare const CLEAN_CODE_GATE = "CLEAN CODE GATE (applies AFTER individual judge evaluation):\n- Before reporting findings, assess the OVERALL quality of the code. If the code follows established conventions, uses appropriate patterns, handles errors, and has no concrete vulnerabilities or deficiencies, the expected output is ZERO findings across ALL judges.\n- Do NOT report stylistic preferences, alternative approaches, or \"nice to have\" improvements as findings. These are opinions, not defects.\n- Do NOT report findings about missing functionality that is likely in other files (tests, configs, middleware, error handlers, logging setup).\n- Do NOT report theoretical risks that require assumptions about the runtime environment, deployment configuration, or code outside the provided snippet.\n- SELF-CHECK before finalizing: For each finding, ask \"Would a senior engineer reviewing this code in a PR agree this must be fixed before merging?\" If the answer is not a clear YES, discard the finding.\n- The goal is to match what a thoughtful, experienced human reviewer would flag \u2014 not to demonstrate comprehensive knowledge of every possible concern.";
7
+ export declare const CLEAN_CODE_GATE = "CLEAN CODE GATE (applies AFTER individual judge evaluation):\n- Before reporting findings, assess the OVERALL quality of the code. If the code follows established conventions, uses appropriate patterns, handles errors, and has no concrete vulnerabilities or deficiencies, the expected output is ZERO findings across ALL judges.\n- Do NOT report stylistic preferences, alternative approaches, or \"nice to have\" improvements as findings. These are opinions, not defects.\n- Do NOT report findings about missing functionality that is likely in other files (tests, configs, middleware, error handlers, logging setup).\n- Do NOT report theoretical risks that require assumptions about the runtime environment, deployment configuration, or code outside the provided snippet.\n- SELF-CHECK before finalizing: For each finding, ask \"Would a senior engineer reviewing this code in a PR agree this must be fixed before merging?\" If the answer is not a clear YES, discard the finding.\n- The goal is to match what a thoughtful, experienced human reviewer would flag \u2014 not to demonstrate comprehensive knowledge of every possible concern.\n- SINGLE-FILE LIMITATION: You are reviewing a code snippet, not a complete project. Missing tests, missing docs, missing middleware, missing configs, missing CI/CD, missing logging setup \u2014 these are EXPECTED in a single-file review. Only flag what is WRONG in the code present, not what is ABSENT from the project.\n- FINAL GATE: If your evaluation produces findings for a code snippet that uses established libraries correctly, has proper error handling, follows language idioms, and contains no security vulnerabilities \u2014 your findings are almost certainly false positives. Discard them and report ZERO findings.";
8
8
  /**
9
9
  * Extract only the unique evaluation criteria from a judge's systemPrompt,
10
10
  * stripping the persona introduction line, the ADVERSARIAL MANDATE block,
@@ -44,7 +44,9 @@ export const CLEAN_CODE_GATE = `CLEAN CODE GATE (applies AFTER individual judge
44
44
  - Do NOT report findings about missing functionality that is likely in other files (tests, configs, middleware, error handlers, logging setup).
45
45
  - Do NOT report theoretical risks that require assumptions about the runtime environment, deployment configuration, or code outside the provided snippet.
46
46
  - SELF-CHECK before finalizing: For each finding, ask "Would a senior engineer reviewing this code in a PR agree this must be fixed before merging?" If the answer is not a clear YES, discard the finding.
47
- - The goal is to match what a thoughtful, experienced human reviewer would flag — not to demonstrate comprehensive knowledge of every possible concern.`;
47
+ - The goal is to match what a thoughtful, experienced human reviewer would flag — not to demonstrate comprehensive knowledge of every possible concern.
48
+ - SINGLE-FILE LIMITATION: You are reviewing a code snippet, not a complete project. Missing tests, missing docs, missing middleware, missing configs, missing CI/CD, missing logging setup — these are EXPECTED in a single-file review. Only flag what is WRONG in the code present, not what is ABSENT from the project.
49
+ - FINAL GATE: If your evaluation produces findings for a code snippet that uses established libraries correctly, has proper error handling, follows language idioms, and contains no security vulnerabilities — your findings are almost certainly false positives. Discard them and report ZERO findings.`;
48
50
  // ─── Criteria Extraction ─────────────────────────────────────────────────────
49
51
  /**
50
52
  * Extract only the unique evaluation criteria from a judge's systemPrompt,
package/dist/types.d.ts CHANGED
@@ -313,6 +313,45 @@ export interface JudgesConfig {
313
313
  url?: string;
314
314
  headers?: Record<string, string>;
315
315
  };
316
+ /**
317
+ * Regulatory frameworks in scope for this project. When set, findings that
318
+ * cite ONLY out-of-scope frameworks are suppressed, and in-scope findings
319
+ * are elevated to ensure visibility.
320
+ *
321
+ * If not set, all regulatory findings are reported (no filtering).
322
+ *
323
+ * Supported values: "GDPR", "CCPA", "HIPAA", "PCI-DSS", "SOC2", "SOX",
324
+ * "COPPA", "FERPA", "FedRAMP", "NIST", "ISO27001", "ePrivacy", "DORA",
325
+ * "NIS2", "EU-AI-Act", "LGPD", "PIPEDA"
326
+ *
327
+ * Example:
328
+ * ```json
329
+ * { "regulatoryScope": ["GDPR", "PCI-DSS", "SOC2"] }
330
+ * ```
331
+ */
332
+ regulatoryScope?: string[];
333
+ /**
334
+ * Consensus suppression threshold (0–1). When set, if at least this
335
+ * fraction of judges report zero findings for a file, findings from
336
+ * the remaining minority judges are suppressed as outliers.
337
+ *
338
+ * This reduces false positives from judges that are structurally prone
339
+ * to over-flagging clean code. A value of 0.7 means "if 70% of judges
340
+ * agree the code is clean, suppress the other 30%."
341
+ *
342
+ * Default: not set (no consensus suppression).
343
+ *
344
+ * Recommended values:
345
+ * - `0.7` — moderate: suppresses when most judges agree (good for CI)
346
+ * - `0.8` — conservative: only suppresses with strong consensus
347
+ * - `0.6` — aggressive: suppresses with slight majority
348
+ *
349
+ * Example:
350
+ * ```json
351
+ * { "consensusThreshold": 0.7 }
352
+ * ```
353
+ */
354
+ consensusThreshold?: number;
316
355
  }
317
356
  /**
318
357
  * A user-defined pattern-based rule for business logic validation.
@@ -613,6 +652,48 @@ export interface ReviewDecision {
613
652
  /** Top blocking issues (up to 3 critical/high findings) */
614
653
  blockingIssues: string[];
615
654
  }
655
+ /**
656
+ * A finding categorized for the human focus guide.
657
+ */
658
+ export interface FocusItem {
659
+ /** Rule ID (e.g. "SEC-001") */
660
+ ruleId: string;
661
+ /** Short title */
662
+ title: string;
663
+ /** Severity level */
664
+ severity: Severity;
665
+ /** Confidence score (0-1) */
666
+ confidence: number;
667
+ /** Line numbers if available */
668
+ lineNumbers?: number[];
669
+ /** Why this item is in its bucket */
670
+ reason: string;
671
+ }
672
+ /**
673
+ * An area the automated analysis could not evaluate — requires human judgment.
674
+ */
675
+ export interface BlindSpot {
676
+ /** Category label (e.g. "Business Logic", "Architectural Fit") */
677
+ area: string;
678
+ /** Description of what the reviewer should look for */
679
+ guidance: string;
680
+ /** Optional: specific lines or patterns that triggered this recommendation */
681
+ triggers?: string[];
682
+ }
683
+ /**
684
+ * Human Focus Guide — directs human reviewers to the areas where their
685
+ * attention adds the most value beyond what automated analysis provides.
686
+ */
687
+ export interface HumanFocusGuide {
688
+ /** High-confidence, evidence-backed findings the reviewer can trust */
689
+ trust: FocusItem[];
690
+ /** Lower-confidence or absence-based findings that need human verification */
691
+ verify: FocusItem[];
692
+ /** Areas the automated analysis cannot evaluate — human judgment required */
693
+ blindSpots: BlindSpot[];
694
+ /** One-paragraph summary for the reviewer */
695
+ summary: string;
696
+ }
616
697
  /**
617
698
  * The combined result from the full tribunal panel.
618
699
  */
@@ -651,6 +732,12 @@ export interface TribunalVerdict {
651
732
  * act as a primary code reviewer rather than just a warning list.
652
733
  */
653
734
  reviewDecision?: ReviewDecision;
735
+ /**
736
+ * Human Focus Guide — directs human reviewers to the areas where their
737
+ * attention adds the most value beyond what automated analysis provides.
738
+ * Categorizes findings into trust/verify/blind-spots buckets.
739
+ */
740
+ humanFocusGuide?: HumanFocusGuide;
654
741
  /**
655
742
  * AI model detection escalation. Present when the model-fingerprint judge
656
743
  * detects AI-generated code patterns (MFPR-* rules). Downstream consumers
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@kevinrabun/judges-cli",
3
- "version": "3.124.5",
3
+ "version": "3.126.0",
4
4
  "description": "CLI wrapper for the Judges code review toolkit.",
5
5
  "type": "module",
6
6
  "main": "dist/cli.js",