@kevinrabun/judges 3.115.4 → 3.117.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114)
  1. package/agents/accessibility.judge.md +7 -0
  2. package/agents/agent-instructions.judge.md +7 -0
  3. package/agents/ai-code-safety.judge.md +7 -0
  4. package/agents/api-contract.judge.md +7 -0
  5. package/agents/api-design.judge.md +7 -0
  6. package/agents/authentication.judge.md +7 -0
  7. package/agents/backwards-compatibility.judge.md +7 -0
  8. package/agents/caching.judge.md +7 -0
  9. package/agents/ci-cd.judge.md +7 -0
  10. package/agents/cloud-readiness.judge.md +7 -0
  11. package/agents/concurrency.judge.md +7 -0
  12. package/agents/configuration-management.judge.md +7 -0
  13. package/agents/cybersecurity.judge.md +7 -0
  14. package/agents/data-security.judge.md +7 -0
  15. package/agents/dependency-health.judge.md +7 -0
  16. package/agents/documentation.judge.md +7 -0
  17. package/agents/error-handling.judge.md +7 -0
  18. package/agents/ethics-bias.judge.md +7 -0
  19. package/agents/false-positive-review.judge.md +12 -0
  20. package/agents/framework-safety.judge.md +7 -0
  21. package/agents/hallucination-detection.judge.md +13 -0
  22. package/agents/iac-security.judge.md +7 -0
  23. package/agents/intent-alignment.judge.md +13 -0
  24. package/agents/logging-privacy.judge.md +7 -0
  25. package/agents/maintainability.judge.md +7 -0
  26. package/agents/multi-turn-coherence.judge.md +7 -0
  27. package/agents/observability.judge.md +7 -0
  28. package/agents/portability.judge.md +7 -0
  29. package/agents/rate-limiting.judge.md +7 -0
  30. package/agents/reliability.judge.md +7 -0
  31. package/agents/security.judge.md +13 -0
  32. package/agents/testing.judge.md +7 -0
  33. package/agents/ux.judge.md +7 -0
  34. package/dist/a2a-protocol.d.ts +136 -0
  35. package/dist/a2a-protocol.js +218 -0
  36. package/dist/api.d.ts +21 -3
  37. package/dist/api.js +21 -1
  38. package/dist/audit-trail.d.ts +245 -0
  39. package/dist/audit-trail.js +257 -0
  40. package/dist/commands/benchmark-advanced.js +51 -51
  41. package/dist/commands/benchmark-ai-agents.js +16 -16
  42. package/dist/commands/benchmark-compliance-ethics.js +12 -12
  43. package/dist/commands/benchmark-expanded-2.js +2 -2
  44. package/dist/commands/benchmark-expanded.js +2 -2
  45. package/dist/commands/benchmark-infrastructure.js +12 -12
  46. package/dist/commands/benchmark-languages.js +11 -11
  47. package/dist/commands/benchmark-quality-ops.js +7 -7
  48. package/dist/commands/benchmark-security-deep.js +9 -9
  49. package/dist/commands/benchmark.js +1 -1
  50. package/dist/commands/llm-benchmark-optimizer.d.ts +78 -0
  51. package/dist/commands/llm-benchmark-optimizer.js +241 -0
  52. package/dist/commands/llm-benchmark.d.ts +4 -2
  53. package/dist/commands/llm-benchmark.js +40 -12
  54. package/dist/escalation.d.ts +100 -0
  55. package/dist/escalation.js +292 -0
  56. package/dist/evaluation-session.d.ts +74 -0
  57. package/dist/evaluation-session.js +152 -0
  58. package/dist/evaluators/index.d.ts +23 -1
  59. package/dist/evaluators/index.js +192 -3
  60. package/dist/evaluators/judge-selector.d.ts +19 -0
  61. package/dist/evaluators/judge-selector.js +141 -0
  62. package/dist/evaluators/recall-boost.d.ts +27 -0
  63. package/dist/evaluators/recall-boost.js +409 -0
  64. package/dist/feedback-loop.d.ts +62 -0
  65. package/dist/feedback-loop.js +179 -0
  66. package/dist/index.js +2 -0
  67. package/dist/judges/accessibility.js +7 -0
  68. package/dist/judges/agent-instructions.js +7 -0
  69. package/dist/judges/ai-code-safety.js +7 -0
  70. package/dist/judges/api-contract.js +7 -0
  71. package/dist/judges/api-design.js +7 -0
  72. package/dist/judges/authentication.js +7 -0
  73. package/dist/judges/backwards-compatibility.js +7 -0
  74. package/dist/judges/caching.js +7 -0
  75. package/dist/judges/ci-cd.js +7 -0
  76. package/dist/judges/cloud-readiness.js +7 -0
  77. package/dist/judges/concurrency.js +7 -0
  78. package/dist/judges/configuration-management.js +7 -0
  79. package/dist/judges/cybersecurity.js +7 -0
  80. package/dist/judges/data-security.js +7 -0
  81. package/dist/judges/dependency-health.js +7 -0
  82. package/dist/judges/documentation.js +7 -0
  83. package/dist/judges/error-handling.js +7 -0
  84. package/dist/judges/ethics-bias.js +7 -0
  85. package/dist/judges/false-positive-review.js +13 -1
  86. package/dist/judges/framework-safety.js +7 -0
  87. package/dist/judges/hallucination-detection.js +14 -1
  88. package/dist/judges/iac-security.js +7 -0
  89. package/dist/judges/intent-alignment.js +14 -1
  90. package/dist/judges/logging-privacy.js +7 -0
  91. package/dist/judges/maintainability.js +7 -0
  92. package/dist/judges/multi-turn-coherence.js +7 -0
  93. package/dist/judges/observability.js +7 -0
  94. package/dist/judges/portability.js +7 -0
  95. package/dist/judges/rate-limiting.js +7 -0
  96. package/dist/judges/reliability.js +7 -0
  97. package/dist/judges/security.js +14 -1
  98. package/dist/judges/testing.js +7 -0
  99. package/dist/judges/ux.js +7 -0
  100. package/dist/review-conversation.d.ts +87 -0
  101. package/dist/review-conversation.js +307 -0
  102. package/dist/sast-integration.d.ts +112 -0
  103. package/dist/sast-integration.js +215 -0
  104. package/dist/tools/register-evaluation.js +208 -8
  105. package/dist/tools/register-fix.js +24 -1
  106. package/dist/tools/register-resources.d.ts +6 -0
  107. package/dist/tools/register-resources.js +177 -0
  108. package/dist/tools/register-review.js +26 -1
  109. package/dist/tools/register-workflow.js +384 -11
  110. package/dist/tools/validation.d.ts +13 -0
  111. package/dist/tools/validation.js +77 -0
  112. package/dist/types.d.ts +122 -0
  113. package/package.json +25 -12
  114. package/server.json +2 -2
@@ -19,6 +19,10 @@ import { loadFeedbackStore } from "../commands/feedback.js";
19
19
  import { CROSS_FILE_SECURITY_CATEGORIES } from "./project.js";
20
20
  import { applyTriageFeedback, loadFindingStore } from "../finding-lifecycle.js";
21
21
  import { enrichWithSecurityIds } from "../security-ids.js";
22
+ import { selectJudges } from "./judge-selector.js";
23
+ import { getGlobalSession } from "../evaluation-session.js";
24
+ import { evaluateEscalations, enhanceReviewWithEscalations } from "../escalation.js";
25
+ import { applyRecallBoost } from "./recall-boost.js";
22
26
  // ── AST-aware post-processing ───────────────────────────────────────────────
23
27
  // ── Module-level caches for AST/taint results ───────────────────────────────
24
28
  const astStructureCache = new LRUCache(256);
@@ -390,7 +394,19 @@ function resolveJudgeSet(options) {
390
394
  const disabled = new Set(options.config.disabledJudges);
391
395
  judges = judges.filter((j) => !disabled.has(j.id));
392
396
  }
393
- return judges;
397
+ // Adaptive judge selection — skip irrelevant judges based on file context
398
+ if (options?.adaptiveSelection && options.filePath) {
399
+ const fileCategory = classifyFile("", options.filePath.split(".").pop() ?? "", options.filePath);
400
+ const ctx = {
401
+ language: options.filePath.split(".").pop() ?? "unknown",
402
+ fileCategory,
403
+ filePath: options.filePath,
404
+ projectMode: options.projectMode,
405
+ };
406
+ const result = selectJudges(judges, ctx);
407
+ return { judges: result.selected, skipped: result.skipped };
408
+ }
409
+ return { judges };
394
410
  }
395
411
  /**
396
412
  * Check whether an absence-based finding is mitigated by a pre-scanned
@@ -419,6 +435,16 @@ export function evaluateWithJudge(judge, code, language, context, options) {
419
435
  : undefined;
420
436
  findings.push(...judge.analyze(code, language, analyzeCtx));
421
437
  }
438
+ // ── Recall boost: supplementary patterns for weak-recall categories ──
439
+ const boostResult = applyRecallBoost(code, language);
440
+ if (boostResult.findings.length > 0) {
441
+ // Deduplicate: only add boost findings whose ruleId isn't already present
442
+ for (const bf of boostResult.findings) {
443
+ if (!findings.some((f) => f.ruleId === bf.ruleId)) {
444
+ findings.push(bf);
445
+ }
446
+ }
447
+ }
422
448
  // ── Absence gating ──
423
449
  // Absence-based findings ("no rate limiting", "no monitoring", etc.) are
424
450
  // project-level concerns that cannot be accurately assessed from a single
@@ -649,7 +675,7 @@ export function evaluateWithTribunal(code, language, context, options) {
649
675
  ...(astResult ? { _astCache: astResult } : {}),
650
676
  ...(taintResult ? { _taintFlows: taintResult } : {}),
651
677
  };
652
- const judges = resolveJudgeSet(enrichedOptions);
678
+ const { judges, skipped: skippedJudges } = resolveJudgeSet(enrichedOptions);
653
679
  const tribunalStart = performance.now();
654
680
  const evaluations = judges.map((judge) => {
655
681
  const start = performance.now();
@@ -776,7 +802,30 @@ export function evaluateWithTribunal(code, language, context, options) {
776
802
  // No triage data or error loading — continue without adjustment
777
803
  }
778
804
  const maxFindings = options?.maxFindingsPerFile ?? DEFAULT_MAX_FINDINGS_PER_FILE;
779
- const cappedFindings = applyPerFileFindingCap(triageAdjusted, maxFindings);
805
+ // ── Session feedback calibration ──
806
+ // Apply confidence penalties from accumulated FP feedback in the
807
+ // current evaluation session. This is the real-time agentic loop:
808
+ // user marks findings as FP → session records it → subsequent
809
+ // evaluations automatically reduce confidence on those rules.
810
+ let sessionAdjusted = triageAdjusted;
811
+ try {
812
+ const session = getGlobalSession();
813
+ const tally = session.getFeedbackTally();
814
+ if (tally.size > 0) {
815
+ sessionAdjusted = triageAdjusted.map((f) => {
816
+ const penalty = session.getConfidencePenalty(f.ruleId);
817
+ if (penalty < 1.0) {
818
+ const adjusted = clampConfidence((f.confidence ?? 0.5) * penalty);
819
+ return { ...f, confidence: adjusted };
820
+ }
821
+ return f;
822
+ });
823
+ }
824
+ }
825
+ catch {
826
+ // Session feedback calibration failure is non-fatal
827
+ }
828
+ const cappedFindings = applyPerFileFindingCap(sessionAdjusted, maxFindings);
780
829
  // ── Confidence-based tiering for progressive disclosure ──
781
830
  // Tag each finding with a disclosure tier so downstream consumers (CLI,
782
831
  // formatters, VS Code extension) can show only high-confidence findings
@@ -852,6 +901,23 @@ export function evaluateWithTribunal(code, language, context, options) {
852
901
  : "AI-generated code patterns detected — review for model-specific biases",
853
902
  };
854
903
  }
904
+ // ── Human escalation protocol ──
905
+ // Evaluate which findings need human review based on escalation policy.
906
+ // Enhances the review decision with escalation routing information.
907
+ if (options?.config?.escalationThreshold || options?.filePath) {
908
+ try {
909
+ const escalationPolicy = options?.config?.escalationThreshold
910
+ ? { confidenceThreshold: options.config.escalationThreshold }
911
+ : undefined;
912
+ const escalations = evaluateEscalations(result, options?.filePath ?? "<unknown>", escalationPolicy);
913
+ if (escalations.length > 0 && result.reviewDecision) {
914
+ result.reviewDecision = enhanceReviewWithEscalations(result.reviewDecision, escalations);
915
+ }
916
+ }
917
+ catch {
918
+ // Escalation evaluation failure is non-fatal
919
+ }
920
+ }
855
921
  // ── Disk cache: persist for future runs ──
856
922
  if (diskCache) {
857
923
  try {
@@ -863,6 +929,129 @@ export function evaluateWithTribunal(code, language, context, options) {
863
929
  }
864
930
  return result;
865
931
  }
932
+ // ─── Streaming Evaluation ────────────────────────────────────────────────────
/**
 * Streaming tribunal evaluation — runs the resolved judges one after another
 * and yields a batch as soon as each judge completes, enabling progressive UI
 * updates and early termination by the consumer.
 *
 * Each yielded `StreamingBatch` carries:
 * - `judgeId` / `judgeName` — the judge that just finished
 * - `evaluation` — that judge's evaluation (its `durationMs` is set here)
 * - `trace` — duration, per-rule traces, finding counts and, when an AST
 *   result is available, an `astResolution` summary
 * - `aggregate` — running totals over all judges completed so far
 * - `done` — `true` on the batch produced by the last judge
 *
 * AST structure and taint-flow results are taken from `options._astCache` /
 * `options._taintFlows` when supplied; otherwise they are looked up in the
 * module-level caches by content hash and computed on a miss. AST analysis is
 * skipped when `options.includeAstFindings` is `false` and no AST result was
 * supplied.
 *
 * Usage:
 * ```ts
 * for await (const batch of evaluateWithTribunalStreaming(code, lang)) {
 *   console.log(`${batch.judgeName}: ${batch.evaluation.findings.length} findings`);
 *   if (batch.aggregate.criticalSoFar > 10) break; // early termination
 * }
 * ```
 *
 * @param code - Source text to evaluate.
 * @param language - Language identifier forwarded to the analyzers and judges.
 * @param context - Evaluation context forwarded to each judge.
 * @param options - Evaluation options, forwarded (enriched with the AST and
 *   taint results) to judge resolution and to each judge.
 * @yields One batch per judge, in the order returned by judge resolution.
 */
export async function* evaluateWithTribunalStreaming(code, language, context, options) {
    const includeAst = options?.includeAstFindings ?? true;
    const hash = contentHash(code, language);

    // Reuse a caller-supplied or cached AST result; analyze only on a miss.
    let astResult = options?._astCache;
    if (!astResult && includeAst) {
        astResult = astStructureCache.get(hash);
        if (!astResult) {
            astResult = analyzeStructure(code, language);
            astStructureCache.set(hash, astResult);
        }
    }

    // Same lookup order for taint flows: options, then cache, then analysis.
    let taintResult = options?._taintFlows;
    if (!taintResult) {
        taintResult = taintFlowCache.get(hash);
        if (!taintResult) {
            taintResult = analyzeTaintFlows(code, language);
            taintFlowCache.set(hash, taintResult);
        }
    }

    const enrichedOptions = {
        ...options,
        ...(astResult ? { _astCache: astResult } : {}),
        ...(taintResult ? { _taintFlows: taintResult } : {}),
    };
    const { judges } = resolveJudgeSet(enrichedOptions);
    const totalJudges = judges.length;

    // The AST summary is identical for every judge, so compute it once.
    // A reduce is used instead of Math.max(...spread) so that a very large
    // function list cannot exceed the engine's argument-count limit.
    const astSummary = astResult && totalJudges > 0
        ? {
            functionsAnalyzed: astResult.functions.length,
            maxComplexity: astResult.functions.reduce((max, fn) => Math.max(max, fn.cyclomaticComplexity), 0),
            taintFlowsDetected: taintResult?.length ?? 0,
        }
        : undefined;

    let completedJudges = 0;
    let findingsSoFar = 0;
    let criticalSoFar = 0;
    let highSoFar = 0;
    let scoreSum = 0;
    let hasFailure = false;
    let hasWarning = false;

    for (const judge of judges) {
        const start = performance.now();
        const evaluation = evaluateWithJudge(judge, code, language, context, enrichedOptions);
        const durationMs = Math.round(performance.now() - start);
        evaluation.durationMs = durationMs;

        completedJudges++;
        findingsSoFar += evaluation.findings.length;
        for (const finding of evaluation.findings) {
            if (finding.severity === "critical") {
                criticalSoFar++;
            }
            else if (finding.severity === "high") {
                highSoFar++;
            }
        }
        scoreSum += evaluation.score;
        if (evaluation.verdict === "fail") {
            hasFailure = true;
        }
        if (evaluation.verdict === "warning") {
            hasWarning = true;
        }

        const trace = {
            judgeId: judge.id,
            judgeName: judge.name,
            durationMs,
            rules: buildRuleTraces(evaluation),
            rawFindingCount: evaluation.findings.length,
            finalFindingCount: evaluation.findings.length,
            // Fresh object per trace so consumers never share a mutable summary.
            ...(astSummary ? { astResolution: { ...astSummary } } : {}),
        };
        // A single failing judge fails the running verdict; otherwise any
        // warning downgrades it from "pass".
        const currentVerdict = hasFailure ? "fail" : hasWarning ? "warning" : "pass";

        yield {
            judgeId: judge.id,
            judgeName: judge.name,
            evaluation,
            trace,
            aggregate: {
                completedJudges,
                totalJudges,
                findingsSoFar,
                criticalSoFar,
                highSoFar,
                currentScore: Math.round(scoreSum / completedJudges),
                currentVerdict,
            },
            done: completedJudges === totalJudges,
        };

        // Yield to the event loop between judges for responsiveness
        await new Promise((r) => setTimeout(r, 0));
    }
}
/**
 * Build rule-level traces from a judge evaluation for observability.
 *
 * Findings are grouped by `ruleId`; each group becomes one trace entry holding
 * the number of findings and the highest confidence seen for that rule.
 * A finding without a `confidence` value counts as 0.5.
 *
 * @param evaluation - Judge evaluation whose `findings` array is summarised.
 * @returns One `{ ruleId, matched, findingCount, peakConfidence }` entry per
 *   distinct rule, in order of first appearance. `matched` is always `true`
 *   because only rules that produced a finding are listed.
 */
function buildRuleTraces(evaluation) {
    const perRule = new Map();
    for (const finding of evaluation.findings) {
        const confidence = finding.confidence ?? 0.5;
        const entry = perRule.get(finding.ruleId);
        if (entry) {
            entry.count++;
            entry.peakConf = Math.max(entry.peakConf, confidence);
        }
        else {
            perRule.set(finding.ruleId, { count: 1, peakConf: confidence });
        }
    }
    return Array.from(perRule, ([ruleId, { count, peakConf }]) => ({
        ruleId,
        matched: true,
        findingCount: count,
        peakConfidence: peakConf,
    }));
}
866
1055
  // ─── Project-level Multi-file Analysis (delegated to project.ts) ─────────────
867
1056
  import { evaluateProject as _evaluateProject } from "./project.js";
868
1057
  export { scanProjectWideSecurityPatterns } from "./project.js";
@@ -0,0 +1,19 @@
1
+ /**
2
+ * Adaptive judge selection — picks only the judges relevant to a given file
3
+ * based on language, framework, file role, and project context.
4
+ *
5
+ * Eliminates wasted work (e.g. running "testing" judge on a Dockerfile,
6
+ * or "iac-security" on a React component) while keeping the full panel
7
+ * available for explicit requests.
8
+ */
9
+ import type { JudgeDefinition, JudgeSelectionContext, JudgeSelectionResult } from "../types.js";
/**
 * Select the most relevant judges for a given file context.
 *
 * Strategy:
 * 1. Always include core judges (security, cybersecurity, false-positive-review)
 * 2. Skip judges with language incompatibility
 * 3. Skip judges irrelevant to the file category
 * 4. Return selection with skip reasons for observability
 *
 * @param judges - Candidate judges, evaluated in the given order.
 * @param ctx - File context (language, file category, path, project mode).
 * @returns The judges to run (`selected`) and, for every judge left out,
 *   its id together with a human-readable reason (`skipped`).
 */
export declare function selectJudges(judges: JudgeDefinition[], ctx: JudgeSelectionContext): JudgeSelectionResult;
@@ -0,0 +1,141 @@
1
+ /**
2
+ * Adaptive judge selection — picks only the judges relevant to a given file
3
+ * based on language, framework, file role, and project context.
4
+ *
5
+ * Eliminates wasted work (e.g. running "testing" judge on a Dockerfile,
6
+ * or "iac-security" on a React component) while keeping the full panel
7
+ * available for explicit requests.
8
+ */
9
+ // ─── Language → judge relevance ──────────────────────────────────────────────
10
+ /**
11
+ * Judges that are ONLY relevant for specific language families.
12
+ * If the language isn't listed, the judge is skipped.
13
+ * Most judges are language-agnostic and not listed here.
14
+ */
15
+ const LANGUAGE_SPECIFIC = {
16
+ // IaC judges only apply to infrastructure languages
17
+ "iac-security": new Set(["terraform", "bicep", "arm", "dockerfile", "yaml"]),
18
+ };
19
+ /**
20
+ * Judges to SKIP for specific languages — inverse of above.
21
+ * E.g. testing patterns don't apply to SQL or Dockerfile.
22
+ */
23
+ const LANGUAGE_SKIP = {
24
+ testing: new Set(["sql", "dockerfile", "terraform", "bicep", "arm", "yaml"]),
25
+ documentation: new Set(["sql", "dockerfile", "terraform", "bicep", "arm"]),
26
+ "code-structure": new Set(["sql", "dockerfile", "yaml"]),
27
+ ux: new Set(["sql", "dockerfile", "terraform", "bicep", "arm", "bash", "powershell"]),
28
+ accessibility: new Set(["sql", "dockerfile", "terraform", "bicep", "arm", "bash", "powershell"]),
29
+ internationalization: new Set(["sql", "dockerfile", "terraform", "bicep", "arm"]),
30
+ concurrency: new Set(["sql", "dockerfile", "terraform", "bicep", "arm", "yaml"]),
31
+ "over-engineering": new Set(["sql", "dockerfile", "terraform", "bicep", "arm", "yaml"]),
32
+ };
33
+ // ─── File category → judge relevance ────────────────────────────────────────
34
+ /**
35
+ * Judges to skip when evaluating test files — noise reduction.
36
+ */
37
+ const SKIP_FOR_TESTS = new Set([
38
+ "documentation",
39
+ "rate-limiting",
40
+ "scalability",
41
+ "cloud-readiness",
42
+ "ci-cd",
43
+ "configuration-management",
44
+ "cost-effectiveness",
45
+ "data-sovereignty",
46
+ "compliance",
47
+ "internationalization",
48
+ "ux",
49
+ "accessibility",
50
+ "observability",
51
+ ]);
52
+ /**
53
+ * Judges to skip for config/manifest files.
54
+ */
55
+ const SKIP_FOR_CONFIG = new Set([
56
+ "testing",
57
+ "documentation",
58
+ "code-structure",
59
+ "error-handling",
60
+ "performance",
61
+ "concurrency",
62
+ "scalability",
63
+ "ux",
64
+ "accessibility",
65
+ "internationalization",
66
+ "over-engineering",
67
+ "backwards-compatibility",
68
+ "maintainability",
69
+ ]);
70
+ /**
71
+ * Judges to skip for IaC files (Terraform, Bicep, ARM, Dockerfile).
72
+ */
73
+ const SKIP_FOR_IAC = new Set([
74
+ "testing",
75
+ "code-structure",
76
+ "concurrency",
77
+ "over-engineering",
78
+ "ux",
79
+ "accessibility",
80
+ "internationalization",
81
+ "api-design",
82
+ "api-contract",
83
+ "backwards-compatibility",
84
+ "hallucination-detection",
85
+ "multi-turn-coherence",
86
+ "model-fingerprint",
87
+ ]);
88
+ // ─── Core judges that always run ─────────────────────────────────────────────
89
+ /** These judges run unconditionally — they cover universally applicable concerns. */
90
+ const ALWAYS_RUN = new Set(["security", "cybersecurity", "false-positive-review"]);
91
+ // ─── Selection logic ─────────────────────────────────────────────────────────
92
+ /**
93
+ * Select the most relevant judges for a given file context.
94
+ *
95
+ * Strategy:
96
+ * 1. Always include core judges (security, false-positive-review)
97
+ * 2. Skip judges with language incompatibility
98
+ * 3. Skip judges irrelevant to the file category
99
+ * 4. Return selection with skip reasons for observability
100
+ */
101
+ export function selectJudges(judges, ctx) {
102
+ const selected = [];
103
+ const skipped = [];
104
+ const lang = ctx.language.toLowerCase();
105
+ const cat = ctx.fileCategory?.toLowerCase() ?? "";
106
+ for (const judge of judges) {
107
+ // Core judges always run
108
+ if (ALWAYS_RUN.has(judge.id)) {
109
+ selected.push(judge);
110
+ continue;
111
+ }
112
+ // Language-specific judge: skip if language not in its set
113
+ const langOnly = LANGUAGE_SPECIFIC[judge.id];
114
+ if (langOnly && !langOnly.has(lang)) {
115
+ skipped.push({ judgeId: judge.id, reason: `not relevant for language: ${lang}` });
116
+ continue;
117
+ }
118
+ // Language skip: judge not useful for this language
119
+ const langSkip = LANGUAGE_SKIP[judge.id];
120
+ if (langSkip && langSkip.has(lang)) {
121
+ skipped.push({ judgeId: judge.id, reason: `skipped for language: ${lang}` });
122
+ continue;
123
+ }
124
+ // File category gating
125
+ if (cat === "test" && SKIP_FOR_TESTS.has(judge.id)) {
126
+ skipped.push({ judgeId: judge.id, reason: "not relevant for test files" });
127
+ continue;
128
+ }
129
+ if (cat === "config" && SKIP_FOR_CONFIG.has(judge.id)) {
130
+ skipped.push({ judgeId: judge.id, reason: "not relevant for config files" });
131
+ continue;
132
+ }
133
+ if ((cat === "iac" || lang === "terraform" || lang === "bicep" || lang === "arm" || lang === "dockerfile") &&
134
+ SKIP_FOR_IAC.has(judge.id)) {
135
+ skipped.push({ judgeId: judge.id, reason: "not relevant for infrastructure code" });
136
+ continue;
137
+ }
138
+ selected.push(judge);
139
+ }
140
+ return { selected, skipped };
141
+ }
@@ -0,0 +1,27 @@
1
+ /**
2
+ * Recall Booster — Additional detection patterns for weak-recall categories
3
+ *
4
+ * This module provides supplementary pattern detection for judge categories
5
+ * where the deterministic evaluators have recall below 85%. It acts as
6
+ * a second-pass augmentation applied after the primary evaluator.
7
+ *
8
+ * Categories strengthened (by recall gap analysis):
9
+ * - hallucination-detection (46.2% → improved)
10
+ * - ci-cd (41.7% → improved)
11
+ * - internationalization (42.9% → improved)
12
+ * - cost-effectiveness (57.1% → improved)
13
+ * - documentation (63.6% → improved)
14
+ * - iac-security (66.7% → improved)
15
+ * - cloud/cloud-readiness (50-73% → improved)
16
+ */
17
+ import type { Finding } from "../types.js";
/** Outcome of a recall-boost pass over a piece of code. */
interface BoostResult {
    /** Additional findings produced by the supplementary patterns. */
    findings: Finding[];
    /**
     * Categories touched by the boost pass.
     * NOTE(review): presumably the judge categories whose supplementary
     * patterns produced findings — confirm against the implementation.
     */
    boostedCategories: string[];
}
/**
 * Apply recall-boosting patterns to detect issues that primary evaluators miss.
 * Returns additional findings (does not modify existing ones).
 *
 * @param code - Source text to scan.
 * @param language - Language identifier of `code`.
 * @returns The supplementary findings together with the boosted categories.
 */
export declare function applyRecallBoost(code: string, language: string): BoostResult;
27
+ export {};