@kevinrabun/judges 3.113.0 → 3.115.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. package/README.md +9 -0
  2. package/agents/accessibility.judge.md +37 -0
  3. package/agents/agent-instructions.judge.md +37 -0
  4. package/agents/ai-code-safety.judge.md +48 -0
  5. package/agents/api-contract.judge.md +30 -0
  6. package/agents/api-design.judge.md +39 -0
  7. package/agents/authentication.judge.md +37 -0
  8. package/agents/backwards-compatibility.judge.md +37 -0
  9. package/agents/caching.judge.md +37 -0
  10. package/agents/ci-cd.judge.md +37 -0
  11. package/agents/cloud-readiness.judge.md +37 -0
  12. package/agents/code-structure.judge.md +48 -0
  13. package/agents/compliance.judge.md +40 -0
  14. package/agents/concurrency.judge.md +39 -0
  15. package/agents/configuration-management.judge.md +37 -0
  16. package/agents/cost-effectiveness.judge.md +40 -0
  17. package/agents/cybersecurity.judge.md +36 -0
  18. package/agents/data-security.judge.md +34 -0
  19. package/agents/data-sovereignty.judge.md +58 -0
  20. package/agents/database.judge.md +41 -0
  21. package/agents/dependency-health.judge.md +39 -0
  22. package/agents/documentation.judge.md +39 -0
  23. package/agents/error-handling.judge.md +37 -0
  24. package/agents/ethics-bias.judge.md +39 -0
  25. package/agents/false-positive-review.judge.md +73 -0
  26. package/agents/framework-safety.judge.md +40 -0
  27. package/agents/hallucination-detection.judge.md +33 -0
  28. package/agents/iac-security.judge.md +38 -0
  29. package/agents/intent-alignment.judge.md +31 -0
  30. package/agents/internationalization.judge.md +42 -0
  31. package/agents/logging-privacy.judge.md +37 -0
  32. package/agents/logic-review.judge.md +34 -0
  33. package/agents/maintainability.judge.md +37 -0
  34. package/agents/model-fingerprint.judge.md +31 -0
  35. package/agents/multi-turn-coherence.judge.md +29 -0
  36. package/agents/observability.judge.md +37 -0
  37. package/agents/over-engineering.judge.md +48 -0
  38. package/agents/performance.judge.md +44 -0
  39. package/agents/portability.judge.md +37 -0
  40. package/agents/rate-limiting.judge.md +37 -0
  41. package/agents/reliability.judge.md +39 -0
  42. package/agents/scalability.judge.md +41 -0
  43. package/agents/security.judge.md +31 -0
  44. package/agents/software-practices.judge.md +44 -0
  45. package/agents/testing.judge.md +39 -0
  46. package/agents/ux.judge.md +37 -0
  47. package/dist/api.d.ts +9 -1
  48. package/dist/api.js +9 -1
  49. package/dist/commands/fix.d.ts +10 -0
  50. package/dist/commands/fix.js +52 -0
  51. package/dist/commands/llm-benchmark.d.ts +13 -4
  52. package/dist/commands/llm-benchmark.js +39 -8
  53. package/dist/commands/review.d.ts +51 -1
  54. package/dist/commands/review.js +213 -7
  55. package/dist/evaluators/index.js +61 -35
  56. package/dist/github-app.d.ts +35 -0
  57. package/dist/github-app.js +125 -4
  58. package/dist/judges/index.d.ts +23 -61
  59. package/dist/judges/index.js +49 -63
  60. package/dist/patches/apply.d.ts +15 -0
  61. package/dist/patches/apply.js +37 -0
  62. package/dist/tools/prompts.d.ts +2 -2
  63. package/dist/tools/prompts.js +21 -10
  64. package/docs/skills.md +7 -0
  65. package/package.json +18 -3
  66. package/packages/judges-cli/README.md +24 -0
  67. package/packages/judges-cli/bin/judges.js +8 -0
  68. package/scripts/generate-agents-from-judges.ts +111 -0
  69. package/scripts/generate-skills-docs.ts +26 -0
  70. package/scripts/validate-agents.ts +104 -0
  71. package/server.json +2 -2
  72. package/skills/ai-code-review.skill.md +57 -0
  73. package/skills/release-gate.skill.md +27 -0
  74. package/skills/security-review.skill.md +32 -0
  75. package/src/agent-loader.ts +324 -0
  76. package/src/skill-loader.ts +199 -0
@@ -0,0 +1,39 @@
1
+ ---
2
+ id: testing
3
+ name: Judge Testing
4
+ domain: Test Quality & Coverage
5
+ rulePrefix: TEST
6
+ description: Evaluates code for test-to-code ratio, test isolation, mocking strategy, edge case coverage, flaky test patterns, and test pyramid balance (unit/integration/e2e).
7
+ tableDescription: Test coverage, assertions, test isolation, naming
8
+ promptDescription: Deep testing quality review
9
+ script: ../src/evaluators/testing.ts
10
+ priority: 10
11
+ ---
12
+ You are Judge Testing — a quality engineering architect with mastery of TDD, BDD, testing pyramids, mutation testing, and test infrastructure at scale.
13
+
14
+ YOUR EVALUATION CRITERIA:
15
+ 1. **Testability**: Is the code structured for easy testing? Are dependencies injectable? Are side effects isolated? Is business logic separated from I/O?
16
+ 2. **Test Pyramid Balance**: Is there an appropriate mix of unit tests (many), integration tests (some), and E2E tests (few)? Are tests at the right level?
17
+ 3. **Edge Cases**: Are boundary conditions tested (empty arrays, null inputs, max values, concurrent access, unicode, timezone boundaries)?
18
+ 4. **Mocking Strategy**: Are mocks/stubs/spies used appropriately? Is there over-mocking (mocking implementation details rather than contracts)? Are test doubles faithful to real behavior?
19
+ 5. **Test Isolation**: Do tests depend on each other's state? Is there shared mutable state between tests? Do tests clean up after themselves?
20
+ 6. **Flaky Test Patterns**: Are there patterns that could cause flaky tests (timing dependencies, random data without seeds, file system access, network calls)?
21
+ 7. **Assertion Quality**: Are assertions specific and meaningful? Do they test behavior rather than implementation? Are error messages in assertions helpful?
22
+ 8. **Test Naming & Organization**: Do test names describe the behavior being tested? Are tests organized by feature/behavior rather than by class?
23
+ 9. **Error Path Testing**: Are error conditions and exception paths tested? Are failure modes verified, not just success paths?
24
+ 10. **Performance Testing**: Are there tests for response time, throughput, or resource usage? Are performance baselines established?
25
+ 11. **Security Testing**: Are there tests for authentication, authorization, input validation, and injection attempts?
26
+ 12. **Test Data Management**: Is test data created programmatically? Are fixtures/factories used instead of hardcoded data? Is sensitive data avoided in tests?
27
+
28
+ RULES FOR YOUR EVALUATION:
29
+ - Assign rule IDs with prefix "TEST-" (e.g. TEST-001).
30
+ - Reference testing best practices (Kent Beck, Martin Fowler's Test Pyramid, FIRST principles).
31
+ - Recommend specific test cases that should be written, with example test code.
32
+ - Evaluate both the tests AND the testability of the code under test.
33
+ - Score from 0-100 where 100 means a comprehensive, well-structured test suite.
34
+
35
+ ADVERSARIAL MANDATE:
36
+ - Your role is adversarial: assume the test coverage is insufficient and actively hunt for gaps. Back every finding with concrete code evidence (line numbers, patterns, API calls).
37
+ - Never praise or compliment the code. Report only problems, risks, and deficiencies.
38
+ - If you are uncertain whether something is an issue, flag it only when you can cite specific code evidence (line numbers, patterns, API calls). Speculative findings without concrete evidence erode developer trust.
39
+ - Absence of findings does not mean the code is well-tested. It means your analysis reached its limits. State this explicitly.
@@ -0,0 +1,37 @@
1
+ ---
2
+ id: ux
3
+ name: Judge UX
4
+ domain: User Experience & Interface Quality
5
+ rulePrefix: UX
6
+ description: Evaluates code for user experience patterns including loading states, error feedback, responsive design, mobile-friendliness, and interaction quality.
7
+ tableDescription: Loading states, error messages, pagination, destructive actions
8
+ promptDescription: Deep user experience review
9
+ script: ../src/evaluators/ux.ts
10
+ priority: 10
11
+ ---
12
+ You are Judge UX — a UX engineer and frontend architect who bridges design and engineering, specializing in performance perception, error communication, and inclusive interaction design.
13
+
14
+ YOUR EVALUATION CRITERIA:
15
+ 1. **Loading States**: Are loading indicators shown during async operations? Is there feedback when the user initiates an action? Are skeleton screens or spinners used?
16
+ 2. **Error Feedback**: Are errors communicated to users in a clear, actionable way? Are generic "Something went wrong" messages avoided? Do errors suggest next steps?
17
+ 3. **Responsive Design**: Does the UI adapt to different screen sizes? Are media queries or responsive frameworks used? Is content readable on mobile?
18
+ 4. **Form UX**: Are forms validated with inline feedback? Are error messages placed near the relevant field? Are required fields marked? Is there auto-save or draft preservation?
19
+ 5. **Navigation & Wayfinding**: Is navigation intuitive? Are breadcrumbs provided? Can users always find their way back? Are deep links supported?
20
+ 6. **Performance Perception**: Are optimistic updates used? Is there pagination or infinite scroll for large lists? Are perceived loading times minimized?
21
+ 7. **Empty States**: Are empty states handled (no data, no results, first-time user)? Do they provide guidance on what to do next?
22
+ 8. **Confirmation & Safety**: Are destructive actions confirmed (delete, submit, send)? Can actions be undone? Are users warned about data loss?
23
+ 9. **Mobile & Touch**: Are touch targets large enough (48x48px)? Are hover-dependent interactions avoided? Is the interface usable without a mouse?
24
+ 10. **Progressive Enhancement**: Does the core functionality work without JavaScript? Are there graceful fallbacks for unsupported features?
25
+
26
+ RULES FOR YOUR EVALUATION:
27
+ - Assign rule IDs with prefix "UX-" (e.g. UX-001).
28
+ - Reference Nielsen's Heuristics, Material Design guidelines, and WCAG criteria where applicable.
29
+ - Distinguish between "functional" and "user-friendly."
30
+ - Consider diverse users: slow connections, small screens, assistive technology.
31
+ - Score from 0-100 where 100 means excellent user experience.
32
+
33
+ ADVERSARIAL MANDATE:
34
+ - Your role is adversarial: assume the user experience is poor and actively hunt for problems. Back every finding with concrete code evidence (line numbers, patterns, API calls).
35
+ - Never praise or compliment the code. Report only problems, risks, and deficiencies.
36
+ - If you are uncertain whether something is an issue, flag it only when you can cite specific code evidence (line numbers, patterns, API calls). Speculative findings without concrete evidence erode developer trust.
37
+ - Absence of findings does not mean the UX is good. It means your analysis reached its limits. State this explicitly.
package/dist/api.d.ts CHANGED
@@ -31,6 +31,10 @@ export { verdictToGitHubActions } from "./formatters/github-actions.js";
31
31
  export { registerPlugin, unregisterPlugin, getRegisteredPlugins, getCustomRules, getPluginJudges, evaluateCustomRules, runBeforeHooks, runAfterHooks, clearPlugins, } from "./plugins.js";
32
32
  export type { CustomRule, JudgesPlugin, PluginRegistration } from "./plugins.js";
33
33
  export { JudgeRegistry, defaultRegistry } from "./judge-registry.js";
34
+ export { parseFrontmatter, validateFrontmatter, parseAgentFile, resolveEvaluator, agentToJudgeDefinition, loadAgentDirectory, loadAndRegisterAgents, } from "./agent-loader.js";
35
+ export type { AgentFrontmatter, ParsedAgent } from "./agent-loader.js";
36
+ export { parseSkillFrontmatter, validateSkillFrontmatter, parseSkillFile, loadSkillDirectory, listSkills, runSkill, } from "./skill-loader.js";
37
+ export type { SkillFrontmatter, ParsedSkill } from "./skill-loader.js";
34
38
  export { fingerprintCode, fingerprintToFindings } from "./fingerprint.js";
35
39
  export type { AiFingerprint, AiSignal } from "./fingerprint.js";
36
40
  export { buildCalibrationProfile, calibrateFindings, autoCalibrateFindings } from "./calibration.js";
@@ -49,8 +53,12 @@ export { compareCapabilities, formatComparisonReport, formatFullComparisonMatrix
49
53
  export type { ToolProfile, ToolCapability, ComparisonResult } from "./comparison.js";
50
54
  export { runBenchmarkSuite, benchmarkGate, formatBenchmarkReport, formatBenchmarkMarkdown, analyzeL2Coverage, formatL2CoverageReport, ingestFindingsAsBenchmarkCases, deduplicateIngestCases, BENCHMARK_CASES, } from "./commands/benchmark.js";
51
55
  export type { BenchmarkCase, BenchmarkResult, BenchmarkGateOptions, BenchmarkGateResult, L2CoverageAnalysis, L2JudgeCoverage, L2CategoryCoverage, } from "./commands/benchmark.js";
52
- export { parseLlmRuleIds, constructPerJudgePrompt, constructTribunalPrompt, selectStratifiedSample, scoreLlmCase, computeLlmMetrics, formatLlmSnapshotMarkdown, formatLayerComparisonMarkdown, } from "./commands/llm-benchmark.js";
56
+ export { parseLlmRuleIds, constructPerJudgePrompt, constructTribunalPrompt, selectStratifiedSample, scoreLlmCase, computeLlmMetrics, formatLlmSnapshotMarkdown, formatLayerComparisonMarkdown, extractValidatedLlmFindings, getValidRulePrefixes, } from "./commands/llm-benchmark.js";
53
57
  export type { LlmBenchmarkSnapshot, LlmCaseResult } from "./commands/llm-benchmark.js";
58
+ export type { LlmFinding, ValidationResult } from "./probabilistic/llm-response-validator.js";
59
+ export { runReviewAutopilot, dedupeComments, filterAlreadyPostedComments } from "./commands/review.js";
60
+ export { buildContextSnippets } from "./context/context-snippets.js";
61
+ export { EmbeddingCache, FallbackEmbeddingProvider, getOrCreateEmbedding } from "./context/embedding-cache.js";
54
62
  export { exportTeamConfig, importTeamConfig, pullRemoteConfig, writePolicyLock, readPolicyLock, validatePolicyCompliance, } from "./commands/config-share.js";
55
63
  export type { TeamConfig, PolicyLock, PolicyValidationResult } from "./commands/config-share.js";
56
64
  export { getLanguagePack, listLanguagePacks, suggestPack, LANGUAGE_PACKS } from "./commands/language-packs.js";
package/dist/api.js CHANGED
@@ -38,6 +38,10 @@ export { verdictToGitHubActions } from "./formatters/github-actions.js";
38
38
  export { registerPlugin, unregisterPlugin, getRegisteredPlugins, getCustomRules, getPluginJudges, evaluateCustomRules, runBeforeHooks, runAfterHooks, clearPlugins, } from "./plugins.js";
39
39
  // ─── Judge Registry ──────────────────────────────────────────────────────────
40
40
  export { JudgeRegistry, defaultRegistry } from "./judge-registry.js";
41
+ // ─── Agent Markdown Loader ───────────────────────────────────────────────────
42
+ export { parseFrontmatter, validateFrontmatter, parseAgentFile, resolveEvaluator, agentToJudgeDefinition, loadAgentDirectory, loadAndRegisterAgents, } from "./agent-loader.js";
43
+ // ─── Skill Loader ───────────────────────────────────────────────────────────
44
+ export { parseSkillFrontmatter, validateSkillFrontmatter, parseSkillFile, loadSkillDirectory, listSkills, runSkill, } from "./skill-loader.js";
41
45
  // ─── AI Code Fingerprinting ─────────────────────────────────────────────────
42
46
  export { fingerprintCode, fingerprintToFindings } from "./fingerprint.js";
43
47
  // ─── Confidence Calibration ─────────────────────────────────────────────────
@@ -56,7 +60,11 @@ export { compareCapabilities, formatComparisonReport, formatFullComparisonMatrix
56
60
  // ─── Benchmark Gate ──────────────────────────────────────────────────────────
57
61
  export { runBenchmarkSuite, benchmarkGate, formatBenchmarkReport, formatBenchmarkMarkdown, analyzeL2Coverage, formatL2CoverageReport, ingestFindingsAsBenchmarkCases, deduplicateIngestCases, BENCHMARK_CASES, } from "./commands/benchmark.js";
58
62
  // ─── LLM Benchmark ──────────────────────────────────────────────────────────
59
- export { parseLlmRuleIds, constructPerJudgePrompt, constructTribunalPrompt, selectStratifiedSample, scoreLlmCase, computeLlmMetrics, formatLlmSnapshotMarkdown, formatLayerComparisonMarkdown, } from "./commands/llm-benchmark.js";
63
+ export { parseLlmRuleIds, constructPerJudgePrompt, constructTribunalPrompt, selectStratifiedSample, scoreLlmCase, computeLlmMetrics, formatLlmSnapshotMarkdown, formatLayerComparisonMarkdown, extractValidatedLlmFindings, getValidRulePrefixes, } from "./commands/llm-benchmark.js";
64
+ // Review autopilot (GitHub App / scripts)
65
+ export { runReviewAutopilot, dedupeComments, filterAlreadyPostedComments } from "./commands/review.js";
66
+ export { buildContextSnippets } from "./context/context-snippets.js";
67
+ export { EmbeddingCache, FallbackEmbeddingProvider, getOrCreateEmbedding } from "./context/embedding-cache.js";
60
68
  // ─── Config Sharing & Policy ─────────────────────────────────────────────────
61
69
  export { exportTeamConfig, importTeamConfig, pullRemoteConfig, writePolicyLock, readPolicyLock, validatePolicyCompliance, } from "./commands/config-share.js";
62
70
  // ─── Language Packs ──────────────────────────────────────────────────────────
@@ -53,6 +53,14 @@ export declare function applyPatches(code: string, patches: PatchCandidate[]): {
53
53
  skipped: number;
54
54
  overlapped: number;
55
55
  };
56
+ /**
57
+ * Generate a unified diff for a single patch candidate. Useful for git apply fallback.
58
+ */
59
+ export declare function toUnifiedDiff(filePath: string, patch: Patch): string;
60
+ /**
61
+ * Try to apply a patch via `git apply` (with 3-way merge) as a fallback when direct text replacement fails.
62
+ */
63
+ export declare function tryGitApply(filePath: string, patch: Patch): boolean;
56
64
  /** A group of patches scoped to a single file within a cross-file fix set. */
57
65
  export interface FilePatchGroup {
58
66
  /** Absolute or relative file path. */
@@ -92,6 +100,7 @@ export declare function applyPatchSet(patchSet: PatchSet, options?: {
92
100
  apply?: boolean;
93
101
  filter?: PatchFilter;
94
102
  basePath?: string;
103
+ gitApplyFallback?: boolean;
95
104
  }): PatchSetResult;
96
105
  interface FixArgs {
97
106
  file: string | undefined;
@@ -101,6 +110,7 @@ interface FixArgs {
101
110
  severity: string | undefined;
102
111
  lines: string | undefined;
103
112
  apply: boolean;
113
+ gitApplyFallback?: boolean;
104
114
  }
105
115
  export declare function parseFixArgs(argv: string[]): FixArgs;
106
116
  export declare function runFix(argv: string[]): void;
@@ -134,6 +134,44 @@ export function applyPatches(code, patches) {
134
134
  }
135
135
  return { result: lines.join("\n"), applied, skipped, overlapped };
136
136
  }
/**
 * Generate a unified diff for a single patch candidate. Useful for git apply fallback.
 *
 * The hunk replaces every line of `patch.oldText` with the lines of
 * `patch.newText`, anchored at `patch.startLine` on both sides. No context
 * lines are emitted.
 *
 * @param filePath - Path written into the `--- a/` and `+++ b/` headers.
 * @param patch - Object providing `startLine`, `oldText` and `newText`.
 * @returns The diff text, terminated by a newline.
 */
export function toUnifiedDiff(filePath, patch) {
    const removed = patch.oldText.split(/\r?\n/);
    const added = patch.newText.split(/\r?\n/);
    // The counts in a hunk header must equal the number of lines emitted for
    // each side. Deriving both from the patch's line range (as before) breaks
    // as soon as the replacement text has a different number of lines than
    // the text it replaces.
    const hunkHeader = `@@ -${patch.startLine},${removed.length} +${patch.startLine},${added.length} @@`;
    return ([
        `--- a/${filePath}`,
        `+++ b/${filePath}`,
        hunkHeader,
        ...removed.map((line) => `-${line}`),
        ...added.map((line) => `+${line}`),
    ].join("\n") + "\n");
}
/**
 * Try to apply a patch via `git apply` (with 3-way merge) as a fallback when direct text replacement fails.
 *
 * The patch is rendered with `toUnifiedDiff`, written to a temporary file and
 * handed to `git apply --3way`. The temporary file is removed afterwards
 * whether or not git succeeded.
 *
 * @param filePath - Path used in the generated diff headers.
 * @param patch - Patch to apply.
 * @returns `true` when the git command completed without throwing; `false`
 *   on any failure. Errors are swallowed on purpose: this is a best-effort
 *   fallback and callers only need to know whether it worked.
 */
export function tryGitApply(filePath, patch) {
    try {
        const diff = toUnifiedDiff(filePath, patch);
        // NOTE(review): `require` is not defined in a native ES module. If this
        // file is loaded as ESM without a `require` shim, these calls throw, the
        // outer catch returns false, and the fallback is silently disabled.
        // Confirm, and switch to top-level `node:` imports if so.
        const { execFileSync } = require("node:child_process");
        const { tmpdir } = require("node:os");
        const { writeFileSync, unlinkSync } = require("node:fs");
        // Include the pid so concurrent processes cannot collide on the name.
        const tmp = require("node:path").join(tmpdir(), `.judges-patch-${process.pid}-${Date.now()}.diff`);
        writeFileSync(tmp, diff, "utf-8");
        try {
            // `--3way` is documented as incompatible with `--reject`; passing
            // both makes git refuse to run, so only `--3way` is used.
            execFileSync("git", ["apply", "--3way", tmp], { stdio: "ignore" });
            return true;
        }
        finally {
            try {
                unlinkSync(tmp);
            }
            catch {
                // Best-effort cleanup: a leftover temp file must not mask the result.
            }
        }
    }
    catch {
        return false;
    }
}
137
175
  /**
138
176
  * Collect patches from findings across multiple files into a PatchSet.
139
177
  * Groups findings by their associated file path.
@@ -198,6 +236,16 @@ export function applyPatchSet(patchSet, options = {}) {
198
236
  if (options.apply && applied > 0) {
199
237
  writeFileSync(absPath, result, "utf-8");
200
238
  }
239
+ // If we skipped patches due to mismatched context, optionally try git apply fallback
240
+ if (options.apply && skipped > 0 && options.gitApplyFallback) {
241
+ for (const p of patches) {
242
+ const ok = tryGitApply(group.filePath, p.patch);
243
+ if (ok) {
244
+ results.totalApplied++;
245
+ results.totalSkipped = Math.max(0, results.totalSkipped - 1);
246
+ }
247
+ }
248
+ }
201
249
  results.files.push({ filePath: group.filePath, applied, skipped, overlapped });
202
250
  results.totalApplied += applied;
203
251
  results.totalSkipped += skipped;
@@ -214,6 +262,7 @@ export function parseFixArgs(argv) {
214
262
  severity: undefined,
215
263
  lines: undefined,
216
264
  apply: false,
265
+ gitApplyFallback: false,
217
266
  };
218
267
  for (let i = 3; i < argv.length; i++) {
219
268
  // skip node, script, "fix"
@@ -223,6 +272,9 @@ export function parseFixArgs(argv) {
223
272
  case "-a":
224
273
  args.apply = true;
225
274
  break;
275
+ case "--git-apply":
276
+ args.gitApplyFallback = true;
277
+ break;
226
278
  case "--language":
227
279
  case "-l":
228
280
  args.language = argv[++i];
@@ -8,8 +8,10 @@
8
8
  * - Prompt construction (mirrors MCP-served prompts exactly)
9
9
  * - Scoring logic (same methodology as L1 deterministic benchmark)
10
10
  *
11
- * The actual LLM API calls live in scripts/run-llm-benchmark.ts,
12
- * keeping the npm package free of LLM API dependencies.
11
+ * LLM API calling is intentionally kept out of the npm package. Wire this
12
+ * to your preferred provider in a thin runner script (or use the CLI
13
+ * command `judges llm-benchmark`). The former helper script
14
+ * `scripts/run-llm-benchmark.ts` has been removed.
13
15
  */
14
16
  import type { JudgeDefinition } from "../types.js";
15
17
  import type { BenchmarkCase, CategoryResult, JudgeBenchmarkResult, DifficultyResult } from "./benchmark.js";
@@ -77,15 +79,22 @@ export interface LlmCaseResult {
77
79
  * Extract unique rule IDs from LLM response text.
78
80
  * Matches patterns like CYBER-001, SEC-003, AUTH-001, etc.
79
81
  */
82
+ export declare function getValidRulePrefixes(): Set<string>;
80
83
  export declare function parseLlmRuleIds(response: string): string[];
84
+ /**
85
+ * Preferred entrypoint: extract findings from raw LLM text with validation. Falls back to regex rule-id scan.
86
+ */
87
+ export declare function extractValidatedLlmFindings(response: string, prefixes?: Set<string>): import("../probabilistic/llm-response-validator.js").ValidationResult;
81
88
  /**
82
89
  * Construct a per-judge prompt — identical to the MCP-served `judge-{id}` prompt.
90
+ * Uses condensed criteria (adversarial mandate stripped) plus shared mandates,
91
+ * mirroring the tribunal architecture for consistency and better precision.
83
92
  */
84
- export declare function constructPerJudgePrompt(judge: JudgeDefinition, code: string, language: string): string;
93
+ export declare function constructPerJudgePrompt(judge: JudgeDefinition, code: string, language: string, contextSnippets?: string[]): string;
85
94
  /**
86
95
  * Construct the full-tribunal prompt — identical to the MCP-served `full-tribunal` prompt.
87
96
  */
88
- export declare function constructTribunalPrompt(code: string, language: string): string;
97
+ export declare function constructTribunalPrompt(code: string, language: string, contextSnippets?: string[]): string;
89
98
  /**
90
99
  * Select a stratified sample of benchmark cases, ensuring representation
91
100
  * across categories, difficulties, and both clean/dirty cases.
@@ -8,18 +8,24 @@
8
8
  * - Prompt construction (mirrors MCP-served prompts exactly)
9
9
  * - Scoring logic (same methodology as L1 deterministic benchmark)
10
10
  *
11
- * The actual LLM API calls live in scripts/run-llm-benchmark.ts,
12
- * keeping the npm package free of LLM API dependencies.
11
+ * LLM API calling is intentionally kept out of the npm package. Wire this
12
+ * to your preferred provider in a thin runner script (or use the CLI
13
+ * command `judges llm-benchmark`). The former helper script
14
+ * `scripts/run-llm-benchmark.ts` has been removed.
13
15
  */
14
16
  import { JUDGES } from "../judges/index.js";
15
17
  import { getCondensedCriteria, SHARED_ADVERSARIAL_MANDATE, PRECISION_MANDATE } from "../tools/prompts.js";
18
+ import { extractAndValidateLlmFindings, mergeFindings } from "../probabilistic/llm-response-validator.js";
16
19
  // ─── Rule ID Parsing ────────────────────────────────────────────────────────
/**
 * Build the set of rule prefixes declared by the known judges.
 *
 * @returns A new `Set` holding the `rulePrefix` of every entry in `JUDGES`.
 */
export function getValidRulePrefixes() {
    const prefixes = new Set();
    for (const judge of JUDGES) {
        prefixes.add(judge.rulePrefix);
    }
    return prefixes;
}
/**
 * Extract unique rule IDs from LLM response text.
 * Matches patterns like CYBER-001, SEC-003, AUTH-001, etc.
 */
21
27
  export function parseLlmRuleIds(response) {
22
- const validPrefixes = new Set(JUDGES.map((j) => j.rulePrefix));
28
+ const validPrefixes = getValidRulePrefixes();
23
29
  const pattern = /\b([A-Z]{2,})-(\d{3})\b/g;
24
30
  const found = new Set();
25
31
  let match;
@@ -30,33 +36,59 @@ export function parseLlmRuleIds(response) {
30
36
  }
31
37
  return [...found];
32
38
  }
/**
 * Preferred entrypoint: extract findings from raw LLM text with validation. Falls back to regex rule-id scan.
 *
 * @param response - Raw text returned by the LLM.
 * @param prefixes - Optional set of accepted rule prefixes; when omitted the
 *   prefixes from `getValidRulePrefixes()` are used.
 * @returns Whatever `mergeFindings` produces from the validated findings and
 *   the rule IDs found by the regex scan.
 */
export function extractValidatedLlmFindings(response, prefixes) {
    const structured = extractAndValidateLlmFindings(response, prefixes ?? getValidRulePrefixes());
    // Regex scan covers unstructured responses.
    // NOTE(review): `parseLlmRuleIds` takes only the response, so a
    // caller-supplied `prefixes` set is not passed to this scan — confirm
    // that is intended.
    const scannedRuleIds = parseLlmRuleIds(response);
    return mergeFindings(structured, scannedRuleIds);
}
33
49
  // ─── Prompt Construction ────────────────────────────────────────────────────
34
50
  // These construct the exact same prompts served via MCP, ensuring the
35
51
  // benchmark tests the same prompts real users experience.
36
52
  // ─────────────────────────────────────────────────────────────────────────────
/**
 * Construct a per-judge prompt — identical to the MCP-served `judge-{id}` prompt.
 * Uses condensed criteria (adversarial mandate stripped) plus shared mandates,
 * mirroring the tribunal architecture for consistency and better precision.
 *
 * @param judge - Judge definition; `systemPrompt` and `rulePrefix` are read.
 * @param code - Source code to embed in the prompt.
 * @param language - Language label used in the prose and the code fence.
 * @param contextSnippets - Optional snippets rendered as a "Repository Context"
 *   bullet list; newlines inside a snippet are flattened to spaces.
 * @returns The assembled prompt text.
 */
export function constructPerJudgePrompt(judge, code, language, contextSnippets = []) {
    // The persona is the text before the first blank line. When the system
    // prompt has no blank line, indexOf yields -1 and substring(0, -1) would
    // return "", silently dropping the persona — use the whole prompt instead.
    const personaEnd = judge.systemPrompt.indexOf("\n\n");
    const persona = personaEnd === -1 ? judge.systemPrompt : judge.systemPrompt.substring(0, personaEnd);
    const criteria = getCondensedCriteria(judge.systemPrompt);
    const contextSection = contextSnippets.length
        ? `## Repository Context\n\n${contextSnippets.map((s) => `- ${s.replace(/\n/g, " ")}`).join("\n")}\n\n`
        : "";
    return (`${persona}\n\n` +
        `${SHARED_ADVERSARIAL_MANDATE}\n\n` +
        `${PRECISION_MANDATE}\n\n` +
        contextSection +
        `${criteria}\n\n` +
        `Please evaluate the following ${language} code:\n\n\`\`\`${language}\n${code}\n\`\`\`` +
        `\n\nProvide your evaluation as structured findings with rule IDs (prefix: ${judge.rulePrefix}-), severity levels (critical/high/medium/low/info), descriptions, and actionable recommendations. If no issues meet the confidence threshold, report zero findings explicitly. End with an overall score (0-100) and verdict (pass/warning/fail).`);
}
45
72
  /**
46
73
  * Construct the full-tribunal prompt — identical to the MCP-served `full-tribunal` prompt.
47
74
  */
48
- export function constructTribunalPrompt(code, language) {
75
+ export function constructTribunalPrompt(code, language, contextSnippets = []) {
49
76
  const judgeInstructions = JUDGES.map((j) => `### ${j.name} — ${j.domain}\n**Rule prefix:** \`${j.rulePrefix}-\`\n\n${getCondensedCriteria(j.systemPrompt)}`).join("\n\n---\n\n");
77
+ const contextSection = contextSnippets.length
78
+ ? `## Repository Context\n\n${contextSnippets.map((s) => `- ${s.replace(/\n/g, " ")}`).join("\n")}\n\n`
79
+ : "";
50
80
  return (`You are the Judges Panel — a panel of ${JUDGES.length} expert judges who independently evaluate code for quality, security, and operational readiness.\n\n` +
51
81
  `## Universal Evaluation Directives\n\n` +
52
82
  `${SHARED_ADVERSARIAL_MANDATE}\n\n` +
53
83
  `${PRECISION_MANDATE}\n\n` +
84
+ contextSection +
54
85
  `## Evaluation Instructions\n\n` +
55
86
  `Evaluate the following ${language} code from the perspective of ALL ${JUDGES.length} judges below. For each judge, provide:\n` +
56
87
  `1. Judge name and domain\n` +
57
88
  `2. Verdict (PASS / WARNING / FAIL)\n` +
58
89
  `3. Score (0-100)\n` +
59
90
  `4. Specific findings with rule IDs (using each judge's rule prefix), severity, and recommendations\n\n` +
91
+ `For judges where no issues meet the confidence threshold, report a PASS verdict with zero findings.\n\n` +
60
92
  `Then provide an OVERALL TRIBUNAL VERDICT that synthesizes all judges' input.\n\n` +
61
93
  `## The Judges\n\n${judgeInstructions}\n\n` +
62
94
  `## Code to Evaluate\n\n\`\`\`${language}\n${code}\n\`\`\``);
@@ -125,7 +157,6 @@ export function selectStratifiedSample(cases, targetSize) {
125
157
  * Returns a fully populated LlmCaseResult.
126
158
  */
127
159
  export function scoreLlmCase(tc, detectedRuleIds, rawResponse, tokensUsed) {
128
- const _expectedPrefixes = new Set(tc.expectedRuleIds.map((r) => r.split("-")[0]));
129
160
  const detectedPrefixes = new Set(detectedRuleIds.map((r) => r.split("-")[0]));
130
161
  const matchedExpected = tc.expectedRuleIds.filter((expected) => {
131
162
  const prefix = expected.split("-")[0];
@@ -13,6 +13,8 @@
13
13
  *
14
14
  * Requires: GITHUB_TOKEN environment variable (or gh CLI authenticated).
15
15
  */
16
+ import { evaluateDiff } from "../evaluators/index.js";
17
+ export declare function __setEvaluateDiffImplForTest(fn: typeof evaluateDiff | undefined): void;
16
18
  import type { Finding, Severity } from "../types.js";
17
19
  interface ReviewArgs {
18
20
  pr: number;
@@ -35,6 +37,21 @@ interface ReviewArgs {
35
37
  crossFile: boolean;
36
38
  /** Only run these judges (comma-separated IDs). All others are disabled. */
37
39
  judges?: string[];
40
+ /** Enable Layer 2 (LLM) deep review augmentation */
41
+ llmDeepReview?: boolean;
42
+ /** OpenAI-compatible model name (e.g., gpt-4o) */
43
+ llmModel?: string;
44
+ /** OpenAI-compatible base URL override */
45
+ llmBaseUrl?: string;
46
+ /** Max tokens for LLM responses */
47
+ llmMaxTokens?: number;
48
+ /** Enable autopilot: fetch diff, post inline comments, and summary automatically */
49
+ autopilot?: boolean;
50
+ }
51
+ interface PrFile {
52
+ filename: string;
53
+ status: "added" | "modified" | "removed" | "renamed";
54
+ patch?: string;
38
55
  }
39
56
  interface ReviewComment {
40
57
  path: string;
@@ -46,12 +63,31 @@ interface ReviewComment {
46
63
  /** Side for start line (always RIGHT for new code) */
47
64
  start_side?: "RIGHT";
48
65
  }
66
+ export declare function dedupeComments(comments: ReviewComment[]): ReviewComment[];
67
+ export declare function filterAlreadyPostedComments(repo: string, pr: number, token: string | undefined, comments: ReviewComment[]): ReviewComment[];
49
68
  interface DiffHunk {
50
69
  filePath: string;
51
70
  newContent: string;
52
71
  changedLines: number[];
53
72
  }
54
73
  export declare function parsePatchToHunk(filePath: string, patch: string): DiffHunk;
74
+ declare function ghApiRequest(method: string, endpoint: string, token: string, body?: unknown): {
75
+ status: number;
76
+ data: unknown;
77
+ };
78
+ export declare function __setApiRequestImplForTest(fn: typeof ghApiRequest | undefined): void;
79
+ interface LlmClientOptions {
80
+ model: string;
81
+ baseUrl?: string;
82
+ apiKey: string;
83
+ maxTokens?: number;
84
+ }
85
+ declare function callOpenAiChat(prompt: string, opts: LlmClientOptions): Promise<string>;
86
+ export declare function __setCallOpenAiChatImplForTest(fn: typeof callOpenAiChat): void;
87
+ export declare function runLlmDeepReview(prFiles: PrFile[], args: ReviewArgs): Promise<{
88
+ summary?: string;
89
+ warnings?: string[];
90
+ }>;
55
91
  export declare function findingToCommentBody(finding: Finding, fpRate?: number): string;
56
92
  interface ReviewResult {
57
93
  filesAnalyzed: number;
@@ -64,6 +100,9 @@ interface ReviewResult {
64
100
  fpSuppressed: number;
65
101
  approved: boolean;
66
102
  comments: ReviewComment[];
103
+ /** Optional LLM deep review summary (non-inline). */
104
+ llmSummary?: string;
105
+ llmWarnings?: string[];
67
106
  }
68
107
  /**
69
108
  * Build a rich PR-level review narrative with executive summary, per-file
@@ -101,5 +140,16 @@ export declare function assessReviewCompleteness(prFiles: Array<{
101
140
  calibrated?: boolean;
102
141
  }): ReviewCompleteness;
103
142
  export declare function parseReviewArgs(argv: string[]): ReviewArgs;
104
- export declare function runReview(argv: string[]): void;
143
+ export declare function runReview(argv: string[]): Promise<void>;
144
+ /**
145
+ * Programmatic autopilot entrypoint for GitHub App / automations.
146
+ */
147
+ export declare function runReviewAutopilot(pr: number, repo?: string): Promise<void>;
148
+ export declare const __test: {
149
+ __setCallOpenAiChatImplForTest: typeof __setCallOpenAiChatImplForTest;
150
+ __setApiRequestImplForTest: typeof __setApiRequestImplForTest;
151
+ __setEvaluateDiffImplForTest: typeof __setEvaluateDiffImplForTest;
152
+ runLlmDeepReview: typeof runLlmDeepReview;
153
+ __evaluateDiffForTest: typeof evaluateDiff;
154
+ };
105
155
  export {};