@kevinrabun/judges 3.118.0 → 3.121.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. package/README.md +31 -1
  2. package/dist/api.d.ts +3 -2
  3. package/dist/api.js +4 -2
  4. package/dist/cli-dispatch.d.ts +7 -0
  5. package/dist/cli-dispatch.js +654 -0
  6. package/dist/cli-formatters.d.ts +6 -0
  7. package/dist/cli-formatters.js +186 -0
  8. package/dist/cli.js +69 -4159
  9. package/dist/commands/baseline.js +2 -42
  10. package/dist/commands/coverage.js +3 -39
  11. package/dist/commands/diff.js +2 -38
  12. package/dist/commands/fix-pr.js +2 -23
  13. package/dist/commands/fix.js +3 -27
  14. package/dist/commands/llm-benchmark.d.ts +7 -0
  15. package/dist/commands/llm-benchmark.js +27 -1
  16. package/dist/commands/quality-gate.js +1 -12
  17. package/dist/commands/review-parallel.js +1 -19
  18. package/dist/commands/review.js +2 -33
  19. package/dist/commands/rule-test.js +1 -15
  20. package/dist/commands/tune.js +2 -29
  21. package/dist/commands/watch.js +3 -42
  22. package/dist/evaluators/hallucination-detection.js +343 -0
  23. package/dist/evaluators/index.d.ts +14 -11
  24. package/dist/evaluators/index.js +4 -182
  25. package/dist/evaluators/security.js +226 -2
  26. package/dist/evaluators/suppressions.d.ts +49 -0
  27. package/dist/evaluators/suppressions.js +185 -0
  28. package/dist/ext-to-lang.d.ts +16 -0
  29. package/dist/ext-to-lang.js +60 -0
  30. package/dist/github-app.d.ts +1 -3
  31. package/dist/github-app.js +2 -34
  32. package/dist/parallel.js +2 -14
  33. package/dist/tools/deep-review.d.ts +13 -2
  34. package/dist/tools/deep-review.js +76 -16
  35. package/dist/tools/register-evaluation.js +2 -29
  36. package/dist/tools/register-review.js +17 -3
  37. package/dist/tools/register-workflow.js +7 -1
  38. package/package.json +1 -1
  39. package/server.json +2 -2
package/README.md CHANGED
@@ -15,7 +15,7 @@ An MCP (Model Context Protocol) server that provides a panel of **45 specialized
15
15
  [![npm](https://img.shields.io/npm/v/@kevinrabun/judges)](https://www.npmjs.com/package/@kevinrabun/judges)
16
16
  [![npm downloads](https://img.shields.io/npm/dw/@kevinrabun/judges)](https://www.npmjs.com/package/@kevinrabun/judges)
17
17
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
18
- [![Tests](https://img.shields.io/badge/tests-2412-brightgreen)](https://github.com/KevinRabun/judges/actions)
18
+ [![Tests](https://img.shields.io/badge/tests-2481-brightgreen)](https://github.com/KevinRabun/judges/actions)
19
19
 
20
20
  > 🔰 **Packages**
21
21
  > - **CLI**: `@kevinrabun/judges-cli` → binary `judges` (use `npx @kevinrabun/judges-cli eval --file app.ts`).
@@ -1098,6 +1098,36 @@ Analyze a dependency manifest file for supply-chain risks, version pinning issue
1098
1098
  | `manifestType` | string | yes | File type: `package.json`, `requirements.txt`, etc. |
1099
1099
  | `context` | string | no | Optional context |
1100
1100
 
1101
+ ### `evaluate_git_diff`
1102
+ Evaluate only **changed lines** from a git diff. Provide either `repoPath` for a live git diff or `diffText` for a pre-computed unified diff.
1103
+
1104
+ | Parameter | Type | Required | Description |
1105
+ |-----------|------|----------|-------------|
1106
+ | `repoPath` | string | conditional | Absolute path to the git repository |
1107
+ | `base` | string | no | Git ref to diff against (default: `HEAD~1`) |
1108
+ | `diffText` | string | conditional | Pre-computed unified diff text |
1109
+ | `confidenceFilter` | number | no | Minimum confidence threshold for findings (0–1) |
1110
+ | `autoTune` | boolean | no | Apply feedback-driven auto-tuning (default: false) |
1111
+ | `maxPromptChars` | number | no | Max character budget for LLM prompts (default: 100000, 0 = unlimited) |
1112
+ | `config` | object | no | Inline configuration |
1113
+
1114
+ ### `re_evaluate_with_context`
1115
+ Re-run the tribunal with **prior findings as context** for iterative refinement. Supports dispute resolution, developer context injection, and focus-area filtering.
1116
+
1117
+ | Parameter | Type | Required | Description |
1118
+ |-----------|------|----------|-------------|
1119
+ | `code` | string | yes | Source code to re-evaluate |
1120
+ | `language` | string | yes | Programming language |
1121
+ | `disputedRuleIds` | string[] | no | Rule IDs the developer disputes as false positives |
1122
+ | `acceptedRuleIds` | string[] | no | Rule IDs the developer accepts |
1123
+ | `developerContext` | string | no | Free-form explanation of developer intent |
1124
+ | `focusAreas` | string[] | no | Specific areas to focus on (e.g., `["security"]`) |
1125
+ | `confidenceFilter` | number | no | Minimum confidence threshold (default: 0.5) |
1126
+ | `filePath` | string | no | File path for context-aware evaluation |
1127
+ | `deepReview` | boolean | no | Include LLM deep-review prompt section |
1128
+ | `relatedFiles` | array | no | Cross-file context `{ path, snippet, relationship? }[]` |
1129
+ | `maxPromptChars` | number | no | Max character budget for LLM prompts (default: 100000, 0 = unlimited) |
1130
+
1101
1131
  #### Judge IDs
1102
1132
 
1103
1133
  `data-security` · `cybersecurity` · `cost-effectiveness` · `scalability` · `cloud-readiness` · `software-practices` · `accessibility` · `api-design` · `reliability` · `observability` · `performance` · `compliance` · `data-sovereignty` · `testing` · `documentation` · `internationalization` · `dependency-health` · `concurrency` · `ethics-bias` · `maintainability` · `error-handling` · `authentication` · `database` · `caching` · `configuration-management` · `backwards-compatibility` · `portability` · `ux` · `logging-privacy` · `rate-limiting` · `ci-cd` · `code-structure` · `agent-instructions` · `ai-code-safety` · `framework-safety` · `iac-security` · `false-positive-review`
package/dist/api.d.ts CHANGED
@@ -11,6 +11,7 @@
11
11
  export type { Severity, Verdict, Finding, Patch, LangFamily, JudgesConfig, RuleOverride, ProjectFile, ProjectVerdict, DiffVerdict, DependencyEntry, DependencyVerdict, JudgeEvaluation, TribunalVerdict, JudgeDefinition, EvaluationContextV2, EvidenceBundleV2, SpecializedFindingV2, TribunalVerdictV2, MustFixGateOptions, MustFixGateResult, AppBuilderWorkflowResult, PlainLanguageFinding, WorkflowTask, PolicyProfile, SuppressionRecord, SuppressionResult, ExecutionTrace, RuleTrace, StreamingBatch, JudgeSelectionContext, JudgeSelectionResult, SessionContext, } from "./types.js";
12
12
  export { JudgesError, ConfigError, EvaluationError, ParseError } from "./errors.js";
13
13
  export { parseConfig, defaultConfig, mergeConfigs, discoverCascadingConfigs, loadCascadingConfig, loadConfigFile, expandEnvPlaceholders, loadPluginJudges, validatePluginSpecifiers, isValidJudgeDefinition, validateJudgeDefinition, applyOverridesForFile, applyLanguageProfile, resolveExtendsConfig, } from "./config.js";
14
+ export { EXT_TO_LANG, SUPPORTED_EXTENSIONS, detectLanguageFromPath } from "./ext-to-lang.js";
14
15
  export { JUDGES, getJudge, getJudgeSummaries } from "./judges/index.js";
15
16
  export { evaluateWithJudge, evaluateWithTribunal, evaluateWithTribunalStreaming, evaluateProject, evaluateDiff, analyzeDependencies, enrichWithPatches, crossEvaluatorDedup, crossFileDedup, diffFindings, formatFindingDiff, evaluateNetChangeGate, applyInlineSuppressions, applyInlineSuppressionsWithAudit, runAppBuilderWorkflow, formatVerdictAsMarkdown, formatEvaluationAsMarkdown, clearEvaluationCaches, scanProjectWideSecurityPatterns, } from "./evaluators/index.js";
16
17
  export type { FindingDiff, NetChangeGateOptions, NetChangeGateResult, EvaluationOptions } from "./evaluators/index.js";
@@ -20,7 +21,7 @@ export { getPreset, composePresets, listPresets, PRESETS } from "./presets.js";
20
21
  export type { Preset } from "./presets.js";
21
22
  export { evaluateCodeV2, evaluateProjectV2, getSupportedPolicyProfiles } from "./evaluators/v2.js";
22
23
  export { analyzeCrossFileTaint } from "./ast/cross-file-taint.js";
23
- export { buildSingleJudgeDeepReviewSection, buildTribunalDeepReviewSection, buildSimplifiedDeepReviewSection, isContentPolicyRefusal, DEEP_REVIEW_PROMPT_INTRO, DEEP_REVIEW_IDENTITY, } from "./tools/deep-review.js";
24
+ export { buildSingleJudgeDeepReviewSection, buildTribunalDeepReviewSection, buildSimplifiedDeepReviewSection, formatRelatedFilesSection, isContentPolicyRefusal, DEEP_REVIEW_PROMPT_INTRO, DEEP_REVIEW_IDENTITY, DEFAULT_MAX_PROMPT_CHARS, } from "./tools/deep-review.js";
24
25
  export type { RelatedFileSnippet } from "./tools/deep-review.js";
25
26
  export { getCondensedCriteria } from "./tools/prompts.js";
26
27
  export { parseDismissedFindings, recordL2Feedback, loadFeedbackStore, saveFeedbackStore, addFeedback, computeFeedbackStats, getFpRateByRule, mergeFeedbackStores, computeTeamFeedbackStats, formatTeamStatsOutput, } from "./commands/feedback.js";
@@ -69,7 +70,7 @@ export { compareCapabilities, formatComparisonReport, formatFullComparisonMatrix
69
70
  export type { ToolProfile, ToolCapability, ComparisonResult } from "./comparison.js";
70
71
  export { runBenchmarkSuite, benchmarkGate, formatBenchmarkReport, formatBenchmarkMarkdown, analyzeL2Coverage, formatL2CoverageReport, ingestFindingsAsBenchmarkCases, deduplicateIngestCases, BENCHMARK_CASES, } from "./commands/benchmark.js";
71
72
  export type { BenchmarkCase, BenchmarkResult, BenchmarkGateOptions, BenchmarkGateResult, L2CoverageAnalysis, L2JudgeCoverage, L2CategoryCoverage, } from "./commands/benchmark.js";
72
- export { parseLlmRuleIds, constructPerJudgePrompt, constructTribunalPrompt, selectStratifiedSample, scoreLlmCase, computeLlmMetrics, formatLlmSnapshotMarkdown, formatLayerComparisonMarkdown, extractValidatedLlmFindings, getValidRulePrefixes, } from "./commands/llm-benchmark.js";
73
+ export { parseLlmRuleIds, constructPerJudgePrompt, constructTribunalPrompt, selectStratifiedSample, scoreLlmCase, computeLlmMetrics, formatLlmSnapshotMarkdown, formatLayerComparisonMarkdown, extractValidatedLlmFindings, getValidRulePrefixes, getTribunalValidPrefixes, } from "./commands/llm-benchmark.js";
73
74
  export type { LlmBenchmarkSnapshot, LlmCaseResult } from "./commands/llm-benchmark.js";
74
75
  export type { LlmFinding, ValidationResult } from "./probabilistic/llm-response-validator.js";
75
76
  export { optimizeBenchmark, formatAmendmentSection, createEmptyStore, mergeAmendments, } from "./commands/llm-benchmark-optimizer.js";
package/dist/api.js CHANGED
@@ -12,6 +12,8 @@
12
12
  export { JudgesError, ConfigError, EvaluationError, ParseError } from "./errors.js";
13
13
  // ─── Config ──────────────────────────────────────────────────────────────────
14
14
  export { parseConfig, defaultConfig, mergeConfigs, discoverCascadingConfigs, loadCascadingConfig, loadConfigFile, expandEnvPlaceholders, loadPluginJudges, validatePluginSpecifiers, isValidJudgeDefinition, validateJudgeDefinition, applyOverridesForFile, applyLanguageProfile, resolveExtendsConfig, } from "./config.js";
15
+ // ─── Language Detection ──────────────────────────────────────────────────────
16
+ export { EXT_TO_LANG, SUPPORTED_EXTENSIONS, detectLanguageFromPath } from "./ext-to-lang.js";
15
17
  // ─── Judge Registry ──────────────────────────────────────────────────────────
16
18
  export { JUDGES, getJudge, getJudgeSummaries } from "./judges/index.js";
17
19
  // ─── Core Evaluation Functions ───────────────────────────────────────────────
@@ -27,7 +29,7 @@ export { evaluateCodeV2, evaluateProjectV2, getSupportedPolicyProfiles } from ".
27
29
  // ─── Cross-File Taint Analysis ───────────────────────────────────────────────
28
30
  export { analyzeCrossFileTaint } from "./ast/cross-file-taint.js";
29
31
  // ─── Deep Review Prompts ─────────────────────────────────────────────────────
30
- export { buildSingleJudgeDeepReviewSection, buildTribunalDeepReviewSection, buildSimplifiedDeepReviewSection, isContentPolicyRefusal, DEEP_REVIEW_PROMPT_INTRO, DEEP_REVIEW_IDENTITY, } from "./tools/deep-review.js";
32
+ export { buildSingleJudgeDeepReviewSection, buildTribunalDeepReviewSection, buildSimplifiedDeepReviewSection, formatRelatedFilesSection, isContentPolicyRefusal, DEEP_REVIEW_PROMPT_INTRO, DEEP_REVIEW_IDENTITY, DEFAULT_MAX_PROMPT_CHARS, } from "./tools/deep-review.js";
31
33
  // ─── Prompt Utilities ────────────────────────────────────────────────────────
32
34
  export { getCondensedCriteria } from "./tools/prompts.js";
33
35
  // ─── Feedback & Calibration ──────────────────────────────────────────────────
@@ -78,7 +80,7 @@ export { compareCapabilities, formatComparisonReport, formatFullComparisonMatrix
78
80
  // ─── Benchmark Gate ──────────────────────────────────────────────────────────
79
81
  export { runBenchmarkSuite, benchmarkGate, formatBenchmarkReport, formatBenchmarkMarkdown, analyzeL2Coverage, formatL2CoverageReport, ingestFindingsAsBenchmarkCases, deduplicateIngestCases, BENCHMARK_CASES, } from "./commands/benchmark.js";
80
82
  // ─── LLM Benchmark ───────────────────────────────────────────────────────────
81
- export { parseLlmRuleIds, constructPerJudgePrompt, constructTribunalPrompt, selectStratifiedSample, scoreLlmCase, computeLlmMetrics, formatLlmSnapshotMarkdown, formatLayerComparisonMarkdown, extractValidatedLlmFindings, getValidRulePrefixes, } from "./commands/llm-benchmark.js";
83
+ export { parseLlmRuleIds, constructPerJudgePrompt, constructTribunalPrompt, selectStratifiedSample, scoreLlmCase, computeLlmMetrics, formatLlmSnapshotMarkdown, formatLayerComparisonMarkdown, extractValidatedLlmFindings, getValidRulePrefixes, getTribunalValidPrefixes, } from "./commands/llm-benchmark.js";
82
84
  // ─── LLM Benchmark Optimizer (Self-Teaching) ─────────────────────────────────
83
85
  export { optimizeBenchmark, formatAmendmentSection, createEmptyStore, mergeAmendments, } from "./commands/llm-benchmark-optimizer.js";
84
86
  // Review autopilot (GitHub App / scripts)
@@ -0,0 +1,7 @@
1
+ /**
2
+ * Command dispatch table — maps CLI command names to their module path
3
+ * and exported handler function name. Each entry is lazily imported.
4
+ *
5
+ * Format: "command-name": ["./module-path.js", "exportedFunctionName"]
6
+ */
7
+ export declare const COMMAND_TABLE: Record<string, [string, string]>;