@kevinrabun/judges 3.118.0 → 3.121.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +31 -1
- package/dist/api.d.ts +3 -2
- package/dist/api.js +4 -2
- package/dist/cli-dispatch.d.ts +7 -0
- package/dist/cli-dispatch.js +654 -0
- package/dist/cli-formatters.d.ts +6 -0
- package/dist/cli-formatters.js +186 -0
- package/dist/cli.js +69 -4159
- package/dist/commands/baseline.js +2 -42
- package/dist/commands/coverage.js +3 -39
- package/dist/commands/diff.js +2 -38
- package/dist/commands/fix-pr.js +2 -23
- package/dist/commands/fix.js +3 -27
- package/dist/commands/llm-benchmark.d.ts +7 -0
- package/dist/commands/llm-benchmark.js +27 -1
- package/dist/commands/quality-gate.js +1 -12
- package/dist/commands/review-parallel.js +1 -19
- package/dist/commands/review.js +2 -33
- package/dist/commands/rule-test.js +1 -15
- package/dist/commands/tune.js +2 -29
- package/dist/commands/watch.js +3 -42
- package/dist/evaluators/hallucination-detection.js +343 -0
- package/dist/evaluators/index.d.ts +14 -11
- package/dist/evaluators/index.js +4 -182
- package/dist/evaluators/security.js +226 -2
- package/dist/evaluators/suppressions.d.ts +49 -0
- package/dist/evaluators/suppressions.js +185 -0
- package/dist/ext-to-lang.d.ts +16 -0
- package/dist/ext-to-lang.js +60 -0
- package/dist/github-app.d.ts +1 -3
- package/dist/github-app.js +2 -34
- package/dist/parallel.js +2 -14
- package/dist/tools/deep-review.d.ts +13 -2
- package/dist/tools/deep-review.js +76 -16
- package/dist/tools/register-evaluation.js +2 -29
- package/dist/tools/register-review.js +17 -3
- package/dist/tools/register-workflow.js +7 -1
- package/package.json +1 -1
- package/server.json +2 -2
package/README.md
CHANGED
|
@@ -15,7 +15,7 @@ An MCP (Model Context Protocol) server that provides a panel of **45 specialized
|
|
|
15
15
|
[](https://www.npmjs.com/package/@kevinrabun/judges)
|
|
16
16
|
[](https://www.npmjs.com/package/@kevinrabun/judges)
|
|
17
17
|
[](https://opensource.org/licenses/MIT)
|
|
18
|
-
[](https://github.com/KevinRabun/judges/actions)
|
|
19
19
|
|
|
20
20
|
> 📰 **Packages**
|
|
21
21
|
> - **CLI**: `@kevinrabun/judges-cli` → binary `judges` (use `npx @kevinrabun/judges-cli eval --file app.ts`).
|
|
@@ -1098,6 +1098,36 @@ Analyze a dependency manifest file for supply-chain risks, version pinning issue
|
|
|
1098
1098
|
| `manifestType` | string | yes | File type: `package.json`, `requirements.txt`, etc. |
|
|
1099
1099
|
| `context` | string | no | Optional context |
|
|
1100
1100
|
|
|
1101
|
+
### `evaluate_git_diff`
|
|
1102
|
+
Evaluate only **changed lines** from a git diff. Provide either `repoPath` for a live git diff or `diffText` for a pre-computed unified diff.
|
|
1103
|
+
|
|
1104
|
+
| Parameter | Type | Required | Description |
|
|
1105
|
+
|-----------|------|----------|-------------|
|
|
1106
|
+
| `repoPath` | string | conditional | Absolute path to the git repository |
|
|
1107
|
+
| `base` | string | no | Git ref to diff against (default: `HEAD~1`) |
|
|
1108
|
+
| `diffText` | string | conditional | Pre-computed unified diff text |
|
|
1109
|
+
| `confidenceFilter` | number | no | Minimum confidence threshold for findings (0–1) |
|
|
1110
|
+
| `autoTune` | boolean | no | Apply feedback-driven auto-tuning (default: false) |
|
|
1111
|
+
| `maxPromptChars` | number | no | Max character budget for LLM prompts (default: 100000, 0 = unlimited) |
|
|
1112
|
+
| `config` | object | no | Inline configuration |
|
|
1113
|
+
|
|
1114
|
+
### `re_evaluate_with_context`
|
|
1115
|
+
Re-run the tribunal with **prior findings as context** for iterative refinement. Supports dispute resolution, developer context injection, and focus-area filtering.
|
|
1116
|
+
|
|
1117
|
+
| Parameter | Type | Required | Description |
|
|
1118
|
+
|-----------|------|----------|-------------|
|
|
1119
|
+
| `code` | string | yes | Source code to re-evaluate |
|
|
1120
|
+
| `language` | string | yes | Programming language |
|
|
1121
|
+
| `disputedRuleIds` | string[] | no | Rule IDs the developer disputes as false positives |
|
|
1122
|
+
| `acceptedRuleIds` | string[] | no | Rule IDs the developer accepts |
|
|
1123
|
+
| `developerContext` | string | no | Free-form explanation of developer intent |
|
|
1124
|
+
| `focusAreas` | string[] | no | Specific areas to focus on (e.g., `["security"]`) |
|
|
1125
|
+
| `confidenceFilter` | number | no | Minimum confidence threshold (default: 0.5) |
|
|
1126
|
+
| `filePath` | string | no | File path for context-aware evaluation |
|
|
1127
|
+
| `deepReview` | boolean | no | Include LLM deep-review prompt section |
|
|
1128
|
+
| `relatedFiles` | array | no | Cross-file context `{ path, snippet, relationship? }[]` |
|
|
1129
|
+
| `maxPromptChars` | number | no | Max character budget for LLM prompts (default: 100000, 0 = unlimited) |
|
|
1130
|
+
|
|
1101
1131
|
#### Judge IDs
|
|
1102
1132
|
|
|
1103
1133
|
`data-security` · `cybersecurity` · `cost-effectiveness` · `scalability` · `cloud-readiness` · `software-practices` · `accessibility` · `api-design` · `reliability` · `observability` · `performance` · `compliance` · `data-sovereignty` · `testing` · `documentation` · `internationalization` · `dependency-health` · `concurrency` · `ethics-bias` · `maintainability` · `error-handling` · `authentication` · `database` · `caching` · `configuration-management` · `backwards-compatibility` · `portability` · `ux` · `logging-privacy` · `rate-limiting` · `ci-cd` · `code-structure` · `agent-instructions` · `ai-code-safety` · `framework-safety` · `iac-security` · `false-positive-review`
|
package/dist/api.d.ts
CHANGED
|
@@ -11,6 +11,7 @@
|
|
|
11
11
|
export type { Severity, Verdict, Finding, Patch, LangFamily, JudgesConfig, RuleOverride, ProjectFile, ProjectVerdict, DiffVerdict, DependencyEntry, DependencyVerdict, JudgeEvaluation, TribunalVerdict, JudgeDefinition, EvaluationContextV2, EvidenceBundleV2, SpecializedFindingV2, TribunalVerdictV2, MustFixGateOptions, MustFixGateResult, AppBuilderWorkflowResult, PlainLanguageFinding, WorkflowTask, PolicyProfile, SuppressionRecord, SuppressionResult, ExecutionTrace, RuleTrace, StreamingBatch, JudgeSelectionContext, JudgeSelectionResult, SessionContext, } from "./types.js";
|
|
12
12
|
export { JudgesError, ConfigError, EvaluationError, ParseError } from "./errors.js";
|
|
13
13
|
export { parseConfig, defaultConfig, mergeConfigs, discoverCascadingConfigs, loadCascadingConfig, loadConfigFile, expandEnvPlaceholders, loadPluginJudges, validatePluginSpecifiers, isValidJudgeDefinition, validateJudgeDefinition, applyOverridesForFile, applyLanguageProfile, resolveExtendsConfig, } from "./config.js";
|
|
14
|
+
export { EXT_TO_LANG, SUPPORTED_EXTENSIONS, detectLanguageFromPath } from "./ext-to-lang.js";
|
|
14
15
|
export { JUDGES, getJudge, getJudgeSummaries } from "./judges/index.js";
|
|
15
16
|
export { evaluateWithJudge, evaluateWithTribunal, evaluateWithTribunalStreaming, evaluateProject, evaluateDiff, analyzeDependencies, enrichWithPatches, crossEvaluatorDedup, crossFileDedup, diffFindings, formatFindingDiff, evaluateNetChangeGate, applyInlineSuppressions, applyInlineSuppressionsWithAudit, runAppBuilderWorkflow, formatVerdictAsMarkdown, formatEvaluationAsMarkdown, clearEvaluationCaches, scanProjectWideSecurityPatterns, } from "./evaluators/index.js";
|
|
16
17
|
export type { FindingDiff, NetChangeGateOptions, NetChangeGateResult, EvaluationOptions } from "./evaluators/index.js";
|
|
@@ -20,7 +21,7 @@ export { getPreset, composePresets, listPresets, PRESETS } from "./presets.js";
|
|
|
20
21
|
export type { Preset } from "./presets.js";
|
|
21
22
|
export { evaluateCodeV2, evaluateProjectV2, getSupportedPolicyProfiles } from "./evaluators/v2.js";
|
|
22
23
|
export { analyzeCrossFileTaint } from "./ast/cross-file-taint.js";
|
|
23
|
-
export { buildSingleJudgeDeepReviewSection, buildTribunalDeepReviewSection, buildSimplifiedDeepReviewSection, isContentPolicyRefusal, DEEP_REVIEW_PROMPT_INTRO, DEEP_REVIEW_IDENTITY, } from "./tools/deep-review.js";
|
|
24
|
+
export { buildSingleJudgeDeepReviewSection, buildTribunalDeepReviewSection, buildSimplifiedDeepReviewSection, formatRelatedFilesSection, isContentPolicyRefusal, DEEP_REVIEW_PROMPT_INTRO, DEEP_REVIEW_IDENTITY, DEFAULT_MAX_PROMPT_CHARS, } from "./tools/deep-review.js";
|
|
24
25
|
export type { RelatedFileSnippet } from "./tools/deep-review.js";
|
|
25
26
|
export { getCondensedCriteria } from "./tools/prompts.js";
|
|
26
27
|
export { parseDismissedFindings, recordL2Feedback, loadFeedbackStore, saveFeedbackStore, addFeedback, computeFeedbackStats, getFpRateByRule, mergeFeedbackStores, computeTeamFeedbackStats, formatTeamStatsOutput, } from "./commands/feedback.js";
|
|
@@ -69,7 +70,7 @@ export { compareCapabilities, formatComparisonReport, formatFullComparisonMatrix
|
|
|
69
70
|
export type { ToolProfile, ToolCapability, ComparisonResult } from "./comparison.js";
|
|
70
71
|
export { runBenchmarkSuite, benchmarkGate, formatBenchmarkReport, formatBenchmarkMarkdown, analyzeL2Coverage, formatL2CoverageReport, ingestFindingsAsBenchmarkCases, deduplicateIngestCases, BENCHMARK_CASES, } from "./commands/benchmark.js";
|
|
71
72
|
export type { BenchmarkCase, BenchmarkResult, BenchmarkGateOptions, BenchmarkGateResult, L2CoverageAnalysis, L2JudgeCoverage, L2CategoryCoverage, } from "./commands/benchmark.js";
|
|
72
|
-
export { parseLlmRuleIds, constructPerJudgePrompt, constructTribunalPrompt, selectStratifiedSample, scoreLlmCase, computeLlmMetrics, formatLlmSnapshotMarkdown, formatLayerComparisonMarkdown, extractValidatedLlmFindings, getValidRulePrefixes, } from "./commands/llm-benchmark.js";
|
|
73
|
+
export { parseLlmRuleIds, constructPerJudgePrompt, constructTribunalPrompt, selectStratifiedSample, scoreLlmCase, computeLlmMetrics, formatLlmSnapshotMarkdown, formatLayerComparisonMarkdown, extractValidatedLlmFindings, getValidRulePrefixes, getTribunalValidPrefixes, } from "./commands/llm-benchmark.js";
|
|
73
74
|
export type { LlmBenchmarkSnapshot, LlmCaseResult } from "./commands/llm-benchmark.js";
|
|
74
75
|
export type { LlmFinding, ValidationResult } from "./probabilistic/llm-response-validator.js";
|
|
75
76
|
export { optimizeBenchmark, formatAmendmentSection, createEmptyStore, mergeAmendments, } from "./commands/llm-benchmark-optimizer.js";
|
package/dist/api.js
CHANGED
|
@@ -12,6 +12,8 @@
|
|
|
12
12
|
export { JudgesError, ConfigError, EvaluationError, ParseError } from "./errors.js";
|
|
13
13
|
// โโโ Config โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
|
14
14
|
export { parseConfig, defaultConfig, mergeConfigs, discoverCascadingConfigs, loadCascadingConfig, loadConfigFile, expandEnvPlaceholders, loadPluginJudges, validatePluginSpecifiers, isValidJudgeDefinition, validateJudgeDefinition, applyOverridesForFile, applyLanguageProfile, resolveExtendsConfig, } from "./config.js";
|
|
15
|
+
// โโโ Language Detection โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
|
16
|
+
export { EXT_TO_LANG, SUPPORTED_EXTENSIONS, detectLanguageFromPath } from "./ext-to-lang.js";
|
|
15
17
|
// โโโ Judge Registry โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
|
16
18
|
export { JUDGES, getJudge, getJudgeSummaries } from "./judges/index.js";
|
|
17
19
|
// โโโ Core Evaluation Functions โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
|
@@ -27,7 +29,7 @@ export { evaluateCodeV2, evaluateProjectV2, getSupportedPolicyProfiles } from ".
|
|
|
27
29
|
// โโโ Cross-File Taint Analysis โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
|
28
30
|
export { analyzeCrossFileTaint } from "./ast/cross-file-taint.js";
|
|
29
31
|
// โโโ Deep Review Prompts โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
|
30
|
-
export { buildSingleJudgeDeepReviewSection, buildTribunalDeepReviewSection, buildSimplifiedDeepReviewSection, isContentPolicyRefusal, DEEP_REVIEW_PROMPT_INTRO, DEEP_REVIEW_IDENTITY, } from "./tools/deep-review.js";
|
|
32
|
+
export { buildSingleJudgeDeepReviewSection, buildTribunalDeepReviewSection, buildSimplifiedDeepReviewSection, formatRelatedFilesSection, isContentPolicyRefusal, DEEP_REVIEW_PROMPT_INTRO, DEEP_REVIEW_IDENTITY, DEFAULT_MAX_PROMPT_CHARS, } from "./tools/deep-review.js";
|
|
31
33
|
// โโโ Prompt Utilities โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
|
32
34
|
export { getCondensedCriteria } from "./tools/prompts.js";
|
|
33
35
|
// โโโ Feedback & Calibration โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
|
@@ -78,7 +80,7 @@ export { compareCapabilities, formatComparisonReport, formatFullComparisonMatrix
|
|
|
78
80
|
// โโโ Benchmark Gate โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
|
79
81
|
export { runBenchmarkSuite, benchmarkGate, formatBenchmarkReport, formatBenchmarkMarkdown, analyzeL2Coverage, formatL2CoverageReport, ingestFindingsAsBenchmarkCases, deduplicateIngestCases, BENCHMARK_CASES, } from "./commands/benchmark.js";
|
|
80
82
|
// โโโ LLM Benchmark โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
|
81
|
-
export { parseLlmRuleIds, constructPerJudgePrompt, constructTribunalPrompt, selectStratifiedSample, scoreLlmCase, computeLlmMetrics, formatLlmSnapshotMarkdown, formatLayerComparisonMarkdown, extractValidatedLlmFindings, getValidRulePrefixes, } from "./commands/llm-benchmark.js";
|
|
83
|
+
export { parseLlmRuleIds, constructPerJudgePrompt, constructTribunalPrompt, selectStratifiedSample, scoreLlmCase, computeLlmMetrics, formatLlmSnapshotMarkdown, formatLayerComparisonMarkdown, extractValidatedLlmFindings, getValidRulePrefixes, getTribunalValidPrefixes, } from "./commands/llm-benchmark.js";
|
|
82
84
|
// โโโ LLM Benchmark Optimizer (Self-Teaching) โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
|
83
85
|
export { optimizeBenchmark, formatAmendmentSection, createEmptyStore, mergeAmendments, } from "./commands/llm-benchmark-optimizer.js";
|
|
84
86
|
// Review autopilot (GitHub App / scripts)
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Command dispatch table — maps CLI command names to their module path
|
|
3
|
+
* and exported handler function name. Each entry is lazily imported.
|
|
4
|
+
*
|
|
5
|
+
* Format: "command-name": ["./module-path.js", "exportedFunctionName"]
|
|
6
|
+
*/
|
|
7
|
+
export declare const COMMAND_TABLE: Record<string, [string, string]>;
|