@kevinrabun/judges 3.115.4 → 3.117.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114)
  1. package/agents/accessibility.judge.md +7 -0
  2. package/agents/agent-instructions.judge.md +7 -0
  3. package/agents/ai-code-safety.judge.md +7 -0
  4. package/agents/api-contract.judge.md +7 -0
  5. package/agents/api-design.judge.md +7 -0
  6. package/agents/authentication.judge.md +7 -0
  7. package/agents/backwards-compatibility.judge.md +7 -0
  8. package/agents/caching.judge.md +7 -0
  9. package/agents/ci-cd.judge.md +7 -0
  10. package/agents/cloud-readiness.judge.md +7 -0
  11. package/agents/concurrency.judge.md +7 -0
  12. package/agents/configuration-management.judge.md +7 -0
  13. package/agents/cybersecurity.judge.md +7 -0
  14. package/agents/data-security.judge.md +7 -0
  15. package/agents/dependency-health.judge.md +7 -0
  16. package/agents/documentation.judge.md +7 -0
  17. package/agents/error-handling.judge.md +7 -0
  18. package/agents/ethics-bias.judge.md +7 -0
  19. package/agents/false-positive-review.judge.md +12 -0
  20. package/agents/framework-safety.judge.md +7 -0
  21. package/agents/hallucination-detection.judge.md +13 -0
  22. package/agents/iac-security.judge.md +7 -0
  23. package/agents/intent-alignment.judge.md +13 -0
  24. package/agents/logging-privacy.judge.md +7 -0
  25. package/agents/maintainability.judge.md +7 -0
  26. package/agents/multi-turn-coherence.judge.md +7 -0
  27. package/agents/observability.judge.md +7 -0
  28. package/agents/portability.judge.md +7 -0
  29. package/agents/rate-limiting.judge.md +7 -0
  30. package/agents/reliability.judge.md +7 -0
  31. package/agents/security.judge.md +13 -0
  32. package/agents/testing.judge.md +7 -0
  33. package/agents/ux.judge.md +7 -0
  34. package/dist/a2a-protocol.d.ts +136 -0
  35. package/dist/a2a-protocol.js +218 -0
  36. package/dist/api.d.ts +21 -3
  37. package/dist/api.js +21 -1
  38. package/dist/audit-trail.d.ts +245 -0
  39. package/dist/audit-trail.js +257 -0
  40. package/dist/commands/benchmark-advanced.js +51 -51
  41. package/dist/commands/benchmark-ai-agents.js +16 -16
  42. package/dist/commands/benchmark-compliance-ethics.js +12 -12
  43. package/dist/commands/benchmark-expanded-2.js +2 -2
  44. package/dist/commands/benchmark-expanded.js +2 -2
  45. package/dist/commands/benchmark-infrastructure.js +12 -12
  46. package/dist/commands/benchmark-languages.js +11 -11
  47. package/dist/commands/benchmark-quality-ops.js +7 -7
  48. package/dist/commands/benchmark-security-deep.js +9 -9
  49. package/dist/commands/benchmark.js +1 -1
  50. package/dist/commands/llm-benchmark-optimizer.d.ts +78 -0
  51. package/dist/commands/llm-benchmark-optimizer.js +241 -0
  52. package/dist/commands/llm-benchmark.d.ts +4 -2
  53. package/dist/commands/llm-benchmark.js +40 -12
  54. package/dist/escalation.d.ts +100 -0
  55. package/dist/escalation.js +292 -0
  56. package/dist/evaluation-session.d.ts +74 -0
  57. package/dist/evaluation-session.js +152 -0
  58. package/dist/evaluators/index.d.ts +23 -1
  59. package/dist/evaluators/index.js +192 -3
  60. package/dist/evaluators/judge-selector.d.ts +19 -0
  61. package/dist/evaluators/judge-selector.js +141 -0
  62. package/dist/evaluators/recall-boost.d.ts +27 -0
  63. package/dist/evaluators/recall-boost.js +409 -0
  64. package/dist/feedback-loop.d.ts +62 -0
  65. package/dist/feedback-loop.js +179 -0
  66. package/dist/index.js +2 -0
  67. package/dist/judges/accessibility.js +7 -0
  68. package/dist/judges/agent-instructions.js +7 -0
  69. package/dist/judges/ai-code-safety.js +7 -0
  70. package/dist/judges/api-contract.js +7 -0
  71. package/dist/judges/api-design.js +7 -0
  72. package/dist/judges/authentication.js +7 -0
  73. package/dist/judges/backwards-compatibility.js +7 -0
  74. package/dist/judges/caching.js +7 -0
  75. package/dist/judges/ci-cd.js +7 -0
  76. package/dist/judges/cloud-readiness.js +7 -0
  77. package/dist/judges/concurrency.js +7 -0
  78. package/dist/judges/configuration-management.js +7 -0
  79. package/dist/judges/cybersecurity.js +7 -0
  80. package/dist/judges/data-security.js +7 -0
  81. package/dist/judges/dependency-health.js +7 -0
  82. package/dist/judges/documentation.js +7 -0
  83. package/dist/judges/error-handling.js +7 -0
  84. package/dist/judges/ethics-bias.js +7 -0
  85. package/dist/judges/false-positive-review.js +13 -1
  86. package/dist/judges/framework-safety.js +7 -0
  87. package/dist/judges/hallucination-detection.js +14 -1
  88. package/dist/judges/iac-security.js +7 -0
  89. package/dist/judges/intent-alignment.js +14 -1
  90. package/dist/judges/logging-privacy.js +7 -0
  91. package/dist/judges/maintainability.js +7 -0
  92. package/dist/judges/multi-turn-coherence.js +7 -0
  93. package/dist/judges/observability.js +7 -0
  94. package/dist/judges/portability.js +7 -0
  95. package/dist/judges/rate-limiting.js +7 -0
  96. package/dist/judges/reliability.js +7 -0
  97. package/dist/judges/security.js +14 -1
  98. package/dist/judges/testing.js +7 -0
  99. package/dist/judges/ux.js +7 -0
  100. package/dist/review-conversation.d.ts +87 -0
  101. package/dist/review-conversation.js +307 -0
  102. package/dist/sast-integration.d.ts +112 -0
  103. package/dist/sast-integration.js +215 -0
  104. package/dist/tools/register-evaluation.js +208 -8
  105. package/dist/tools/register-fix.js +24 -1
  106. package/dist/tools/register-resources.d.ts +6 -0
  107. package/dist/tools/register-resources.js +177 -0
  108. package/dist/tools/register-review.js +26 -1
  109. package/dist/tools/register-workflow.js +384 -11
  110. package/dist/tools/validation.d.ts +13 -0
  111. package/dist/tools/validation.js +77 -0
  112. package/dist/types.d.ts +122 -0
  113. package/package.json +25 -12
  114. package/server.json +2 -2
package/dist/types.d.ts CHANGED
@@ -752,3 +752,125 @@ export interface JudgeDefinition {
752
752
  */
753
753
  analyze?: (code: string, language: string, context?: AnalyzeContext) => Finding[];
754
754
  }
755
+ /**
756
+ * Detailed trace of a single rule application within a judge evaluation.
757
+ * Enables "why did judge X flag line Y?" debugging and FP investigation.
758
+ */
759
+ export interface RuleTrace {
760
+ /** Rule ID (e.g. "SEC-001") */
761
+ ruleId: string;
762
+ /** Whether the rule produced any findings */
763
+ matched: boolean;
764
+ /** Number of findings produced */
765
+ findingCount: number;
766
+ /** Confidence of the highest-confidence finding from this rule, if any */
767
+ peakConfidence?: number;
768
+ /** Human-readable reason for match/skip */
769
+ reason?: string;
770
+ }
771
+ /**
772
+ * Full execution trace for a single judge evaluation — captures timing,
773
+ * rules matched, AST resolution details, and skip reasons.
774
+ */
775
+ export interface ExecutionTrace {
776
+ /** Judge that was evaluated */
777
+ judgeId: string;
778
+ judgeName: string;
779
+ /** Wall-clock duration in milliseconds */
780
+ durationMs: number;
781
+ /** Whether the judge was skipped (and why) */
782
+ skipped?: boolean;
783
+ skipReason?: string;
784
+ /** Per-rule traces */
785
+ rules: RuleTrace[];
786
+ /** AST resolution metadata (when AST analysis was used) */
787
+ astResolution?: {
788
+ functionsAnalyzed: number;
789
+ maxComplexity: number;
790
+ taintFlowsDetected: number;
791
+ };
792
+ /** Number of findings before and after post-processing */
793
+ rawFindingCount: number;
794
+ finalFindingCount: number;
795
+ }
796
+ /**
797
+ * A single batch yielded during streaming evaluation — one per judge.
798
+ * Enables progressive result display and early termination.
799
+ */
800
+ export interface StreamingBatch {
801
+ /** The judge that produced this batch */
802
+ judgeId: string;
803
+ judgeName: string;
804
+ /** The evaluation result for this judge */
805
+ evaluation: JudgeEvaluation;
806
+ /** Execution trace for this judge */
807
+ trace: ExecutionTrace;
808
+ /** Running aggregate across all completed judges so far */
809
+ aggregate: {
810
+ completedJudges: number;
811
+ totalJudges: number;
812
+ findingsSoFar: number;
813
+ criticalSoFar: number;
814
+ highSoFar: number;
815
+ currentScore: number;
816
+ currentVerdict: Verdict;
817
+ };
818
+ /** Whether this is the final batch (all judges complete) */
819
+ done: boolean;
820
+ }
821
+ /**
822
+ * Signal-based context used to select which judges are relevant for a file.
823
+ */
824
+ export interface JudgeSelectionContext {
825
+ /** Programming language */
826
+ language: string;
827
+ /** Detected frameworks (e.g. "express", "react", "django") */
828
+ frameworks?: string[];
829
+ /** File classification from shared.ts (e.g. "test", "config", "server") */
830
+ fileCategory?: string;
831
+ /** File path (for glob-based overrides) */
832
+ filePath?: string;
833
+ /** Whether this is a project-level evaluation */
834
+ projectMode?: boolean;
835
+ }
836
+ /**
837
+ * Result of judge selection — which judges to run and why others were skipped.
838
+ */
839
+ export interface JudgeSelectionResult {
840
+ /** Judges selected to run */
841
+ selected: JudgeDefinition[];
842
+ /** Judges skipped with reasons */
843
+ skipped: Array<{
844
+ judgeId: string;
845
+ reason: string;
846
+ }>;
847
+ }
848
+ /**
849
+ * Persistent project context that survives across evaluation calls within
850
+ * the same session. Avoids redundant framework detection, capability scanning,
851
+ * and feedback loading for repeated evaluations of the same project.
852
+ */
853
+ export interface SessionContext {
854
+ /** Detected frameworks across the project */
855
+ frameworks: string[];
856
+ /** Detected project-wide security capabilities */
857
+ capabilities: Set<string>;
858
+ /** Per-file verdict history — tracks how findings evolve */
859
+ verdictHistory: Map<string, {
860
+ score: number;
861
+ findingCount: number;
862
+ timestamp: string;
863
+ }[]>;
864
+ /** Files that have been evaluated (content hash → file path) */
865
+ evaluatedFiles: Map<string, string>;
866
+ /** Session start time */
867
+ startedAt: string;
868
+ /** Number of evaluations performed in this session */
869
+ evaluationCount: number;
870
+ /** User feedback on findings — ruleId → { tp, fp, wontfix counts } */
871
+ feedbackTally: Map<string, {
872
+ tp: number;
873
+ fp: number;
874
+ wontfix: number;
875
+ }>;
876
+ }
package/package.json CHANGED
@@ -1,57 +1,69 @@
1
1
  {
2
2
  "name": "@kevinrabun/judges",
3
- "version": "3.115.4",
3
+ "version": "3.117.0",
4
4
  "description": "45 specialized judges that evaluate AI-generated code for security, cost, and quality.",
5
5
  "mcpName": "io.github.KevinRabun/judges",
6
6
  "type": "module",
7
7
  "main": "dist/index.js",
8
+ "types": "dist/api.d.ts",
8
9
  "bin": {
9
10
  "judges": "packages/judges-cli/bin/judges.js"
10
11
  },
11
12
  "exports": {
12
13
  ".": {
14
+ "types": "./dist/api.d.ts",
13
15
  "import": "./dist/api.js",
14
- "types": "./dist/api.d.ts"
16
+ "default": "./dist/api.js"
15
17
  },
16
18
  "./api": {
19
+ "types": "./dist/api.d.ts",
17
20
  "import": "./dist/api.js",
18
- "types": "./dist/api.d.ts"
21
+ "default": "./dist/api.js"
19
22
  },
20
23
  "./server": {
24
+ "types": "./dist/index.d.ts",
21
25
  "import": "./dist/index.js",
22
- "types": "./dist/index.d.ts"
26
+ "default": "./dist/index.js"
23
27
  },
24
28
  "./sarif": {
29
+ "types": "./dist/api.d.ts",
25
30
  "import": "./dist/api.js",
26
- "types": "./dist/api.d.ts"
31
+ "default": "./dist/api.js"
27
32
  },
28
33
  "./junit": {
34
+ "types": "./dist/api.d.ts",
29
35
  "import": "./dist/api.js",
30
- "types": "./dist/api.d.ts"
36
+ "default": "./dist/api.js"
31
37
  },
32
38
  "./codeclimate": {
39
+ "types": "./dist/api.d.ts",
33
40
  "import": "./dist/api.js",
34
- "types": "./dist/api.d.ts"
41
+ "default": "./dist/api.js"
35
42
  },
36
43
  "./badge": {
44
+ "types": "./dist/api.d.ts",
37
45
  "import": "./dist/api.js",
38
- "types": "./dist/api.d.ts"
46
+ "default": "./dist/api.js"
39
47
  },
40
48
  "./diagnostics": {
49
+ "types": "./dist/api.d.ts",
41
50
  "import": "./dist/api.js",
42
- "types": "./dist/api.d.ts"
51
+ "default": "./dist/api.js"
43
52
  },
44
53
  "./plugins": {
54
+ "types": "./dist/api.d.ts",
45
55
  "import": "./dist/api.js",
46
- "types": "./dist/api.d.ts"
56
+ "default": "./dist/api.js"
47
57
  },
48
58
  "./fingerprint": {
59
+ "types": "./dist/api.d.ts",
49
60
  "import": "./dist/api.js",
50
- "types": "./dist/api.d.ts"
61
+ "default": "./dist/api.js"
51
62
  },
52
63
  "./comparison": {
64
+ "types": "./dist/api.d.ts",
53
65
  "import": "./dist/api.js",
54
- "types": "./dist/api.d.ts"
66
+ "default": "./dist/api.js"
55
67
  }
56
68
  },
57
69
  "files": [
@@ -133,6 +145,7 @@
133
145
  "zod": "^4.3.6"
134
146
  },
135
147
  "devDependencies": {
148
+ "@anthropic-ai/sdk": "^0.79.0",
136
149
  "@eslint/js": "^10.0.1",
137
150
  "@types/node": "^25.3.0",
138
151
  "@typescript-eslint/eslint-plugin": "^8.56.1",
package/server.json CHANGED
@@ -7,12 +7,12 @@
7
7
  "url": "https://github.com/kevinrabun/judges",
8
8
  "source": "github"
9
9
  },
10
- "version": "3.115.4",
10
+ "version": "3.117.0",
11
11
  "packages": [
12
12
  {
13
13
  "registryType": "npm",
14
14
  "identifier": "@kevinrabun/judges",
15
- "version": "3.115.4",
15
+ "version": "3.117.0",
16
16
  "transport": {
17
17
  "type": "stdio"
18
18
  }