npm - @kevinrabun/judges - Versions diffs - 3.115.4 → 3.117.0 - Mend

@kevinrabun/judges 3.115.4 → 3.117.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (114) hide show

package/agents/accessibility.judge.md +7 -0
package/agents/agent-instructions.judge.md +7 -0
package/agents/ai-code-safety.judge.md +7 -0
package/agents/api-contract.judge.md +7 -0
package/agents/api-design.judge.md +7 -0
package/agents/authentication.judge.md +7 -0
package/agents/backwards-compatibility.judge.md +7 -0
package/agents/caching.judge.md +7 -0
package/agents/ci-cd.judge.md +7 -0
package/agents/cloud-readiness.judge.md +7 -0
package/agents/concurrency.judge.md +7 -0
package/agents/configuration-management.judge.md +7 -0
package/agents/cybersecurity.judge.md +7 -0
package/agents/data-security.judge.md +7 -0
package/agents/dependency-health.judge.md +7 -0
package/agents/documentation.judge.md +7 -0
package/agents/error-handling.judge.md +7 -0
package/agents/ethics-bias.judge.md +7 -0
package/agents/false-positive-review.judge.md +12 -0
package/agents/framework-safety.judge.md +7 -0
package/agents/hallucination-detection.judge.md +13 -0
package/agents/iac-security.judge.md +7 -0
package/agents/intent-alignment.judge.md +13 -0
package/agents/logging-privacy.judge.md +7 -0
package/agents/maintainability.judge.md +7 -0
package/agents/multi-turn-coherence.judge.md +7 -0
package/agents/observability.judge.md +7 -0
package/agents/portability.judge.md +7 -0
package/agents/rate-limiting.judge.md +7 -0
package/agents/reliability.judge.md +7 -0
package/agents/security.judge.md +13 -0
package/agents/testing.judge.md +7 -0
package/agents/ux.judge.md +7 -0
package/dist/a2a-protocol.d.ts +136 -0
package/dist/a2a-protocol.js +218 -0
package/dist/api.d.ts +21 -3
package/dist/api.js +21 -1
package/dist/audit-trail.d.ts +245 -0
package/dist/audit-trail.js +257 -0
package/dist/commands/benchmark-advanced.js +51 -51
package/dist/commands/benchmark-ai-agents.js +16 -16
package/dist/commands/benchmark-compliance-ethics.js +12 -12
package/dist/commands/benchmark-expanded-2.js +2 -2
package/dist/commands/benchmark-expanded.js +2 -2
package/dist/commands/benchmark-infrastructure.js +12 -12
package/dist/commands/benchmark-languages.js +11 -11
package/dist/commands/benchmark-quality-ops.js +7 -7
package/dist/commands/benchmark-security-deep.js +9 -9
package/dist/commands/benchmark.js +1 -1
package/dist/commands/llm-benchmark-optimizer.d.ts +78 -0
package/dist/commands/llm-benchmark-optimizer.js +241 -0
package/dist/commands/llm-benchmark.d.ts +4 -2
package/dist/commands/llm-benchmark.js +40 -12
package/dist/escalation.d.ts +100 -0
package/dist/escalation.js +292 -0
package/dist/evaluation-session.d.ts +74 -0
package/dist/evaluation-session.js +152 -0
package/dist/evaluators/index.d.ts +23 -1
package/dist/evaluators/index.js +192 -3
package/dist/evaluators/judge-selector.d.ts +19 -0
package/dist/evaluators/judge-selector.js +141 -0
package/dist/evaluators/recall-boost.d.ts +27 -0
package/dist/evaluators/recall-boost.js +409 -0
package/dist/feedback-loop.d.ts +62 -0
package/dist/feedback-loop.js +179 -0
package/dist/index.js +2 -0
package/dist/judges/accessibility.js +7 -0
package/dist/judges/agent-instructions.js +7 -0
package/dist/judges/ai-code-safety.js +7 -0
package/dist/judges/api-contract.js +7 -0
package/dist/judges/api-design.js +7 -0
package/dist/judges/authentication.js +7 -0
package/dist/judges/backwards-compatibility.js +7 -0
package/dist/judges/caching.js +7 -0
package/dist/judges/ci-cd.js +7 -0
package/dist/judges/cloud-readiness.js +7 -0
package/dist/judges/concurrency.js +7 -0
package/dist/judges/configuration-management.js +7 -0
package/dist/judges/cybersecurity.js +7 -0
package/dist/judges/data-security.js +7 -0
package/dist/judges/dependency-health.js +7 -0
package/dist/judges/documentation.js +7 -0
package/dist/judges/error-handling.js +7 -0
package/dist/judges/ethics-bias.js +7 -0
package/dist/judges/false-positive-review.js +13 -1
package/dist/judges/framework-safety.js +7 -0
package/dist/judges/hallucination-detection.js +14 -1
package/dist/judges/iac-security.js +7 -0
package/dist/judges/intent-alignment.js +14 -1
package/dist/judges/logging-privacy.js +7 -0
package/dist/judges/maintainability.js +7 -0
package/dist/judges/multi-turn-coherence.js +7 -0
package/dist/judges/observability.js +7 -0
package/dist/judges/portability.js +7 -0
package/dist/judges/rate-limiting.js +7 -0
package/dist/judges/reliability.js +7 -0
package/dist/judges/security.js +14 -1
package/dist/judges/testing.js +7 -0
package/dist/judges/ux.js +7 -0
package/dist/review-conversation.d.ts +87 -0
package/dist/review-conversation.js +307 -0
package/dist/sast-integration.d.ts +112 -0
package/dist/sast-integration.js +215 -0
package/dist/tools/register-evaluation.js +208 -8
package/dist/tools/register-fix.js +24 -1
package/dist/tools/register-resources.d.ts +6 -0
package/dist/tools/register-resources.js +177 -0
package/dist/tools/register-review.js +26 -1
package/dist/tools/register-workflow.js +384 -11
package/dist/tools/validation.d.ts +13 -0
package/dist/tools/validation.js +77 -0
package/dist/types.d.ts +122 -0
package/package.json +25 -12
package/server.json +2 -2

package/dist/commands/benchmark-expanded-2.js CHANGED Viewed

@@ -2258,7 +2258,7 @@ jobs:
         run: |
           echo \${{ github.event.pull_request.title }}  # Script injection
           ./deploy.sh`,
-        expectedRuleIds: [],
+        expectedRuleIds: ["CICD-001"],
         category: "ci-cd",
         difficulty: "medium",
     },
@@ -4200,7 +4200,7 @@ spec:
     await new Promise(resolve => setTimeout(resolve, 10));
   }
 }`,
-        expectedRuleIds: [],
+        expectedRuleIds: ["CONC-001"],
         category: "concurrency",
         difficulty: "hard",
     },

package/dist/commands/benchmark-expanded.js CHANGED Viewed

@@ -1644,7 +1644,7 @@ def predict_approval(model, applicant):
     features = [applicant['age'], applicant['income'], applicant['race'],
                 applicant['gender'], applicant['zip_code'], applicant['credit_score']]
     return model.predict([features])[0]`,
-        expectedRuleIds: [],
+        expectedRuleIds: ["ETHICS-001"],
         category: "ethics-bias",
         difficulty: "medium",
     },
@@ -2309,7 +2309,7 @@ form = cgi.FieldStorage()
 username = form.getfirst("username")
 module = imp.load_source("config", "/etc/app/config.py")`,
-        expectedRuleIds: [],
+        expectedRuleIds: ["FW-001"],
         category: "ai-code-safety",
         difficulty: "medium",
     },

package/dist/commands/benchmark-infrastructure.js CHANGED Viewed

@@ -259,7 +259,7 @@ const GCP_SERVICE_KEY = '{"type":"service_account","project_id":"my-project","pr
 };
 export default config;`,
-        expectedRuleIds: [],
+        expectedRuleIds: ["CFG-001"],
         category: "configuration",
         difficulty: "easy",
     },
@@ -748,7 +748,7 @@ resource "aws_s3_bucket_versioning" "sensitive_versioning" {
     status = "Enabled"
   }
 }`,
-        expectedRuleIds: [],
+        expectedRuleIds: ["IAC-001"],
         category: "iac-security",
         difficulty: "medium",
     },
@@ -841,7 +841,7 @@ resource "aws_volume_attachment" "data_attach" {
   volume_id   = aws_ebs_volume.data_volume.id
   instance_id = aws_instance.app_server.id
 }`,
-        expectedRuleIds: [],
+        expectedRuleIds: ["IAC-001"],
         category: "iac-security",
         difficulty: "easy",
     },
@@ -1259,7 +1259,7 @@ jobs:
         env:
           DB_URL: \${{ secrets.DB_URL }}
         run: npm run test:integration`,
-        expectedRuleIds: [],
+        expectedRuleIds: ["CICD-001"],
         category: "cicd",
         difficulty: "hard",
     },
@@ -1340,7 +1340,7 @@ resource "google_compute_firewall" "allow_all" {
   source_ranges = ["0.0.0.0/0"]
 }`,
-        expectedRuleIds: [],
+        expectedRuleIds: ["CLOUD-001"],
         category: "cloud",
         difficulty: "medium",
     },
@@ -1375,7 +1375,7 @@ resource "aws_db_instance" "mysql_prod" {
   deletion_protection     = false
   backup_retention_period = 1
 }`,
-        expectedRuleIds: [],
+        expectedRuleIds: ["CLOUD-001"],
         category: "cloud",
         difficulty: "hard",
     },
@@ -1406,7 +1406,7 @@ resource "aws_db_instance" "mysql_prod" {
 }
 # No NAT Gateway configured — Lambda cannot reach external APIs`,
-        expectedRuleIds: [],
+        expectedRuleIds: ["CLOUD-001"],
         category: "cloud",
         difficulty: "hard",
     },
@@ -1556,7 +1556,7 @@ resource "aws_rds_cluster" "analytics" {
 }
 # No tags on any resource — impossible to track costs per team/project`,
-        expectedRuleIds: [],
+        expectedRuleIds: ["COST-001"],
         category: "cost-effectiveness",
         difficulty: "easy",
     },
@@ -1591,7 +1591,7 @@ const serverlessConfig = {
     },
   },
 };`,
-        expectedRuleIds: [],
+        expectedRuleIds: ["COST-001"],
         category: "cost-effectiveness",
         difficulty: "medium",
     },
@@ -1661,7 +1661,7 @@ app.post("/api/orders", async (req, res) => {
 }
 // Each instance has its own rate limiter — no coordination across replicas`,
-        expectedRuleIds: [],
+        expectedRuleIds: ["SCALE-001"],
         category: "scalability",
         difficulty: "hard",
     },
@@ -1908,7 +1908,7 @@ const server = new ApolloServer({
 });
 startStandaloneServer(server, { listen: { port: 4000 } });`,
-        expectedRuleIds: [],
+        expectedRuleIds: ["RATE-001"],
         category: "rate-limiting",
         difficulty: "hard",
     },
@@ -1946,7 +1946,7 @@ wss.on("connection", (ws) => {
   ws.on("close", () => clients.delete(clientId));
 });`,
-        expectedRuleIds: [],
+        expectedRuleIds: ["RATE-001"],
         category: "rate-limiting",
         difficulty: "medium",
     },

package/dist/commands/benchmark-languages.js CHANGED Viewed

@@ -103,7 +103,7 @@ func ReadConfig(path string) (*Config, error) {
   }
   return &cfg, nil
 }`,
-        expectedRuleIds: [],
+        expectedRuleIds: ["ERR-001"],
         category: "error-handling",
         difficulty: "medium",
     },
@@ -496,7 +496,7 @@ post '/webhook' do
   response = URI.open(payload['callback_url']).read
   { status: 'delivered', response: response }.to_json
 end`,
-        expectedRuleIds: [],
+        expectedRuleIds: ["SEC-001"],
         category: "security",
         difficulty: "medium",
     },
@@ -604,7 +604,7 @@ public class SessionManager
         return stream.ToArray();
     }
 }`,
-        expectedRuleIds: [],
+        expectedRuleIds: ["SEC-001"],
         category: "security",
         difficulty: "medium",
     },
@@ -646,7 +646,7 @@ public class AdminController : ControllerBase
         return Ok(logs);
     }
 }`,
-        expectedRuleIds: [],
+        expectedRuleIds: ["AUTH-001"],
         category: "auth",
         difficulty: "medium",
     },
@@ -682,7 +682,7 @@ public class AdminController : ControllerBase
         return conn;
     }
 }`,
-        expectedRuleIds: [],
+        expectedRuleIds: ["CONC-001"],
         category: "concurrency",
         difficulty: "medium",
     },
@@ -877,7 +877,7 @@ def process_order(items, discount_code=None):
     assert all(item['qty'] > 0 for item in items), "Quantities must be positive"
     if discount_code:
         assert len(discount_code) == 8, "Invalid discount code"`,
-        expectedRuleIds: [],
+        expectedRuleIds: ["ERR-001"],
         category: "error-handling",
         difficulty: "medium",
     },
@@ -909,7 +909,7 @@ void copyData(const char* src) {
     sprintf(dest, "Data: %s (processed at %s)", src, __TIME__);
     processOutput(dest);
 }`,
-        expectedRuleIds: [],
+        expectedRuleIds: ["SEC-001"],
         category: "security",
         difficulty: "easy",
     },
@@ -945,7 +945,7 @@ public:
         }
     }
 };`,
-        expectedRuleIds: [],
+        expectedRuleIds: ["SEC-001"],
         category: "security",
         difficulty: "hard",
     },
@@ -1697,7 +1697,7 @@ class ApiClient {
     let coords = address["coordinates"] as! [Double]
     return UserProfile(name: name, age: age, city: city, lat: coords[0], lon: coords[1])
 }`,
-        expectedRuleIds: [],
+        expectedRuleIds: ["ERR-001"],
         category: "error-handling",
         difficulty: "easy",
     },
@@ -1739,7 +1739,7 @@ server <- function(input, output, session) {
     end
   end
 end`,
-        expectedRuleIds: [],
+        expectedRuleIds: ["SEC-001"],
         category: "security",
         difficulty: "hard",
     },
@@ -1762,7 +1762,7 @@ function executeUserCode(code)
     local fn = loadstring(code)
     fn()
 end`,
-        expectedRuleIds: [],
+        expectedRuleIds: ["SEC-001"],
         category: "security",
         difficulty: "medium",
     },

package/dist/commands/benchmark-quality-ops.js CHANGED Viewed

@@ -223,7 +223,7 @@ function connectDatabase(url: string) {
     process.exit(1);
   }
 }`,
-        expectedRuleIds: [],
+        expectedRuleIds: ["ERR-001"],
         category: "error-handling",
         difficulty: "medium",
     },
@@ -647,7 +647,7 @@ async function cleanup(userId: string) {
   clearUserCache(userId);
   revokeTokens(userId);
 }`,
-        expectedRuleIds: [],
+        expectedRuleIds: ["CONC-001"],
         category: "concurrency",
         difficulty: "medium",
     },
@@ -1964,7 +1964,7 @@ async function deleteUser(userId: string) {
     expect(mockDb.create).toHaveBeenCalled();
   });
 });`,
-        expectedRuleIds: [],
+        expectedRuleIds: ["TEST-001"],
         category: "testing",
         difficulty: "hard",
     },
@@ -2058,7 +2058,7 @@ public class OrderController {
         return ResponseEntity.ok(order);
     }
 }`,
-        expectedRuleIds: [],
+        expectedRuleIds: ["OBS-001"],
         category: "observability",
         difficulty: "medium",
     },
@@ -2155,7 +2155,7 @@ export function processOrders() {
 # Set MYLIB_HOST (removed in v4, now uses MYLIB_URL)
 # Set MYLIB_PORT (no longer needed)
 # Set MYLIB_SSL=true (now always enabled)`,
-        expectedRuleIds: [],
+        expectedRuleIds: ["DOC-001"],
         category: "documentation",
         difficulty: "medium",
     },
@@ -2177,7 +2177,7 @@ jobs:
       - run: npm run build
       - run: aws s3 sync build/ s3://prod-bucket/
       - run: aws cloudfront create-invalidation --distribution-id EXAMPLE --paths '/*'`,
-        expectedRuleIds: [],
+        expectedRuleIds: ["CICD-001"],
         category: "ci-cd",
         difficulty: "easy",
     },
@@ -2198,7 +2198,7 @@ jobs:
     steps:
       - uses: actions/checkout@v4
       - run: npm run deploy`,
-        expectedRuleIds: [],
+        expectedRuleIds: ["CICD-001"],
         category: "ci-cd",
         difficulty: "easy",
     },

package/dist/commands/benchmark-security-deep.js CHANGED Viewed

@@ -73,7 +73,7 @@ public class FetchServlet extends HttpServlet {
         }
     }
 }`,
-        expectedRuleIds: [],
+        expectedRuleIds: ["SEC-001"],
         category: "security",
         difficulty: "medium",
     },
@@ -166,7 +166,7 @@ app.delete("/items", async (req, res) => {
   const result = await db.collection("items").deleteMany(filter);
   res.json({ deleted: result.deletedCount });
 });`,
-        expectedRuleIds: [],
+        expectedRuleIds: ["SEC-001"],
         category: "injection",
         difficulty: "medium",
     },
@@ -325,7 +325,7 @@ public class ImportController : ControllerBase
         return Ok(obj.ToString());
     }
 }`,
-        expectedRuleIds: [],
+        expectedRuleIds: ["SEC-001"],
         category: "security",
         difficulty: "medium",
     },
@@ -346,7 +346,7 @@ def parse_xml():
     tree = ET.parse(request.stream)
     root = tree.getroot()
     return root.tag`,
-        expectedRuleIds: [],
+        expectedRuleIds: ["SEC-001"],
         category: "security",
         difficulty: "medium",
     },
@@ -491,7 +491,7 @@ CORS(app, origins="*", supports_credentials=True)
 @app.route('/api/profile')
 def profile():
     return {"email": "user@example.com"}`,
-        expectedRuleIds: [],
+        expectedRuleIds: ["SEC-001"],
         category: "security",
         difficulty: "easy",
     },
@@ -527,7 +527,7 @@ function encrypt(data: string): string {
   encrypted += cipher.final("hex");
   return encrypted;
 }`,
-        expectedRuleIds: [],
+        expectedRuleIds: ["SEC-001"],
         category: "security",
         difficulty: "hard",
     },
@@ -922,7 +922,7 @@ func searchHandler(w http.ResponseWriter, r *http.Request) {
     render json: @products
   end
 end`,
-        expectedRuleIds: [],
+        expectedRuleIds: ["SEC-001"],
         category: "injection",
         difficulty: "easy",
     },
@@ -948,7 +948,7 @@ def ping():
         text=True
     )
     return result.stdout`,
-        expectedRuleIds: [],
+        expectedRuleIds: ["SEC-001"],
         category: "injection",
         difficulty: "easy",
     },
@@ -1067,7 +1067,7 @@ func greetHandler(w http.ResponseWriter, r *http.Request) {
   name := r.URL.Query().Get("name")
   fmt.Fprintf(w, "<h1>Hello %s</h1>", name)
 }`,
-        expectedRuleIds: [],
+        expectedRuleIds: ["SEC-001"],
         category: "xss",
         difficulty: "easy",
     },

package/dist/commands/benchmark.js CHANGED Viewed

@@ -748,7 +748,7 @@ async function processImage(imageBuffer: Buffer) {
 // Connection pool with excessive connections
 const pool = new Pool({ host: "db.server.com", max: 500, idleTimeoutMillis: 0 });`,
-        expectedRuleIds: [],
+        expectedRuleIds: ["COST-001"],
         category: "cost-effectiveness",
         difficulty: "medium",
     },

package/dist/commands/llm-benchmark-optimizer.d.ts ADDED Viewed

@@ -0,0 +1,78 @@
+/**
+ * LLM Benchmark Optimizer — Self-Teaching Feedback Loop
+ *
+ * Analyzes benchmark snapshots to identify systematic weaknesses
+ * (high-FP judges, problematic categories, difficulty gaps) and
+ * generates targeted prompt amendments that are applied on the
+ * next benchmark run to improve precision without sacrificing recall.
+ *
+ * Closed loop: run → analyze → amend prompts → run → better scores
+ */
+import type { LlmBenchmarkSnapshot } from "./llm-benchmark.js";
+export interface PromptAmendment {
+    /** Judge rule prefix this amendment targets */
+    judgePrefix: string;
+    /** The amendment text to inject into prompts */
+    amendment: string;
+    /** Why this amendment was generated */
+    reason: string;
+    /** Historical FP rate that triggered this */
+    fpRate: number;
+    /** Benchmark run that generated this */
+    generatedFrom: string;
+    /** ISO timestamp */
+    timestamp: string;
+}
+export interface OptimizerInsight {
+    category: "high-fp-judge" | "missed-category" | "clean-case-leak" | "difficulty-gap";
+    severity: "critical" | "high" | "medium";
+    /** Target identifier (judge prefix, category name, difficulty) */
+    target: string;
+    /** The metric value (FP rate, F1, detection rate) */
+    metric: number;
+    /** Human-readable recommendation */
+    recommendation: string;
+}
+export interface OptimizationResult {
+    amendments: PromptAmendment[];
+    insights: OptimizerInsight[];
+    /** Estimated F1 improvement from applying amendments */
+    projectedF1Improvement: number;
+    /** Summary stats */
+    summary: {
+        worstJudges: string[];
+        worstCategories: string[];
+        amendmentsGenerated: number;
+        currentF1: number;
+        projectedF1: number;
+    };
+}
+export interface AmendmentStore {
+    /** Schema version */
+    version: 1;
+    /** Active amendments to apply on next run */
+    amendments: PromptAmendment[];
+    /** History of past optimizations */
+    history: Array<{
+        timestamp: string;
+        snapshotF1: number;
+        amendmentsApplied: number;
+        amendmentsGenerated: number;
+    }>;
+}
+/**
+ * Analyze a benchmark snapshot and produce optimization results.
+ * This is the main self-teaching entry point.
+ */
+export declare function optimizeBenchmark(snapshot: LlmBenchmarkSnapshot, existingAmendments?: PromptAmendment[]): OptimizationResult;
+/**
+ * Format amendments as a prompt section to inject into tribunal/per-judge prompts.
+ * Returns empty string if no amendments.
+ */
+export declare function formatAmendmentSection(amendments: PromptAmendment[]): string;
+export declare function createEmptyStore(): AmendmentStore;
+/**
+ * Merge new amendments into existing store.
+ * Newer amendments for the same prefix replace older ones.
+ */
+export declare function mergeAmendments(store: AmendmentStore, result: OptimizationResult, snapshotF1: number): AmendmentStore;