@kevinrabun/judges 3.115.4 → 3.117.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114)
  1. package/agents/accessibility.judge.md +7 -0
  2. package/agents/agent-instructions.judge.md +7 -0
  3. package/agents/ai-code-safety.judge.md +7 -0
  4. package/agents/api-contract.judge.md +7 -0
  5. package/agents/api-design.judge.md +7 -0
  6. package/agents/authentication.judge.md +7 -0
  7. package/agents/backwards-compatibility.judge.md +7 -0
  8. package/agents/caching.judge.md +7 -0
  9. package/agents/ci-cd.judge.md +7 -0
  10. package/agents/cloud-readiness.judge.md +7 -0
  11. package/agents/concurrency.judge.md +7 -0
  12. package/agents/configuration-management.judge.md +7 -0
  13. package/agents/cybersecurity.judge.md +7 -0
  14. package/agents/data-security.judge.md +7 -0
  15. package/agents/dependency-health.judge.md +7 -0
  16. package/agents/documentation.judge.md +7 -0
  17. package/agents/error-handling.judge.md +7 -0
  18. package/agents/ethics-bias.judge.md +7 -0
  19. package/agents/false-positive-review.judge.md +12 -0
  20. package/agents/framework-safety.judge.md +7 -0
  21. package/agents/hallucination-detection.judge.md +13 -0
  22. package/agents/iac-security.judge.md +7 -0
  23. package/agents/intent-alignment.judge.md +13 -0
  24. package/agents/logging-privacy.judge.md +7 -0
  25. package/agents/maintainability.judge.md +7 -0
  26. package/agents/multi-turn-coherence.judge.md +7 -0
  27. package/agents/observability.judge.md +7 -0
  28. package/agents/portability.judge.md +7 -0
  29. package/agents/rate-limiting.judge.md +7 -0
  30. package/agents/reliability.judge.md +7 -0
  31. package/agents/security.judge.md +13 -0
  32. package/agents/testing.judge.md +7 -0
  33. package/agents/ux.judge.md +7 -0
  34. package/dist/a2a-protocol.d.ts +136 -0
  35. package/dist/a2a-protocol.js +218 -0
  36. package/dist/api.d.ts +21 -3
  37. package/dist/api.js +21 -1
  38. package/dist/audit-trail.d.ts +245 -0
  39. package/dist/audit-trail.js +257 -0
  40. package/dist/commands/benchmark-advanced.js +51 -51
  41. package/dist/commands/benchmark-ai-agents.js +16 -16
  42. package/dist/commands/benchmark-compliance-ethics.js +12 -12
  43. package/dist/commands/benchmark-expanded-2.js +2 -2
  44. package/dist/commands/benchmark-expanded.js +2 -2
  45. package/dist/commands/benchmark-infrastructure.js +12 -12
  46. package/dist/commands/benchmark-languages.js +11 -11
  47. package/dist/commands/benchmark-quality-ops.js +7 -7
  48. package/dist/commands/benchmark-security-deep.js +9 -9
  49. package/dist/commands/benchmark.js +1 -1
  50. package/dist/commands/llm-benchmark-optimizer.d.ts +78 -0
  51. package/dist/commands/llm-benchmark-optimizer.js +241 -0
  52. package/dist/commands/llm-benchmark.d.ts +4 -2
  53. package/dist/commands/llm-benchmark.js +40 -12
  54. package/dist/escalation.d.ts +100 -0
  55. package/dist/escalation.js +292 -0
  56. package/dist/evaluation-session.d.ts +74 -0
  57. package/dist/evaluation-session.js +152 -0
  58. package/dist/evaluators/index.d.ts +23 -1
  59. package/dist/evaluators/index.js +192 -3
  60. package/dist/evaluators/judge-selector.d.ts +19 -0
  61. package/dist/evaluators/judge-selector.js +141 -0
  62. package/dist/evaluators/recall-boost.d.ts +27 -0
  63. package/dist/evaluators/recall-boost.js +409 -0
  64. package/dist/feedback-loop.d.ts +62 -0
  65. package/dist/feedback-loop.js +179 -0
  66. package/dist/index.js +2 -0
  67. package/dist/judges/accessibility.js +7 -0
  68. package/dist/judges/agent-instructions.js +7 -0
  69. package/dist/judges/ai-code-safety.js +7 -0
  70. package/dist/judges/api-contract.js +7 -0
  71. package/dist/judges/api-design.js +7 -0
  72. package/dist/judges/authentication.js +7 -0
  73. package/dist/judges/backwards-compatibility.js +7 -0
  74. package/dist/judges/caching.js +7 -0
  75. package/dist/judges/ci-cd.js +7 -0
  76. package/dist/judges/cloud-readiness.js +7 -0
  77. package/dist/judges/concurrency.js +7 -0
  78. package/dist/judges/configuration-management.js +7 -0
  79. package/dist/judges/cybersecurity.js +7 -0
  80. package/dist/judges/data-security.js +7 -0
  81. package/dist/judges/dependency-health.js +7 -0
  82. package/dist/judges/documentation.js +7 -0
  83. package/dist/judges/error-handling.js +7 -0
  84. package/dist/judges/ethics-bias.js +7 -0
  85. package/dist/judges/false-positive-review.js +13 -1
  86. package/dist/judges/framework-safety.js +7 -0
  87. package/dist/judges/hallucination-detection.js +14 -1
  88. package/dist/judges/iac-security.js +7 -0
  89. package/dist/judges/intent-alignment.js +14 -1
  90. package/dist/judges/logging-privacy.js +7 -0
  91. package/dist/judges/maintainability.js +7 -0
  92. package/dist/judges/multi-turn-coherence.js +7 -0
  93. package/dist/judges/observability.js +7 -0
  94. package/dist/judges/portability.js +7 -0
  95. package/dist/judges/rate-limiting.js +7 -0
  96. package/dist/judges/reliability.js +7 -0
  97. package/dist/judges/security.js +14 -1
  98. package/dist/judges/testing.js +7 -0
  99. package/dist/judges/ux.js +7 -0
  100. package/dist/review-conversation.d.ts +87 -0
  101. package/dist/review-conversation.js +307 -0
  102. package/dist/sast-integration.d.ts +112 -0
  103. package/dist/sast-integration.js +215 -0
  104. package/dist/tools/register-evaluation.js +208 -8
  105. package/dist/tools/register-fix.js +24 -1
  106. package/dist/tools/register-resources.d.ts +6 -0
  107. package/dist/tools/register-resources.js +177 -0
  108. package/dist/tools/register-review.js +26 -1
  109. package/dist/tools/register-workflow.js +384 -11
  110. package/dist/tools/validation.d.ts +13 -0
  111. package/dist/tools/validation.js +77 -0
  112. package/dist/types.d.ts +122 -0
  113. package/package.json +25 -12
  114. package/server.json +2 -2
@@ -2258,7 +2258,7 @@ jobs:
2258
2258
  run: |
2259
2259
  echo \${{ github.event.pull_request.title }} # Script injection
2260
2260
  ./deploy.sh`,
2261
- expectedRuleIds: [],
2261
+ expectedRuleIds: ["CICD-001"],
2262
2262
  category: "ci-cd",
2263
2263
  difficulty: "medium",
2264
2264
  },
@@ -4200,7 +4200,7 @@ spec:
4200
4200
  await new Promise(resolve => setTimeout(resolve, 10));
4201
4201
  }
4202
4202
  }`,
4203
- expectedRuleIds: [],
4203
+ expectedRuleIds: ["CONC-001"],
4204
4204
  category: "concurrency",
4205
4205
  difficulty: "hard",
4206
4206
  },
@@ -1644,7 +1644,7 @@ def predict_approval(model, applicant):
1644
1644
  features = [applicant['age'], applicant['income'], applicant['race'],
1645
1645
  applicant['gender'], applicant['zip_code'], applicant['credit_score']]
1646
1646
  return model.predict([features])[0]`,
1647
- expectedRuleIds: [],
1647
+ expectedRuleIds: ["ETHICS-001"],
1648
1648
  category: "ethics-bias",
1649
1649
  difficulty: "medium",
1650
1650
  },
@@ -2309,7 +2309,7 @@ form = cgi.FieldStorage()
2309
2309
  username = form.getfirst("username")
2310
2310
 
2311
2311
  module = imp.load_source("config", "/etc/app/config.py")`,
2312
- expectedRuleIds: [],
2312
+ expectedRuleIds: ["FW-001"],
2313
2313
  category: "ai-code-safety",
2314
2314
  difficulty: "medium",
2315
2315
  },
@@ -259,7 +259,7 @@ const GCP_SERVICE_KEY = '{"type":"service_account","project_id":"my-project","pr
259
259
  };
260
260
 
261
261
  export default config;`,
262
- expectedRuleIds: [],
262
+ expectedRuleIds: ["CFG-001"],
263
263
  category: "configuration",
264
264
  difficulty: "easy",
265
265
  },
@@ -748,7 +748,7 @@ resource "aws_s3_bucket_versioning" "sensitive_versioning" {
748
748
  status = "Enabled"
749
749
  }
750
750
  }`,
751
- expectedRuleIds: [],
751
+ expectedRuleIds: ["IAC-001"],
752
752
  category: "iac-security",
753
753
  difficulty: "medium",
754
754
  },
@@ -841,7 +841,7 @@ resource "aws_volume_attachment" "data_attach" {
841
841
  volume_id = aws_ebs_volume.data_volume.id
842
842
  instance_id = aws_instance.app_server.id
843
843
  }`,
844
- expectedRuleIds: [],
844
+ expectedRuleIds: ["IAC-001"],
845
845
  category: "iac-security",
846
846
  difficulty: "easy",
847
847
  },
@@ -1259,7 +1259,7 @@ jobs:
1259
1259
  env:
1260
1260
  DB_URL: \${{ secrets.DB_URL }}
1261
1261
  run: npm run test:integration`,
1262
- expectedRuleIds: [],
1262
+ expectedRuleIds: ["CICD-001"],
1263
1263
  category: "cicd",
1264
1264
  difficulty: "hard",
1265
1265
  },
@@ -1340,7 +1340,7 @@ resource "google_compute_firewall" "allow_all" {
1340
1340
 
1341
1341
  source_ranges = ["0.0.0.0/0"]
1342
1342
  }`,
1343
- expectedRuleIds: [],
1343
+ expectedRuleIds: ["CLOUD-001"],
1344
1344
  category: "cloud",
1345
1345
  difficulty: "medium",
1346
1346
  },
@@ -1375,7 +1375,7 @@ resource "aws_db_instance" "mysql_prod" {
1375
1375
  deletion_protection = false
1376
1376
  backup_retention_period = 1
1377
1377
  }`,
1378
- expectedRuleIds: [],
1378
+ expectedRuleIds: ["CLOUD-001"],
1379
1379
  category: "cloud",
1380
1380
  difficulty: "hard",
1381
1381
  },
@@ -1406,7 +1406,7 @@ resource "aws_db_instance" "mysql_prod" {
1406
1406
  }
1407
1407
 
1408
1408
  # No NAT Gateway configured — Lambda cannot reach external APIs`,
1409
- expectedRuleIds: [],
1409
+ expectedRuleIds: ["CLOUD-001"],
1410
1410
  category: "cloud",
1411
1411
  difficulty: "hard",
1412
1412
  },
@@ -1556,7 +1556,7 @@ resource "aws_rds_cluster" "analytics" {
1556
1556
  }
1557
1557
 
1558
1558
  # No tags on any resource — impossible to track costs per team/project`,
1559
- expectedRuleIds: [],
1559
+ expectedRuleIds: ["COST-001"],
1560
1560
  category: "cost-effectiveness",
1561
1561
  difficulty: "easy",
1562
1562
  },
@@ -1591,7 +1591,7 @@ const serverlessConfig = {
1591
1591
  },
1592
1592
  },
1593
1593
  };`,
1594
- expectedRuleIds: [],
1594
+ expectedRuleIds: ["COST-001"],
1595
1595
  category: "cost-effectiveness",
1596
1596
  difficulty: "medium",
1597
1597
  },
@@ -1661,7 +1661,7 @@ app.post("/api/orders", async (req, res) => {
1661
1661
  }
1662
1662
 
1663
1663
  // Each instance has its own rate limiter — no coordination across replicas`,
1664
- expectedRuleIds: [],
1664
+ expectedRuleIds: ["SCALE-001"],
1665
1665
  category: "scalability",
1666
1666
  difficulty: "hard",
1667
1667
  },
@@ -1908,7 +1908,7 @@ const server = new ApolloServer({
1908
1908
  });
1909
1909
 
1910
1910
  startStandaloneServer(server, { listen: { port: 4000 } });`,
1911
- expectedRuleIds: [],
1911
+ expectedRuleIds: ["RATE-001"],
1912
1912
  category: "rate-limiting",
1913
1913
  difficulty: "hard",
1914
1914
  },
@@ -1946,7 +1946,7 @@ wss.on("connection", (ws) => {
1946
1946
 
1947
1947
  ws.on("close", () => clients.delete(clientId));
1948
1948
  });`,
1949
- expectedRuleIds: [],
1949
+ expectedRuleIds: ["RATE-001"],
1950
1950
  category: "rate-limiting",
1951
1951
  difficulty: "medium",
1952
1952
  },
@@ -103,7 +103,7 @@ func ReadConfig(path string) (*Config, error) {
103
103
  }
104
104
  return &cfg, nil
105
105
  }`,
106
- expectedRuleIds: [],
106
+ expectedRuleIds: ["ERR-001"],
107
107
  category: "error-handling",
108
108
  difficulty: "medium",
109
109
  },
@@ -496,7 +496,7 @@ post '/webhook' do
496
496
  response = URI.open(payload['callback_url']).read
497
497
  { status: 'delivered', response: response }.to_json
498
498
  end`,
499
- expectedRuleIds: [],
499
+ expectedRuleIds: ["SEC-001"],
500
500
  category: "security",
501
501
  difficulty: "medium",
502
502
  },
@@ -604,7 +604,7 @@ public class SessionManager
604
604
  return stream.ToArray();
605
605
  }
606
606
  }`,
607
- expectedRuleIds: [],
607
+ expectedRuleIds: ["SEC-001"],
608
608
  category: "security",
609
609
  difficulty: "medium",
610
610
  },
@@ -646,7 +646,7 @@ public class AdminController : ControllerBase
646
646
  return Ok(logs);
647
647
  }
648
648
  }`,
649
- expectedRuleIds: [],
649
+ expectedRuleIds: ["AUTH-001"],
650
650
  category: "auth",
651
651
  difficulty: "medium",
652
652
  },
@@ -682,7 +682,7 @@ public class AdminController : ControllerBase
682
682
  return conn;
683
683
  }
684
684
  }`,
685
- expectedRuleIds: [],
685
+ expectedRuleIds: ["CONC-001"],
686
686
  category: "concurrency",
687
687
  difficulty: "medium",
688
688
  },
@@ -877,7 +877,7 @@ def process_order(items, discount_code=None):
877
877
  assert all(item['qty'] > 0 for item in items), "Quantities must be positive"
878
878
  if discount_code:
879
879
  assert len(discount_code) == 8, "Invalid discount code"`,
880
- expectedRuleIds: [],
880
+ expectedRuleIds: ["ERR-001"],
881
881
  category: "error-handling",
882
882
  difficulty: "medium",
883
883
  },
@@ -909,7 +909,7 @@ void copyData(const char* src) {
909
909
  sprintf(dest, "Data: %s (processed at %s)", src, __TIME__);
910
910
  processOutput(dest);
911
911
  }`,
912
- expectedRuleIds: [],
912
+ expectedRuleIds: ["SEC-001"],
913
913
  category: "security",
914
914
  difficulty: "easy",
915
915
  },
@@ -945,7 +945,7 @@ public:
945
945
  }
946
946
  }
947
947
  };`,
948
- expectedRuleIds: [],
948
+ expectedRuleIds: ["SEC-001"],
949
949
  category: "security",
950
950
  difficulty: "hard",
951
951
  },
@@ -1697,7 +1697,7 @@ class ApiClient {
1697
1697
  let coords = address["coordinates"] as! [Double]
1698
1698
  return UserProfile(name: name, age: age, city: city, lat: coords[0], lon: coords[1])
1699
1699
  }`,
1700
- expectedRuleIds: [],
1700
+ expectedRuleIds: ["ERR-001"],
1701
1701
  category: "error-handling",
1702
1702
  difficulty: "easy",
1703
1703
  },
@@ -1739,7 +1739,7 @@ server <- function(input, output, session) {
1739
1739
  end
1740
1740
  end
1741
1741
  end`,
1742
- expectedRuleIds: [],
1742
+ expectedRuleIds: ["SEC-001"],
1743
1743
  category: "security",
1744
1744
  difficulty: "hard",
1745
1745
  },
@@ -1762,7 +1762,7 @@ function executeUserCode(code)
1762
1762
  local fn = loadstring(code)
1763
1763
  fn()
1764
1764
  end`,
1765
- expectedRuleIds: [],
1765
+ expectedRuleIds: ["SEC-001"],
1766
1766
  category: "security",
1767
1767
  difficulty: "medium",
1768
1768
  },
@@ -223,7 +223,7 @@ function connectDatabase(url: string) {
223
223
  process.exit(1);
224
224
  }
225
225
  }`,
226
- expectedRuleIds: [],
226
+ expectedRuleIds: ["ERR-001"],
227
227
  category: "error-handling",
228
228
  difficulty: "medium",
229
229
  },
@@ -647,7 +647,7 @@ async function cleanup(userId: string) {
647
647
  clearUserCache(userId);
648
648
  revokeTokens(userId);
649
649
  }`,
650
- expectedRuleIds: [],
650
+ expectedRuleIds: ["CONC-001"],
651
651
  category: "concurrency",
652
652
  difficulty: "medium",
653
653
  },
@@ -1964,7 +1964,7 @@ async function deleteUser(userId: string) {
1964
1964
  expect(mockDb.create).toHaveBeenCalled();
1965
1965
  });
1966
1966
  });`,
1967
- expectedRuleIds: [],
1967
+ expectedRuleIds: ["TEST-001"],
1968
1968
  category: "testing",
1969
1969
  difficulty: "hard",
1970
1970
  },
@@ -2058,7 +2058,7 @@ public class OrderController {
2058
2058
  return ResponseEntity.ok(order);
2059
2059
  }
2060
2060
  }`,
2061
- expectedRuleIds: [],
2061
+ expectedRuleIds: ["OBS-001"],
2062
2062
  category: "observability",
2063
2063
  difficulty: "medium",
2064
2064
  },
@@ -2155,7 +2155,7 @@ export function processOrders() {
2155
2155
  # Set MYLIB_HOST (removed in v4, now uses MYLIB_URL)
2156
2156
  # Set MYLIB_PORT (no longer needed)
2157
2157
  # Set MYLIB_SSL=true (now always enabled)`,
2158
- expectedRuleIds: [],
2158
+ expectedRuleIds: ["DOC-001"],
2159
2159
  category: "documentation",
2160
2160
  difficulty: "medium",
2161
2161
  },
@@ -2177,7 +2177,7 @@ jobs:
2177
2177
  - run: npm run build
2178
2178
  - run: aws s3 sync build/ s3://prod-bucket/
2179
2179
  - run: aws cloudfront create-invalidation --distribution-id EXAMPLE --paths '/*'`,
2180
- expectedRuleIds: [],
2180
+ expectedRuleIds: ["CICD-001"],
2181
2181
  category: "ci-cd",
2182
2182
  difficulty: "easy",
2183
2183
  },
@@ -2198,7 +2198,7 @@ jobs:
2198
2198
  steps:
2199
2199
  - uses: actions/checkout@v4
2200
2200
  - run: npm run deploy`,
2201
- expectedRuleIds: [],
2201
+ expectedRuleIds: ["CICD-001"],
2202
2202
  category: "ci-cd",
2203
2203
  difficulty: "easy",
2204
2204
  },
@@ -73,7 +73,7 @@ public class FetchServlet extends HttpServlet {
73
73
  }
74
74
  }
75
75
  }`,
76
- expectedRuleIds: [],
76
+ expectedRuleIds: ["SEC-001"],
77
77
  category: "security",
78
78
  difficulty: "medium",
79
79
  },
@@ -166,7 +166,7 @@ app.delete("/items", async (req, res) => {
166
166
  const result = await db.collection("items").deleteMany(filter);
167
167
  res.json({ deleted: result.deletedCount });
168
168
  });`,
169
- expectedRuleIds: [],
169
+ expectedRuleIds: ["SEC-001"],
170
170
  category: "injection",
171
171
  difficulty: "medium",
172
172
  },
@@ -325,7 +325,7 @@ public class ImportController : ControllerBase
325
325
  return Ok(obj.ToString());
326
326
  }
327
327
  }`,
328
- expectedRuleIds: [],
328
+ expectedRuleIds: ["SEC-001"],
329
329
  category: "security",
330
330
  difficulty: "medium",
331
331
  },
@@ -346,7 +346,7 @@ def parse_xml():
346
346
  tree = ET.parse(request.stream)
347
347
  root = tree.getroot()
348
348
  return root.tag`,
349
- expectedRuleIds: [],
349
+ expectedRuleIds: ["SEC-001"],
350
350
  category: "security",
351
351
  difficulty: "medium",
352
352
  },
@@ -491,7 +491,7 @@ CORS(app, origins="*", supports_credentials=True)
491
491
  @app.route('/api/profile')
492
492
  def profile():
493
493
  return {"email": "user@example.com"}`,
494
- expectedRuleIds: [],
494
+ expectedRuleIds: ["SEC-001"],
495
495
  category: "security",
496
496
  difficulty: "easy",
497
497
  },
@@ -527,7 +527,7 @@ function encrypt(data: string): string {
527
527
  encrypted += cipher.final("hex");
528
528
  return encrypted;
529
529
  }`,
530
- expectedRuleIds: [],
530
+ expectedRuleIds: ["SEC-001"],
531
531
  category: "security",
532
532
  difficulty: "hard",
533
533
  },
@@ -922,7 +922,7 @@ func searchHandler(w http.ResponseWriter, r *http.Request) {
922
922
  render json: @products
923
923
  end
924
924
  end`,
925
- expectedRuleIds: [],
925
+ expectedRuleIds: ["SEC-001"],
926
926
  category: "injection",
927
927
  difficulty: "easy",
928
928
  },
@@ -948,7 +948,7 @@ def ping():
948
948
  text=True
949
949
  )
950
950
  return result.stdout`,
951
- expectedRuleIds: [],
951
+ expectedRuleIds: ["SEC-001"],
952
952
  category: "injection",
953
953
  difficulty: "easy",
954
954
  },
@@ -1067,7 +1067,7 @@ func greetHandler(w http.ResponseWriter, r *http.Request) {
1067
1067
  name := r.URL.Query().Get("name")
1068
1068
  fmt.Fprintf(w, "<h1>Hello %s</h1>", name)
1069
1069
  }`,
1070
- expectedRuleIds: [],
1070
+ expectedRuleIds: ["SEC-001"],
1071
1071
  category: "xss",
1072
1072
  difficulty: "easy",
1073
1073
  },
@@ -748,7 +748,7 @@ async function processImage(imageBuffer: Buffer) {
748
748
 
749
749
  // Connection pool with excessive connections
750
750
  const pool = new Pool({ host: "db.server.com", max: 500, idleTimeoutMillis: 0 });`,
751
- expectedRuleIds: [],
751
+ expectedRuleIds: ["COST-001"],
752
752
  category: "cost-effectiveness",
753
753
  difficulty: "medium",
754
754
  },
@@ -0,0 +1,78 @@
1
+ /**
2
+ * LLM Benchmark Optimizer — Self-Teaching Feedback Loop
3
+ *
4
+ * Analyzes benchmark snapshots to identify systematic weaknesses
5
+ * (high-FP judges, problematic categories, difficulty gaps) and
6
+ * generates targeted prompt amendments that are applied on the
7
+ * next benchmark run to improve precision without sacrificing recall.
8
+ *
9
+ * Closed loop: run → analyze → amend prompts → run → better scores
10
+ */
11
+ import type { LlmBenchmarkSnapshot } from "./llm-benchmark.js";
12
+ export interface PromptAmendment {
13
+ /** Judge rule prefix this amendment targets (presumably the leading part of a rule ID such as `SEC-001` — confirm) */
14
+ judgePrefix: string;
15
+ /** The amendment text to inject into prompts */
16
+ amendment: string;
17
+ /** Why this amendment was generated */
18
+ reason: string;
19
+ /** Historical false-positive (FP) rate that triggered this amendment */
20
+ fpRate: number;
21
+ /** Benchmark run that generated this amendment (identifier format is not visible in this declaration — confirm) */
22
+ generatedFrom: string;
23
+ /** ISO timestamp string (presumably the time the amendment was generated — confirm against the implementation) */
24
+ timestamp: string;
25
+ }
26
+ export interface OptimizerInsight {
27
+ category: "high-fp-judge" | "missed-category" | "clean-case-leak" | "difficulty-gap"; // kind of weakness this insight reports
28
+ severity: "critical" | "high" | "medium"; // one of three levels; no "low" level is declared
29
+ /** Identifier of what the insight is about: a judge prefix, a category name, or a difficulty */
30
+ target: string;
31
+ /** The metric value (false-positive rate, F1, or detection rate); which one applies presumably depends on `category` — confirm */
32
+ metric: number;
33
+ /** Human-readable recommendation */
34
+ recommendation: string;
35
+ }
36
+ export interface OptimizationResult {
37
+ amendments: PromptAmendment[]; // prompt amendments produced by the optimization
38
+ insights: OptimizerInsight[]; // weaknesses identified in the analyzed snapshot
39
+ /** Estimated F1 improvement from applying the amendments (scale, e.g. fraction vs. percentage points, is not visible here — confirm) */
40
+ projectedF1Improvement: number;
41
+ /** Summary statistics for this optimization result */
42
+ summary: {
43
+ worstJudges: string[];
44
+ worstCategories: string[];
45
+ amendmentsGenerated: number;
46
+ currentF1: number;
47
+ projectedF1: number;
48
+ };
49
+ }
50
+ export interface AmendmentStore {
51
+ /** Schema version; typed as the literal `1`, so no other value type-checks */
52
+ version: 1;
53
+ /** Active amendments to apply on the next benchmark run */
54
+ amendments: PromptAmendment[];
55
+ /** History of past optimizations, one entry per optimization (ordering is not specified by this declaration) */
56
+ history: Array<{
57
+ timestamp: string;
58
+ snapshotF1: number;
59
+ amendmentsApplied: number;
60
+ amendmentsGenerated: number;
61
+ }>;
62
+ }
63
+ /**
64
+ * Analyze a benchmark snapshot and produce an OptimizationResult (amendments, insights, projected F1 improvement, summary).
65
+ * This is the main self-teaching entry point; `existingAmendments` is optional (how it influences the result is not visible from this declaration).
66
+ */
67
+ export declare function optimizeBenchmark(snapshot: LlmBenchmarkSnapshot, existingAmendments?: PromptAmendment[]): OptimizationResult;
68
+ /**
69
+ * Format the given amendments as a prompt section to inject into tribunal/per-judge prompts.
70
+ * Returns an empty string if there are no amendments.
71
+ */
72
+ export declare function formatAmendmentSection(amendments: PromptAmendment[]): string;
73
+ export declare function createEmptyStore(): AmendmentStore; // Returns an AmendmentStore; per its name presumably one with no amendments or history — confirm against the implementation
74
+ /**
75
+ * Merge the new amendments from `result` into the existing `store` and return an AmendmentStore (whether `store` itself is mutated is not visible from this declaration).
76
+ * Newer amendments for the same judge prefix replace older ones.
77
+ */
78
+ export declare function mergeAmendments(store: AmendmentStore, result: OptimizationResult, snapshotF1: number): AmendmentStore;