@kevinrabun/judges 3.115.4 → 3.117.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/agents/accessibility.judge.md +7 -0
- package/agents/agent-instructions.judge.md +7 -0
- package/agents/ai-code-safety.judge.md +7 -0
- package/agents/api-contract.judge.md +7 -0
- package/agents/api-design.judge.md +7 -0
- package/agents/authentication.judge.md +7 -0
- package/agents/backwards-compatibility.judge.md +7 -0
- package/agents/caching.judge.md +7 -0
- package/agents/ci-cd.judge.md +7 -0
- package/agents/cloud-readiness.judge.md +7 -0
- package/agents/concurrency.judge.md +7 -0
- package/agents/configuration-management.judge.md +7 -0
- package/agents/cybersecurity.judge.md +7 -0
- package/agents/data-security.judge.md +7 -0
- package/agents/dependency-health.judge.md +7 -0
- package/agents/documentation.judge.md +7 -0
- package/agents/error-handling.judge.md +7 -0
- package/agents/ethics-bias.judge.md +7 -0
- package/agents/false-positive-review.judge.md +12 -0
- package/agents/framework-safety.judge.md +7 -0
- package/agents/hallucination-detection.judge.md +13 -0
- package/agents/iac-security.judge.md +7 -0
- package/agents/intent-alignment.judge.md +13 -0
- package/agents/logging-privacy.judge.md +7 -0
- package/agents/maintainability.judge.md +7 -0
- package/agents/multi-turn-coherence.judge.md +7 -0
- package/agents/observability.judge.md +7 -0
- package/agents/portability.judge.md +7 -0
- package/agents/rate-limiting.judge.md +7 -0
- package/agents/reliability.judge.md +7 -0
- package/agents/security.judge.md +13 -0
- package/agents/testing.judge.md +7 -0
- package/agents/ux.judge.md +7 -0
- package/dist/a2a-protocol.d.ts +136 -0
- package/dist/a2a-protocol.js +218 -0
- package/dist/api.d.ts +21 -3
- package/dist/api.js +21 -1
- package/dist/audit-trail.d.ts +245 -0
- package/dist/audit-trail.js +257 -0
- package/dist/commands/benchmark-advanced.js +51 -51
- package/dist/commands/benchmark-ai-agents.js +16 -16
- package/dist/commands/benchmark-compliance-ethics.js +12 -12
- package/dist/commands/benchmark-expanded-2.js +2 -2
- package/dist/commands/benchmark-expanded.js +2 -2
- package/dist/commands/benchmark-infrastructure.js +12 -12
- package/dist/commands/benchmark-languages.js +11 -11
- package/dist/commands/benchmark-quality-ops.js +7 -7
- package/dist/commands/benchmark-security-deep.js +9 -9
- package/dist/commands/benchmark.js +1 -1
- package/dist/commands/llm-benchmark-optimizer.d.ts +78 -0
- package/dist/commands/llm-benchmark-optimizer.js +241 -0
- package/dist/commands/llm-benchmark.d.ts +4 -2
- package/dist/commands/llm-benchmark.js +40 -12
- package/dist/escalation.d.ts +100 -0
- package/dist/escalation.js +292 -0
- package/dist/evaluation-session.d.ts +74 -0
- package/dist/evaluation-session.js +152 -0
- package/dist/evaluators/index.d.ts +23 -1
- package/dist/evaluators/index.js +192 -3
- package/dist/evaluators/judge-selector.d.ts +19 -0
- package/dist/evaluators/judge-selector.js +141 -0
- package/dist/evaluators/recall-boost.d.ts +27 -0
- package/dist/evaluators/recall-boost.js +409 -0
- package/dist/feedback-loop.d.ts +62 -0
- package/dist/feedback-loop.js +179 -0
- package/dist/index.js +2 -0
- package/dist/judges/accessibility.js +7 -0
- package/dist/judges/agent-instructions.js +7 -0
- package/dist/judges/ai-code-safety.js +7 -0
- package/dist/judges/api-contract.js +7 -0
- package/dist/judges/api-design.js +7 -0
- package/dist/judges/authentication.js +7 -0
- package/dist/judges/backwards-compatibility.js +7 -0
- package/dist/judges/caching.js +7 -0
- package/dist/judges/ci-cd.js +7 -0
- package/dist/judges/cloud-readiness.js +7 -0
- package/dist/judges/concurrency.js +7 -0
- package/dist/judges/configuration-management.js +7 -0
- package/dist/judges/cybersecurity.js +7 -0
- package/dist/judges/data-security.js +7 -0
- package/dist/judges/dependency-health.js +7 -0
- package/dist/judges/documentation.js +7 -0
- package/dist/judges/error-handling.js +7 -0
- package/dist/judges/ethics-bias.js +7 -0
- package/dist/judges/false-positive-review.js +13 -1
- package/dist/judges/framework-safety.js +7 -0
- package/dist/judges/hallucination-detection.js +14 -1
- package/dist/judges/iac-security.js +7 -0
- package/dist/judges/intent-alignment.js +14 -1
- package/dist/judges/logging-privacy.js +7 -0
- package/dist/judges/maintainability.js +7 -0
- package/dist/judges/multi-turn-coherence.js +7 -0
- package/dist/judges/observability.js +7 -0
- package/dist/judges/portability.js +7 -0
- package/dist/judges/rate-limiting.js +7 -0
- package/dist/judges/reliability.js +7 -0
- package/dist/judges/security.js +14 -1
- package/dist/judges/testing.js +7 -0
- package/dist/judges/ux.js +7 -0
- package/dist/review-conversation.d.ts +87 -0
- package/dist/review-conversation.js +307 -0
- package/dist/sast-integration.d.ts +112 -0
- package/dist/sast-integration.js +215 -0
- package/dist/tools/register-evaluation.js +208 -8
- package/dist/tools/register-fix.js +24 -1
- package/dist/tools/register-resources.d.ts +6 -0
- package/dist/tools/register-resources.js +177 -0
- package/dist/tools/register-review.js +26 -1
- package/dist/tools/register-workflow.js +384 -11
- package/dist/tools/validation.d.ts +13 -0
- package/dist/tools/validation.js +77 -0
- package/dist/types.d.ts +122 -0
- package/package.json +25 -12
- package/server.json +2 -2
|
@@ -2258,7 +2258,7 @@ jobs:
|
|
|
2258
2258
|
run: |
|
|
2259
2259
|
echo \${{ github.event.pull_request.title }} # Script injection
|
|
2260
2260
|
./deploy.sh`,
|
|
2261
|
-
expectedRuleIds: [],
|
|
2261
|
+
expectedRuleIds: ["CICD-001"],
|
|
2262
2262
|
category: "ci-cd",
|
|
2263
2263
|
difficulty: "medium",
|
|
2264
2264
|
},
|
|
@@ -4200,7 +4200,7 @@ spec:
|
|
|
4200
4200
|
await new Promise(resolve => setTimeout(resolve, 10));
|
|
4201
4201
|
}
|
|
4202
4202
|
}`,
|
|
4203
|
-
expectedRuleIds: [],
|
|
4203
|
+
expectedRuleIds: ["CONC-001"],
|
|
4204
4204
|
category: "concurrency",
|
|
4205
4205
|
difficulty: "hard",
|
|
4206
4206
|
},
|
|
@@ -1644,7 +1644,7 @@ def predict_approval(model, applicant):
|
|
|
1644
1644
|
features = [applicant['age'], applicant['income'], applicant['race'],
|
|
1645
1645
|
applicant['gender'], applicant['zip_code'], applicant['credit_score']]
|
|
1646
1646
|
return model.predict([features])[0]`,
|
|
1647
|
-
expectedRuleIds: [],
|
|
1647
|
+
expectedRuleIds: ["ETHICS-001"],
|
|
1648
1648
|
category: "ethics-bias",
|
|
1649
1649
|
difficulty: "medium",
|
|
1650
1650
|
},
|
|
@@ -2309,7 +2309,7 @@ form = cgi.FieldStorage()
|
|
|
2309
2309
|
username = form.getfirst("username")
|
|
2310
2310
|
|
|
2311
2311
|
module = imp.load_source("config", "/etc/app/config.py")`,
|
|
2312
|
-
expectedRuleIds: [],
|
|
2312
|
+
expectedRuleIds: ["FW-001"],
|
|
2313
2313
|
category: "ai-code-safety",
|
|
2314
2314
|
difficulty: "medium",
|
|
2315
2315
|
},
|
|
@@ -259,7 +259,7 @@ const GCP_SERVICE_KEY = '{"type":"service_account","project_id":"my-project","pr
|
|
|
259
259
|
};
|
|
260
260
|
|
|
261
261
|
export default config;`,
|
|
262
|
-
expectedRuleIds: [],
|
|
262
|
+
expectedRuleIds: ["CFG-001"],
|
|
263
263
|
category: "configuration",
|
|
264
264
|
difficulty: "easy",
|
|
265
265
|
},
|
|
@@ -748,7 +748,7 @@ resource "aws_s3_bucket_versioning" "sensitive_versioning" {
|
|
|
748
748
|
status = "Enabled"
|
|
749
749
|
}
|
|
750
750
|
}`,
|
|
751
|
-
expectedRuleIds: [],
|
|
751
|
+
expectedRuleIds: ["IAC-001"],
|
|
752
752
|
category: "iac-security",
|
|
753
753
|
difficulty: "medium",
|
|
754
754
|
},
|
|
@@ -841,7 +841,7 @@ resource "aws_volume_attachment" "data_attach" {
|
|
|
841
841
|
volume_id = aws_ebs_volume.data_volume.id
|
|
842
842
|
instance_id = aws_instance.app_server.id
|
|
843
843
|
}`,
|
|
844
|
-
expectedRuleIds: [],
|
|
844
|
+
expectedRuleIds: ["IAC-001"],
|
|
845
845
|
category: "iac-security",
|
|
846
846
|
difficulty: "easy",
|
|
847
847
|
},
|
|
@@ -1259,7 +1259,7 @@ jobs:
|
|
|
1259
1259
|
env:
|
|
1260
1260
|
DB_URL: \${{ secrets.DB_URL }}
|
|
1261
1261
|
run: npm run test:integration`,
|
|
1262
|
-
expectedRuleIds: [],
|
|
1262
|
+
expectedRuleIds: ["CICD-001"],
|
|
1263
1263
|
category: "cicd",
|
|
1264
1264
|
difficulty: "hard",
|
|
1265
1265
|
},
|
|
@@ -1340,7 +1340,7 @@ resource "google_compute_firewall" "allow_all" {
|
|
|
1340
1340
|
|
|
1341
1341
|
source_ranges = ["0.0.0.0/0"]
|
|
1342
1342
|
}`,
|
|
1343
|
-
expectedRuleIds: [],
|
|
1343
|
+
expectedRuleIds: ["CLOUD-001"],
|
|
1344
1344
|
category: "cloud",
|
|
1345
1345
|
difficulty: "medium",
|
|
1346
1346
|
},
|
|
@@ -1375,7 +1375,7 @@ resource "aws_db_instance" "mysql_prod" {
|
|
|
1375
1375
|
deletion_protection = false
|
|
1376
1376
|
backup_retention_period = 1
|
|
1377
1377
|
}`,
|
|
1378
|
-
expectedRuleIds: [],
|
|
1378
|
+
expectedRuleIds: ["CLOUD-001"],
|
|
1379
1379
|
category: "cloud",
|
|
1380
1380
|
difficulty: "hard",
|
|
1381
1381
|
},
|
|
@@ -1406,7 +1406,7 @@ resource "aws_db_instance" "mysql_prod" {
|
|
|
1406
1406
|
}
|
|
1407
1407
|
|
|
1408
1408
|
# No NAT Gateway configured — Lambda cannot reach external APIs`,
|
|
1409
|
-
expectedRuleIds: [],
|
|
1409
|
+
expectedRuleIds: ["CLOUD-001"],
|
|
1410
1410
|
category: "cloud",
|
|
1411
1411
|
difficulty: "hard",
|
|
1412
1412
|
},
|
|
@@ -1556,7 +1556,7 @@ resource "aws_rds_cluster" "analytics" {
|
|
|
1556
1556
|
}
|
|
1557
1557
|
|
|
1558
1558
|
# No tags on any resource — impossible to track costs per team/project`,
|
|
1559
|
-
expectedRuleIds: [],
|
|
1559
|
+
expectedRuleIds: ["COST-001"],
|
|
1560
1560
|
category: "cost-effectiveness",
|
|
1561
1561
|
difficulty: "easy",
|
|
1562
1562
|
},
|
|
@@ -1591,7 +1591,7 @@ const serverlessConfig = {
|
|
|
1591
1591
|
},
|
|
1592
1592
|
},
|
|
1593
1593
|
};`,
|
|
1594
|
-
expectedRuleIds: [],
|
|
1594
|
+
expectedRuleIds: ["COST-001"],
|
|
1595
1595
|
category: "cost-effectiveness",
|
|
1596
1596
|
difficulty: "medium",
|
|
1597
1597
|
},
|
|
@@ -1661,7 +1661,7 @@ app.post("/api/orders", async (req, res) => {
|
|
|
1661
1661
|
}
|
|
1662
1662
|
|
|
1663
1663
|
// Each instance has its own rate limiter — no coordination across replicas`,
|
|
1664
|
-
expectedRuleIds: [],
|
|
1664
|
+
expectedRuleIds: ["SCALE-001"],
|
|
1665
1665
|
category: "scalability",
|
|
1666
1666
|
difficulty: "hard",
|
|
1667
1667
|
},
|
|
@@ -1908,7 +1908,7 @@ const server = new ApolloServer({
|
|
|
1908
1908
|
});
|
|
1909
1909
|
|
|
1910
1910
|
startStandaloneServer(server, { listen: { port: 4000 } });`,
|
|
1911
|
-
expectedRuleIds: [],
|
|
1911
|
+
expectedRuleIds: ["RATE-001"],
|
|
1912
1912
|
category: "rate-limiting",
|
|
1913
1913
|
difficulty: "hard",
|
|
1914
1914
|
},
|
|
@@ -1946,7 +1946,7 @@ wss.on("connection", (ws) => {
|
|
|
1946
1946
|
|
|
1947
1947
|
ws.on("close", () => clients.delete(clientId));
|
|
1948
1948
|
});`,
|
|
1949
|
-
expectedRuleIds: [],
|
|
1949
|
+
expectedRuleIds: ["RATE-001"],
|
|
1950
1950
|
category: "rate-limiting",
|
|
1951
1951
|
difficulty: "medium",
|
|
1952
1952
|
},
|
|
@@ -103,7 +103,7 @@ func ReadConfig(path string) (*Config, error) {
|
|
|
103
103
|
}
|
|
104
104
|
return &cfg, nil
|
|
105
105
|
}`,
|
|
106
|
-
expectedRuleIds: [],
|
|
106
|
+
expectedRuleIds: ["ERR-001"],
|
|
107
107
|
category: "error-handling",
|
|
108
108
|
difficulty: "medium",
|
|
109
109
|
},
|
|
@@ -496,7 +496,7 @@ post '/webhook' do
|
|
|
496
496
|
response = URI.open(payload['callback_url']).read
|
|
497
497
|
{ status: 'delivered', response: response }.to_json
|
|
498
498
|
end`,
|
|
499
|
-
expectedRuleIds: [],
|
|
499
|
+
expectedRuleIds: ["SEC-001"],
|
|
500
500
|
category: "security",
|
|
501
501
|
difficulty: "medium",
|
|
502
502
|
},
|
|
@@ -604,7 +604,7 @@ public class SessionManager
|
|
|
604
604
|
return stream.ToArray();
|
|
605
605
|
}
|
|
606
606
|
}`,
|
|
607
|
-
expectedRuleIds: [],
|
|
607
|
+
expectedRuleIds: ["SEC-001"],
|
|
608
608
|
category: "security",
|
|
609
609
|
difficulty: "medium",
|
|
610
610
|
},
|
|
@@ -646,7 +646,7 @@ public class AdminController : ControllerBase
|
|
|
646
646
|
return Ok(logs);
|
|
647
647
|
}
|
|
648
648
|
}`,
|
|
649
|
-
expectedRuleIds: [],
|
|
649
|
+
expectedRuleIds: ["AUTH-001"],
|
|
650
650
|
category: "auth",
|
|
651
651
|
difficulty: "medium",
|
|
652
652
|
},
|
|
@@ -682,7 +682,7 @@ public class AdminController : ControllerBase
|
|
|
682
682
|
return conn;
|
|
683
683
|
}
|
|
684
684
|
}`,
|
|
685
|
-
expectedRuleIds: [],
|
|
685
|
+
expectedRuleIds: ["CONC-001"],
|
|
686
686
|
category: "concurrency",
|
|
687
687
|
difficulty: "medium",
|
|
688
688
|
},
|
|
@@ -877,7 +877,7 @@ def process_order(items, discount_code=None):
|
|
|
877
877
|
assert all(item['qty'] > 0 for item in items), "Quantities must be positive"
|
|
878
878
|
if discount_code:
|
|
879
879
|
assert len(discount_code) == 8, "Invalid discount code"`,
|
|
880
|
-
expectedRuleIds: [],
|
|
880
|
+
expectedRuleIds: ["ERR-001"],
|
|
881
881
|
category: "error-handling",
|
|
882
882
|
difficulty: "medium",
|
|
883
883
|
},
|
|
@@ -909,7 +909,7 @@ void copyData(const char* src) {
|
|
|
909
909
|
sprintf(dest, "Data: %s (processed at %s)", src, __TIME__);
|
|
910
910
|
processOutput(dest);
|
|
911
911
|
}`,
|
|
912
|
-
expectedRuleIds: [],
|
|
912
|
+
expectedRuleIds: ["SEC-001"],
|
|
913
913
|
category: "security",
|
|
914
914
|
difficulty: "easy",
|
|
915
915
|
},
|
|
@@ -945,7 +945,7 @@ public:
|
|
|
945
945
|
}
|
|
946
946
|
}
|
|
947
947
|
};`,
|
|
948
|
-
expectedRuleIds: [],
|
|
948
|
+
expectedRuleIds: ["SEC-001"],
|
|
949
949
|
category: "security",
|
|
950
950
|
difficulty: "hard",
|
|
951
951
|
},
|
|
@@ -1697,7 +1697,7 @@ class ApiClient {
|
|
|
1697
1697
|
let coords = address["coordinates"] as! [Double]
|
|
1698
1698
|
return UserProfile(name: name, age: age, city: city, lat: coords[0], lon: coords[1])
|
|
1699
1699
|
}`,
|
|
1700
|
-
expectedRuleIds: [],
|
|
1700
|
+
expectedRuleIds: ["ERR-001"],
|
|
1701
1701
|
category: "error-handling",
|
|
1702
1702
|
difficulty: "easy",
|
|
1703
1703
|
},
|
|
@@ -1739,7 +1739,7 @@ server <- function(input, output, session) {
|
|
|
1739
1739
|
end
|
|
1740
1740
|
end
|
|
1741
1741
|
end`,
|
|
1742
|
-
expectedRuleIds: [],
|
|
1742
|
+
expectedRuleIds: ["SEC-001"],
|
|
1743
1743
|
category: "security",
|
|
1744
1744
|
difficulty: "hard",
|
|
1745
1745
|
},
|
|
@@ -1762,7 +1762,7 @@ function executeUserCode(code)
|
|
|
1762
1762
|
local fn = loadstring(code)
|
|
1763
1763
|
fn()
|
|
1764
1764
|
end`,
|
|
1765
|
-
expectedRuleIds: [],
|
|
1765
|
+
expectedRuleIds: ["SEC-001"],
|
|
1766
1766
|
category: "security",
|
|
1767
1767
|
difficulty: "medium",
|
|
1768
1768
|
},
|
|
@@ -223,7 +223,7 @@ function connectDatabase(url: string) {
|
|
|
223
223
|
process.exit(1);
|
|
224
224
|
}
|
|
225
225
|
}`,
|
|
226
|
-
expectedRuleIds: [],
|
|
226
|
+
expectedRuleIds: ["ERR-001"],
|
|
227
227
|
category: "error-handling",
|
|
228
228
|
difficulty: "medium",
|
|
229
229
|
},
|
|
@@ -647,7 +647,7 @@ async function cleanup(userId: string) {
|
|
|
647
647
|
clearUserCache(userId);
|
|
648
648
|
revokeTokens(userId);
|
|
649
649
|
}`,
|
|
650
|
-
expectedRuleIds: [],
|
|
650
|
+
expectedRuleIds: ["CONC-001"],
|
|
651
651
|
category: "concurrency",
|
|
652
652
|
difficulty: "medium",
|
|
653
653
|
},
|
|
@@ -1964,7 +1964,7 @@ async function deleteUser(userId: string) {
|
|
|
1964
1964
|
expect(mockDb.create).toHaveBeenCalled();
|
|
1965
1965
|
});
|
|
1966
1966
|
});`,
|
|
1967
|
-
expectedRuleIds: [],
|
|
1967
|
+
expectedRuleIds: ["TEST-001"],
|
|
1968
1968
|
category: "testing",
|
|
1969
1969
|
difficulty: "hard",
|
|
1970
1970
|
},
|
|
@@ -2058,7 +2058,7 @@ public class OrderController {
|
|
|
2058
2058
|
return ResponseEntity.ok(order);
|
|
2059
2059
|
}
|
|
2060
2060
|
}`,
|
|
2061
|
-
expectedRuleIds: [],
|
|
2061
|
+
expectedRuleIds: ["OBS-001"],
|
|
2062
2062
|
category: "observability",
|
|
2063
2063
|
difficulty: "medium",
|
|
2064
2064
|
},
|
|
@@ -2155,7 +2155,7 @@ export function processOrders() {
|
|
|
2155
2155
|
# Set MYLIB_HOST (removed in v4, now uses MYLIB_URL)
|
|
2156
2156
|
# Set MYLIB_PORT (no longer needed)
|
|
2157
2157
|
# Set MYLIB_SSL=true (now always enabled)`,
|
|
2158
|
-
expectedRuleIds: [],
|
|
2158
|
+
expectedRuleIds: ["DOC-001"],
|
|
2159
2159
|
category: "documentation",
|
|
2160
2160
|
difficulty: "medium",
|
|
2161
2161
|
},
|
|
@@ -2177,7 +2177,7 @@ jobs:
|
|
|
2177
2177
|
- run: npm run build
|
|
2178
2178
|
- run: aws s3 sync build/ s3://prod-bucket/
|
|
2179
2179
|
- run: aws cloudfront create-invalidation --distribution-id EXAMPLE --paths '/*'`,
|
|
2180
|
-
expectedRuleIds: [],
|
|
2180
|
+
expectedRuleIds: ["CICD-001"],
|
|
2181
2181
|
category: "ci-cd",
|
|
2182
2182
|
difficulty: "easy",
|
|
2183
2183
|
},
|
|
@@ -2198,7 +2198,7 @@ jobs:
|
|
|
2198
2198
|
steps:
|
|
2199
2199
|
- uses: actions/checkout@v4
|
|
2200
2200
|
- run: npm run deploy`,
|
|
2201
|
-
expectedRuleIds: [],
|
|
2201
|
+
expectedRuleIds: ["CICD-001"],
|
|
2202
2202
|
category: "ci-cd",
|
|
2203
2203
|
difficulty: "easy",
|
|
2204
2204
|
},
|
|
@@ -73,7 +73,7 @@ public class FetchServlet extends HttpServlet {
|
|
|
73
73
|
}
|
|
74
74
|
}
|
|
75
75
|
}`,
|
|
76
|
-
expectedRuleIds: [],
|
|
76
|
+
expectedRuleIds: ["SEC-001"],
|
|
77
77
|
category: "security",
|
|
78
78
|
difficulty: "medium",
|
|
79
79
|
},
|
|
@@ -166,7 +166,7 @@ app.delete("/items", async (req, res) => {
|
|
|
166
166
|
const result = await db.collection("items").deleteMany(filter);
|
|
167
167
|
res.json({ deleted: result.deletedCount });
|
|
168
168
|
});`,
|
|
169
|
-
expectedRuleIds: [],
|
|
169
|
+
expectedRuleIds: ["SEC-001"],
|
|
170
170
|
category: "injection",
|
|
171
171
|
difficulty: "medium",
|
|
172
172
|
},
|
|
@@ -325,7 +325,7 @@ public class ImportController : ControllerBase
|
|
|
325
325
|
return Ok(obj.ToString());
|
|
326
326
|
}
|
|
327
327
|
}`,
|
|
328
|
-
expectedRuleIds: [],
|
|
328
|
+
expectedRuleIds: ["SEC-001"],
|
|
329
329
|
category: "security",
|
|
330
330
|
difficulty: "medium",
|
|
331
331
|
},
|
|
@@ -346,7 +346,7 @@ def parse_xml():
|
|
|
346
346
|
tree = ET.parse(request.stream)
|
|
347
347
|
root = tree.getroot()
|
|
348
348
|
return root.tag`,
|
|
349
|
-
expectedRuleIds: [],
|
|
349
|
+
expectedRuleIds: ["SEC-001"],
|
|
350
350
|
category: "security",
|
|
351
351
|
difficulty: "medium",
|
|
352
352
|
},
|
|
@@ -491,7 +491,7 @@ CORS(app, origins="*", supports_credentials=True)
|
|
|
491
491
|
@app.route('/api/profile')
|
|
492
492
|
def profile():
|
|
493
493
|
return {"email": "user@example.com"}`,
|
|
494
|
-
expectedRuleIds: [],
|
|
494
|
+
expectedRuleIds: ["SEC-001"],
|
|
495
495
|
category: "security",
|
|
496
496
|
difficulty: "easy",
|
|
497
497
|
},
|
|
@@ -527,7 +527,7 @@ function encrypt(data: string): string {
|
|
|
527
527
|
encrypted += cipher.final("hex");
|
|
528
528
|
return encrypted;
|
|
529
529
|
}`,
|
|
530
|
-
expectedRuleIds: [],
|
|
530
|
+
expectedRuleIds: ["SEC-001"],
|
|
531
531
|
category: "security",
|
|
532
532
|
difficulty: "hard",
|
|
533
533
|
},
|
|
@@ -922,7 +922,7 @@ func searchHandler(w http.ResponseWriter, r *http.Request) {
|
|
|
922
922
|
render json: @products
|
|
923
923
|
end
|
|
924
924
|
end`,
|
|
925
|
-
expectedRuleIds: [],
|
|
925
|
+
expectedRuleIds: ["SEC-001"],
|
|
926
926
|
category: "injection",
|
|
927
927
|
difficulty: "easy",
|
|
928
928
|
},
|
|
@@ -948,7 +948,7 @@ def ping():
|
|
|
948
948
|
text=True
|
|
949
949
|
)
|
|
950
950
|
return result.stdout`,
|
|
951
|
-
expectedRuleIds: [],
|
|
951
|
+
expectedRuleIds: ["SEC-001"],
|
|
952
952
|
category: "injection",
|
|
953
953
|
difficulty: "easy",
|
|
954
954
|
},
|
|
@@ -1067,7 +1067,7 @@ func greetHandler(w http.ResponseWriter, r *http.Request) {
|
|
|
1067
1067
|
name := r.URL.Query().Get("name")
|
|
1068
1068
|
fmt.Fprintf(w, "<h1>Hello %s</h1>", name)
|
|
1069
1069
|
}`,
|
|
1070
|
-
expectedRuleIds: [],
|
|
1070
|
+
expectedRuleIds: ["SEC-001"],
|
|
1071
1071
|
category: "xss",
|
|
1072
1072
|
difficulty: "easy",
|
|
1073
1073
|
},
|
|
@@ -748,7 +748,7 @@ async function processImage(imageBuffer: Buffer) {
|
|
|
748
748
|
|
|
749
749
|
// Connection pool with excessive connections
|
|
750
750
|
const pool = new Pool({ host: "db.server.com", max: 500, idleTimeoutMillis: 0 });`,
|
|
751
|
-
expectedRuleIds: [],
|
|
751
|
+
expectedRuleIds: ["COST-001"],
|
|
752
752
|
category: "cost-effectiveness",
|
|
753
753
|
difficulty: "medium",
|
|
754
754
|
},
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* LLM Benchmark Optimizer — Self-Teaching Feedback Loop
|
|
3
|
+
*
|
|
4
|
+
* Analyzes benchmark snapshots to identify systematic weaknesses
|
|
5
|
+
* (high-FP judges, problematic categories, difficulty gaps) and
|
|
6
|
+
* generates targeted prompt amendments that are applied on the
|
|
7
|
+
* next benchmark run to improve precision without sacrificing recall.
|
|
8
|
+
*
|
|
9
|
+
* Closed loop: run → analyze → amend prompts → run → better scores
|
|
10
|
+
*/
|
|
11
|
+
import type { LlmBenchmarkSnapshot } from "./llm-benchmark.js";
|
|
12
|
+
export interface PromptAmendment {
|
|
13
|
+
/** Judge rule prefix this amendment targets */
|
|
14
|
+
judgePrefix: string;
|
|
15
|
+
/** The amendment text to inject into prompts */
|
|
16
|
+
amendment: string;
|
|
17
|
+
/** Why this amendment was generated */
|
|
18
|
+
reason: string;
|
|
19
|
+
/** Historical FP rate that triggered this */
|
|
20
|
+
fpRate: number;
|
|
21
|
+
/** Benchmark run that generated this */
|
|
22
|
+
generatedFrom: string;
|
|
23
|
+
/** ISO timestamp */
|
|
24
|
+
timestamp: string;
|
|
25
|
+
}
|
|
26
|
+
export interface OptimizerInsight {
|
|
27
|
+
category: "high-fp-judge" | "missed-category" | "clean-case-leak" | "difficulty-gap";
|
|
28
|
+
severity: "critical" | "high" | "medium";
|
|
29
|
+
/** Target identifier (judge prefix, category name, difficulty) */
|
|
30
|
+
target: string;
|
|
31
|
+
/** The metric value (FP rate, F1, detection rate) */
|
|
32
|
+
metric: number;
|
|
33
|
+
/** Human-readable recommendation */
|
|
34
|
+
recommendation: string;
|
|
35
|
+
}
|
|
36
|
+
export interface OptimizationResult {
|
|
37
|
+
amendments: PromptAmendment[];
|
|
38
|
+
insights: OptimizerInsight[];
|
|
39
|
+
/** Estimated F1 improvement from applying amendments */
|
|
40
|
+
projectedF1Improvement: number;
|
|
41
|
+
/** Summary stats */
|
|
42
|
+
summary: {
|
|
43
|
+
worstJudges: string[];
|
|
44
|
+
worstCategories: string[];
|
|
45
|
+
amendmentsGenerated: number;
|
|
46
|
+
currentF1: number;
|
|
47
|
+
projectedF1: number;
|
|
48
|
+
};
|
|
49
|
+
}
|
|
50
|
+
export interface AmendmentStore {
|
|
51
|
+
/** Schema version */
|
|
52
|
+
version: 1;
|
|
53
|
+
/** Active amendments to apply on next run */
|
|
54
|
+
amendments: PromptAmendment[];
|
|
55
|
+
/** History of past optimizations */
|
|
56
|
+
history: Array<{
|
|
57
|
+
timestamp: string;
|
|
58
|
+
snapshotF1: number;
|
|
59
|
+
amendmentsApplied: number;
|
|
60
|
+
amendmentsGenerated: number;
|
|
61
|
+
}>;
|
|
62
|
+
}
|
|
63
|
+
/**
|
|
64
|
+
* Analyze a benchmark snapshot and produce optimization results.
|
|
65
|
+
* This is the main self-teaching entry point.
|
|
66
|
+
*/
|
|
67
|
+
export declare function optimizeBenchmark(snapshot: LlmBenchmarkSnapshot, existingAmendments?: PromptAmendment[]): OptimizationResult;
|
|
68
|
+
/**
|
|
69
|
+
* Format amendments as a prompt section to inject into tribunal/per-judge prompts.
|
|
70
|
+
* Returns empty string if no amendments.
|
|
71
|
+
*/
|
|
72
|
+
export declare function formatAmendmentSection(amendments: PromptAmendment[]): string;
|
|
73
|
+
export declare function createEmptyStore(): AmendmentStore;
|
|
74
|
+
/**
|
|
75
|
+
* Merge new amendments into existing store.
|
|
76
|
+
* Newer amendments for the same prefix replace older ones.
|
|
77
|
+
*/
|
|
78
|
+
export declare function mergeAmendments(store: AmendmentStore, result: OptimizationResult, snapshotF1: number): AmendmentStore;
|