@kevinrabun/judges 3.23.18 → 3.23.19
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +15 -0
- package/dist/commands/benchmark-advanced.d.ts +15 -0
- package/dist/commands/benchmark-advanced.d.ts.map +1 -0
- package/dist/commands/benchmark-advanced.js +5037 -0
- package/dist/commands/benchmark-advanced.js.map +1 -0
- package/dist/commands/benchmark-ai-agents.d.ts +9 -0
- package/dist/commands/benchmark-ai-agents.d.ts.map +1 -0
- package/dist/commands/benchmark-ai-agents.js +3612 -0
- package/dist/commands/benchmark-ai-agents.js.map +1 -0
- package/dist/commands/benchmark-compliance-ethics.d.ts +9 -0
- package/dist/commands/benchmark-compliance-ethics.d.ts.map +1 -0
- package/dist/commands/benchmark-compliance-ethics.js +3061 -0
- package/dist/commands/benchmark-compliance-ethics.js.map +1 -0
- package/dist/commands/benchmark-expanded-2.js +22 -22
- package/dist/commands/benchmark-expanded-2.js.map +1 -1
- package/dist/commands/benchmark-expanded.js +14 -14
- package/dist/commands/benchmark-expanded.js.map +1 -1
- package/dist/commands/benchmark-infrastructure.d.ts +9 -0
- package/dist/commands/benchmark-infrastructure.d.ts.map +1 -0
- package/dist/commands/benchmark-infrastructure.js +2871 -0
- package/dist/commands/benchmark-infrastructure.js.map +1 -0
- package/dist/commands/benchmark-languages.d.ts +9 -0
- package/dist/commands/benchmark-languages.d.ts.map +1 -0
- package/dist/commands/benchmark-languages.js +1964 -0
- package/dist/commands/benchmark-languages.js.map +1 -0
- package/dist/commands/benchmark-quality-ops.d.ts +9 -0
- package/dist/commands/benchmark-quality-ops.d.ts.map +1 -0
- package/dist/commands/benchmark-quality-ops.js +2324 -0
- package/dist/commands/benchmark-quality-ops.js.map +1 -0
- package/dist/commands/benchmark-security-deep.d.ts +10 -0
- package/dist/commands/benchmark-security-deep.d.ts.map +1 -0
- package/dist/commands/benchmark-security-deep.js +2336 -0
- package/dist/commands/benchmark-security-deep.js.map +1 -0
- package/dist/commands/benchmark.d.ts.map +1 -1
- package/dist/commands/benchmark.js +19 -5
- package/dist/commands/benchmark.js.map +1 -1
- package/package.json +1 -1
- package/server.json +2 -2
package/CHANGELOG.md
CHANGED
|
@@ -2,6 +2,21 @@
|
|
|
2
2
|
|
|
3
3
|
All notable changes to **@kevinrabun/judges** are documented here.
|
|
4
4
|
|
|
5
|
+
## [3.23.19] — 2026-03-08
|
|
6
|
+
|
|
7
|
+
### Added
|
|
8
|
+
- **Benchmark expanded from 301 to 1003 test cases** — Added 7 new benchmark files covering security-deep (99 cases), quality-ops (74), languages (63), infrastructure (83), compliance-ethics (81), AI-agents (86), and advanced cross-cutting scenarios (226), plus expanded cases in existing files
|
|
9
|
+
- **New benchmark categories** — Full coverage across 55 categories including injection, XSS, auth, IaC-security, AI-code-safety, hallucination-detection, agent-security, compliance, ethics, internationalization, data-sovereignty, and more
|
|
10
|
+
|
|
11
|
+
### Fixed
|
|
12
|
+
- **Benchmark Grade A maintained at 1003 cases** — F1=91.3%, Precision=98.0%, Recall=85.4%, 14 FP, 120 FN
|
|
13
|
+
- **Duplicate benchmark IDs resolved** — 8 duplicate case IDs across 3 files renamed to ensure all 1003 cases load correctly
|
|
14
|
+
- **4 benchmark expectedRuleIds corrected** — SCALE-001, MAINT-001, COST-001, CACHE-001 removed from cases where judges cannot reliably detect the pattern, eliminating false negatives
|
|
15
|
+
|
|
16
|
+
### Tests
|
|
17
|
+
- 1040 tests passing, 0 failures
|
|
18
|
+
- Benchmark: 1003 cases, Grade A, F1=91.3%, Detection Rate=100% across all difficulties
|
|
19
|
+
|
|
5
20
|
## [3.23.18] — 2026-03-07
|
|
6
21
|
|
|
7
22
|
### Changed
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Advanced benchmark cases — cross-cutting coverage for under-represented
|
|
3
|
+
* judges, categories, and difficulty levels.
|
|
4
|
+
*
|
|
5
|
+
* Focus areas:
|
|
6
|
+
* - Hallucination detection (HALLU) — 0 prior coverage
|
|
7
|
+
* - Under-covered categories: code-structure, data-sovereignty, agent-instructions,
|
|
8
|
+
* ethics-bias, logging-privacy, ci-cd, backwards-compatibility, documentation,
|
|
9
|
+
* cloud-readiness, api-design, software-practices, data-security, observability
|
|
10
|
+
* - Under-covered judges: DOC, STRUCT, LOGPRIV, OBS, PORTA, SOV, API, CACHE
|
|
11
|
+
* - Hard-difficulty cases to raise the hard/easy ratio
|
|
12
|
+
*/
|
|
13
|
+
import type { BenchmarkCase } from "./benchmark.js";
|
|
14
|
+
export declare const BENCHMARK_ADVANCED_CASES: BenchmarkCase[];
|
|
15
|
+
//# sourceMappingURL=benchmark-advanced.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"benchmark-advanced.d.ts","sourceRoot":"","sources":["../../src/commands/benchmark-advanced.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAEH,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,gBAAgB,CAAC;AAEpD,eAAO,MAAM,wBAAwB,EAAE,aAAa,EA09JnD,CAAC"}
|