agent-eval-harness 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +41 -0
- package/LICENSE +21 -0
- package/README.md +95 -0
- package/cli.js +212 -0
- package/index.js +34 -0
- package/init/_evals/cases.jsonl +2 -0
- package/init/_evals/fixtures/sample-agent.txt +3 -0
- package/init/_evals/schemas.json +14 -0
- package/init/agent-eval.config.json +10 -0
- package/init/agents/sample-agent.md +30 -0
- package/lib/agent-loader.js +47 -0
- package/lib/config.js +60 -0
- package/lib/extract-json.js +79 -0
- package/lib/text.js +87 -0
- package/package.json +51 -0
- package/suites/routing.js +55 -0
- package/suites/schema.js +48 -0
- package/suites/spawn.js +94 -0
- package/suites/static.js +45 -0
package/CHANGELOG.md
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to `agent-eval-harness` are documented here.
|
|
4
|
+
Format follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
|
|
5
|
+
Version policy: [SemVer](https://semver.org/) — major for breaking config/CLI changes, minor for new suites or check types, patch for bug fixes and tightened audits.
|
|
6
|
+
|
|
7
|
+
## [0.1.0] — 2026-05-09
|
|
8
|
+
|
|
9
|
+
Initial extraction from the in-tree harness at `~/.claude/agents/_evals/`.
|
|
10
|
+
|
|
11
|
+
### Added
|
|
12
|
+
|
|
13
|
+
- Library API at `index.js` — `loadConfig`, `loadAgents`, `loadCases`, `staticSuite`, `schemaSuite`, `fenceAudit`, `routingSuite`, `overlapAudit`, `spawnSuite`, `extractJson`, plus `text` helpers (stem, tokenize, jaccard, IDF, weighted-recall).
|
|
14
|
+
- CLI binary at `cli.js` — `agent-eval [--config=<path>] [--threshold=<n>] [--strict] [--static|--schema|--routing|--spawn|--all] [--verbose] [--json] [--init]`.
|
|
15
|
+
- Config loader at `lib/config.js` — JSON config with env-var override (`AGENT_EVAL_CONFIG`), defaults baked in, paths resolved relative to config file.
|
|
16
|
+
- `--init` scaffolder copying a working sample project (1 agent, 1 schema, 2 routing cases, 1 fixture) to cwd. Sample passes `--threshold=1.0`.
|
|
17
|
+
- README.md with quick-start, suite documentation, schema/case/fixture file shapes, and exit-code table.
|
|
18
|
+
|
|
19
|
+
### Suites
|
|
20
|
+
|
|
21
|
+
- **Static** (8 checks/agent): name, description length, tools count, tools whitelist, filename match, trigger phrasing, scope discipline, no recursion.
|
|
22
|
+
- **Schema** (1 check/agent + fence audit): return contract section + JSON shape declaration; flags ` ```json ` fences inside Return Contract sections (provoke mimicry on some LLMs).
|
|
23
|
+
- **Routing** (N cases + overlap audit): IDF-weighted recall ranks the right agent first with positive margin; flags description pairs ≥0.20 Jaccard.
|
|
24
|
+
- **Spawn** (M schema-bound agents): fixture parses as JSON, required fields present, types match, enums valid.
|
|
25
|
+
|
|
26
|
+
### Strict mode
|
|
27
|
+
|
|
28
|
+
`--strict` and `--threshold>=1.0` promote informational fence/overlap/no-fixture audits to blocking score-affecting checks.
|
|
29
|
+
|
|
30
|
+
### Known limitations
|
|
31
|
+
|
|
32
|
+
- Spawn suite is a recorded-fixture contract test, not a live spawn — Node can't invoke the Claude Code Task tool from a script. Fixtures must be captured manually (or by a separate spawn pipeline) and dropped into `fixtures/<name>.txt`.
|
|
33
|
+
- Stemmer is English-only. Non-English agent descriptions need a custom tokenizer.
|
|
34
|
+
- IDF is computed over agent descriptions, not a real corpus — small-N effects above ~30 agents may shift token weights.
|
|
35
|
+
|
|
36
|
+
## [Unreleased]
|
|
37
|
+
|
|
38
|
+
Planned:
|
|
39
|
+
- Live-spawn integration so the spawn suite can regenerate fixtures itself when an `--update-fixtures` flag is set (requires harness-side runner — out of scope for v0.1).
|
|
40
|
+
- Pluggable tokenizer for non-English agent definitions.
|
|
41
|
+
- Per-suite threshold (e.g. accept 95% routing + 100% static).
|
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Yorkis Estevez
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
# agent-eval-harness
|
|
2
|
+
|
|
3
|
+
Static + schema + routing + spawn-fixture eval harness for `*.md` subagent definitions.
|
|
4
|
+
|
|
5
|
+
Drop a directory of agents (Claude Code subagents, or any markdown-frontmatter agents) into a project, point the harness at it, and get a 100-point lint that catches description bloat, scope drift, fence-mimicry traps, low routing margin, and schema regressions before they ship.
|
|
6
|
+
|
|
7
|
+
## Quick start
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
# In a fresh directory
|
|
11
|
+
node /path/to/agent-eval-harness/cli.js --init
|
|
12
|
+
node /path/to/agent-eval-harness/cli.js --threshold=1.0
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
`--init` scaffolds `agents/`, `_evals/`, and an `agent-eval.config.json`. The sample agent passes 100% out of the box — copy its shape for your own.
|
|
16
|
+
|
|
17
|
+
## What it checks
|
|
18
|
+
|
|
19
|
+
| Suite | Per-agent | What it catches |
|
|
20
|
+
|---|---|---|
|
|
21
|
+
| **static** | 8 checks | missing name/description, too many tools, invalid tool names, file-name mismatch, missing trigger ("use when..."), no Scope/Hard-rules section, agent calls itself recursively |
|
|
22
|
+
| **schema** | 1 check + fence audit | no Return Contract section, no JSON shape declared, ` ```json ` fence inside Return Contract (provokes mimicry in some LLMs) |
|
|
23
|
+
| **routing** | N cases + overlap audit | wrong agent ranks first for a given prompt, zero margin between top two, two descriptions overlap ≥0.20 Jaccard |
|
|
24
|
+
| **spawn** | M schema-bound agents | fixture in `fixtures/<name>.txt` doesn't parse as JSON, missing required fields, type mismatches, enum violations |
|
|
25
|
+
|
|
26
|
+
Strict mode (`--strict` or `--threshold=1.0`) promotes the fence audit, overlap audit, and missing-fixture from informational to blocking.
|
|
27
|
+
|
|
28
|
+
## Config
|
|
29
|
+
|
|
30
|
+
`agent-eval.config.json` — paths and thresholds. Resolved relative to the config file's directory:
|
|
31
|
+
|
|
32
|
+
```json
|
|
33
|
+
{
|
|
34
|
+
"agentSourceDir": "./agents",
|
|
35
|
+
"fixturesDir": "./_evals/fixtures",
|
|
36
|
+
"schemasFile": "./_evals/schemas.json",
|
|
37
|
+
"casesFile": "./_evals/cases.jsonl",
|
|
38
|
+
"validTools": ["Read", "Write", "Edit", "Bash", "Grep", "Glob", "WebSearch", "WebFetch", "NotebookEdit", "Task"],
|
|
39
|
+
"minDescriptionChars": 40,
|
|
40
|
+
"maxTools": 5,
|
|
41
|
+
"defaultThreshold": 0.85
|
|
42
|
+
}
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
Override paths via `--config=path/to/cfg.json` or `AGENT_EVAL_CONFIG` env var.
|
|
46
|
+
|
|
47
|
+
## Library use
|
|
48
|
+
|
|
49
|
+
```js
|
|
50
|
+
const { loadConfig, loadAgents, staticSuite, schemaSuite, routingSuite, spawnSuite } = require('agent-eval-harness');
|
|
51
|
+
|
|
52
|
+
const config = loadConfig({ configPath: './agent-eval.config.json' });
|
|
53
|
+
const agents = loadAgents(config.agentSourceDir);
|
|
54
|
+
const results = staticSuite(agents, config);
|
|
55
|
+
// ... render however you want
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## Schema file shape
|
|
59
|
+
|
|
60
|
+
```json
|
|
61
|
+
{
|
|
62
|
+
"<agent-name>": {
|
|
63
|
+
"required": ["field1", "field2"],
|
|
64
|
+
"types": { "field1": "string", "field2": "number" },
|
|
65
|
+
"enums": { "field1": ["ok", "error"] },
|
|
66
|
+
"nested": {
|
|
67
|
+
"field2": { "required": ["sub1"], "types": { "sub1": "string" } }
|
|
68
|
+
}
|
|
69
|
+
}
|
|
70
|
+
}
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
## Cases file shape (cases.jsonl)
|
|
74
|
+
|
|
75
|
+
One JSON object per line:
|
|
76
|
+
|
|
77
|
+
```jsonl
|
|
78
|
+
{"id":"case-1","prompt":"some user prompt","expect_agent":"agent-name"}
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
## Fixture file shape
|
|
82
|
+
|
|
83
|
+
`fixtures/<agent-name>.txt` — a real recorded response from spawning the agent. Can include surrounding prose or fences; the harness extracts the JSON. Aim for one fixture per schema-bound agent.
|
|
84
|
+
|
|
85
|
+
## Exit codes
|
|
86
|
+
|
|
87
|
+
| Code | Meaning |
|
|
88
|
+
|---|---|
|
|
89
|
+
| 0 | Score ≥ threshold |
|
|
90
|
+
| 1 | Runtime error (config missing, file not found) |
|
|
91
|
+
| 2 | Score below threshold |
|
|
92
|
+
|
|
93
|
+
## License
|
|
94
|
+
|
|
95
|
+
MIT.
|
package/cli.js
ADDED
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
'use strict';
|
|
3
|
+
|
|
4
|
+
// agent-eval CLI.
|
|
5
|
+
//
|
|
6
|
+
// agent-eval # all suites against ./agent-eval.config.json
|
|
7
|
+
// agent-eval --config=path/to/cfg.json
|
|
8
|
+
// agent-eval --static # only the static suite
|
|
9
|
+
// agent-eval --routing # only routing
|
|
10
|
+
// agent-eval --schema # only schema (+ fence audit)
|
|
11
|
+
// agent-eval --spawn # only spawn-fixture
|
|
12
|
+
// agent-eval --threshold=1.0 # auto-on --strict at 1.0
|
|
13
|
+
// agent-eval --strict # promote informational audits to blocking
|
|
14
|
+
// agent-eval --verbose
|
|
15
|
+
// agent-eval --json # also emit JSON failure dump
|
|
16
|
+
// agent-eval --init # scaffold a sample project in cwd
|
|
17
|
+
//
|
|
18
|
+
// Exit 0 if score >= threshold (default 0.85), else exit 2.
|
|
19
|
+
|
|
20
|
+
const fs = require('node:fs');
|
|
21
|
+
const path = require('node:path');
|
|
22
|
+
const {
|
|
23
|
+
loadConfig, loadAgents, loadCases,
|
|
24
|
+
staticSuite, schemaSuite, fenceAudit,
|
|
25
|
+
routingSuite, overlapAudit,
|
|
26
|
+
spawnSuite,
|
|
27
|
+
} = require('./index');
|
|
28
|
+
|
|
29
|
+
/**
 * Parses the CLI argument vector into an options object.
 *
 * @param {string[]} argv - Full `process.argv`-style array; the first two
 *   entries (runtime and script path) are ignored.
 * @returns {{configPath: (string|null), threshold: (number|null),
 *   verbose: boolean, json: boolean, strict: boolean, init: boolean,
 *   help: boolean, suites: string[]}} Parsed options. `suites` defaults to all
 *   four suites when no suite flag is given.
 * @throws {Error} When `--threshold=` carries a value that is not a finite
 *   number. Previously this produced NaN, which is not nullish, so it bypassed
 *   the config default and made every run exit 2.
 */
function parseArgs(argv) {
  const args = argv.slice(2);
  const opts = {
    configPath: null,
    threshold: null,
    verbose: args.includes('--verbose'),
    json: args.includes('--json'),
    strict: args.includes('--strict'),
    init: args.includes('--init'),
    help: false,
    suites: [],
  };
  for (const a of args) {
    if (a.startsWith('--config=')) {
      opts.configPath = a.slice('--config='.length);
    } else if (a.startsWith('--threshold=')) {
      const raw = a.slice('--threshold='.length);
      const value = Number.parseFloat(raw);
      // Fail fast: a NaN threshold can never be satisfied by `score >= threshold`.
      if (!Number.isFinite(value)) {
        throw new Error(`invalid --threshold value: "${raw}" (expected a number such as 0.85)`);
      }
      opts.threshold = value;
    } else if (a === '--static') {
      opts.suites.push('static');
    } else if (a === '--schema') {
      opts.suites.push('schema');
    } else if (a === '--routing') {
      opts.suites.push('routing');
    } else if (a === '--spawn') {
      opts.suites.push('spawn');
    } else if (a === '--all') {
      opts.suites = ['static', 'schema', 'routing', 'spawn'];
    } else if (a === '--help' || a === '-h') {
      opts.help = true;
    }
  }
  // No suite flag means "run everything".
  if (opts.suites.length === 0) opts.suites = ['static', 'schema', 'routing', 'spawn'];
  return opts;
}
|
|
53
|
+
|
|
54
|
+
/**
 * Writes the usage summary to stdout in a single write: banner, usage
 * examples, config lookup note, and exit-code legend, ending with a newline.
 */
function help() {
  const banner = 'agent-eval — static + schema + routing + spawn-fixture eval for *.md subagents';
  const usageLines = [
    ' agent-eval # all suites',
    ' agent-eval --config=path.json # explicit config file',
    ' agent-eval --static # one suite',
    ' agent-eval --threshold=1.0 # auto-strict at 1.0',
    ' agent-eval --init # scaffold a sample project',
  ];
  const footerLines = [
    'Config: looks for ./agent-eval.config.json by default.',
    'Exit codes: 0=score>=threshold, 2=below threshold, 1=runtime error',
  ];
  // The trailing empty entry makes the joined text end with '\n'.
  const lines = [banner, '', 'Usage:', ...usageLines, '', ...footerLines, ''];
  process.stdout.write(lines.join('\n'));
}
|
|
70
|
+
|
|
71
|
+
/**
 * Scaffolds the bundled sample project into `cwd`.
 *
 * Each template under this package's `init/` directory is copied to the same
 * relative path below `cwd`. Files that already exist are left untouched and
 * reported as skipped; parent directories are created as needed.
 *
 * @param {string} [cwd=process.cwd()] - Destination directory.
 */
function runInit(cwd = process.cwd()) {
  const templateRoot = path.join(__dirname, 'init');
  const scaffoldFiles = [
    'agent-eval.config.json',
    'agents/sample-agent.md',
    '_evals/cases.jsonl',
    '_evals/schemas.json',
    '_evals/fixtures/sample-agent.txt',
  ];

  scaffoldFiles.forEach((rel) => {
    const destination = path.join(cwd, rel);
    if (fs.existsSync(destination)) {
      // Never overwrite user files.
      console.log(` exists ${rel} (skipped)`);
      return;
    }
    fs.mkdirSync(path.dirname(destination), { recursive: true });
    fs.copyFileSync(path.join(templateRoot, rel), destination);
    console.log(` created ${rel}`);
  });

  console.log('\nNext: run `agent-eval --threshold=1.0` to verify the sample passes.');
}
|
|
87
|
+
|
|
88
|
+
/**
 * CLI entry point.
 *
 * Parses argv, handles --help / --init, loads the config, agents and routing
 * cases, runs each selected suite while printing a report, then prints the
 * aggregate score and terminates the process: exit 0 when
 * `score >= threshold`, exit 2 otherwise. Errors thrown by the loaders or
 * suites are not caught here.
 */
function main() {
  const opts = parseArgs(process.argv);
  if (opts.help) return help();
  if (opts.init) return runInit();

  const config = loadConfig({ configPath: opts.configPath });
  // Precedence: --threshold flag, then config.defaultThreshold, then 0.85.
  const threshold = opts.threshold ?? config.defaultThreshold ?? 0.85;
  // A threshold of 1.0 or more turns strict mode on even without --strict.
  const strict = opts.strict || threshold >= 1.0;
  const verbose = opts.verbose;

  // __configPath is null when loadConfig fell back to built-in defaults.
  if (config.__configPath) {
    console.log(`config: ${path.relative(process.cwd(), config.__configPath)}`);
  } else {
    console.log('config: (defaults — no agent-eval.config.json found)');
  }

  // Agents and cases are loaded up front, regardless of which suites run.
  const agents = loadAgents(config.agentSourceDir);
  const cases = loadCases(config.casesFile);

  // Running tallies across every suite; the final score is totalPass / totalChecks.
  let totalChecks = 0, totalPass = 0;
  // Failure records collected for the optional --json dump.
  const failures = [];

  if (opts.suites.includes('static')) {
    console.log(`\n=== Static suite (${agents.length} agents) ===`);
    const out = staticSuite(agents, config);
    for (const r of out) {
      // Each result carries a list of checks; every check counts toward the score.
      const passes = r.checks.filter(c => c.pass).length;
      const total = r.checks.length;
      totalChecks += total;
      totalPass += passes;
      const fails = r.checks.filter(c => !c.pass);
      if (fails.length === 0) {
        // Fully passing agents are only listed in verbose mode.
        if (verbose) console.log(` ✓ ${r.agent} (${passes}/${total})`);
      } else {
        console.log(` ✗ ${r.agent} (${passes}/${total}) — ${fails.map(c => c.name + (c.detail ? ':' + c.detail : '')).join(', ')}`);
        failures.push({ suite: 'static', agent: r.agent, fails });
      }
    }
  }

  if (opts.suites.includes('schema')) {
    console.log(`\n=== Schema suite (${agents.length} agents) ===`);
    const out = schemaSuite(agents);
    for (const r of out) {
      // One check per schema-suite result.
      totalChecks++;
      if (r.pass) {
        totalPass++;
        if (verbose) console.log(` ✓ ${r.agent}`);
      } else {
        console.log(` ✗ ${r.agent} — ${r.detail}`);
        failures.push({ suite: 'schema', agent: r.agent, detail: r.detail });
      }
    }
    // Fence audit: always printed when it finds something, but only counted
    // against the score in strict mode.
    const fencers = fenceAudit(agents);
    if (fencers.length) {
      const tag = strict ? 'BLOCKING' : 'informational';
      console.log(`\n=== Fence audit (${tag} — provokes \`\`\`json mimicry) ===`);
      for (const name of fencers) console.log(` ${strict ? '✗' : '⚠'} ${name} uses a \`\`\`json fence in schema example`);
      if (strict) {
        // Adding to totalChecks without totalPass records each hit as a failed check.
        totalChecks += fencers.length;
        for (const name of fencers) failures.push({ suite: 'fence', agent: name });
      }
    }
  }

  if (opts.suites.includes('routing')) {
    console.log(`\n=== Routing suite (${cases.length} cases) ===`);
    const out = routingSuite(agents, cases);
    for (const r of out) {
      // One check per routing result.
      totalChecks++;
      if (r.pass) {
        totalPass++;
        if (verbose) console.log(` ✓ ${r.id}: ${r.got} (margin ${r.margin.toFixed(3)})`);
      } else {
        // A failure is either a wrong top-ranked agent or, when the right
        // agent won, an insufficient margin over the runner-up.
        const reason = r.expected !== r.got
          ? `wrong agent (got ${r.got}, wanted ${r.expected})`
          : `low margin ${r.margin.toFixed(3)} vs ${r.runner_up}`;
        console.log(` ✗ ${r.id} — ${reason}`);
        failures.push({ suite: 'routing', id: r.id, expected: r.expected, got: r.got, margin: r.margin });
      }
    }
    // Overlap audit: the section header is always printed; pairs only count
    // against the score in strict mode.
    const pairs = overlapAudit(agents);
    const overlapTag = strict ? 'BLOCKING' : 'informational';
    console.log(`\n=== Description overlap (${overlapTag} — pairs ≥0.20 Jaccard) ===`);
    if (pairs.length === 0) console.log(' no high-overlap pairs');
    else for (const p of pairs) console.log(` ${strict ? '✗' : '⚠'} ${p.a} ↔ ${p.b} = ${p.score.toFixed(3)}`);
    if (strict && pairs.length) {
      // As with the fence audit, each pair becomes a failed check.
      totalChecks += pairs.length;
      for (const p of pairs) failures.push({ suite: 'overlap', a: p.a, b: p.b, score: p.score });
    }
  }

  if (opts.suites.includes('spawn')) {
    const sp = spawnSuite(config, { strict });
    // The schemas file is loaded via require() here only to count how many
    // agents it declares, for the "covered" ratio in the header.
    console.log(`\n=== Spawn suite (${sp.coveredAgents}/${Object.keys(require(config.schemasFile)).length} agents covered) ===`);
    for (const r of sp.results) {
      if (r.pass) {
        if (verbose) console.log(` ✓ ${r.agent}`);
      } else {
        // Pick the most specific detail available for the failure reason.
        const detail = r.reason === 'schema' ? r.errors.join('; ')
          : r.reason === 'extract-json' ? r.detail
          : r.reason;
        console.log(` ✗ ${r.agent} — ${detail}`);
        failures.push({ suite: 'spawn', agent: r.agent, reason: r.reason, detail });
      }
    }
    // The missing-fixture note is printed only outside strict mode.
    if (sp.noFixture.length && !strict) {
      console.log(` (no fixture: ${sp.noFixture.join(', ')})`);
    }
    // The spawn suite reports its own tallies rather than being counted per result.
    totalChecks += sp.totalChecks;
    totalPass += sp.totalPass;
  }

  // With zero checks the score is 0, so the run fails any positive threshold.
  const score = totalChecks ? totalPass / totalChecks : 0;
  console.log(`\n=== Score: ${totalPass}/${totalChecks} = ${(score * 100).toFixed(1)}% ===`);
  console.log(`Threshold: ${(threshold * 100).toFixed(0)}%`);

  // The JSON dump is emitted only when --json is set AND there is at least one failure.
  if (failures.length && opts.json) {
    console.log('\n' + JSON.stringify({ score, totalPass, totalChecks, failures }, null, 2));
  }

  process.exit(score >= threshold ? 0 : 2);
}
|
|
211
|
+
|
|
212
|
+
if (require.main === module) main();
|
package/index.js
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
// Public API for agent-eval-harness.
|
|
4
|
+
// Library callers (e.g. a thin run.js wrapper in your own repo) can import
|
|
5
|
+
// suites, supply a config, and render the results however they like.
|
|
6
|
+
|
|
7
|
+
const { loadConfig, DEFAULTS, DEFAULT_VALID_TOOLS } = require('./lib/config');
|
|
8
|
+
const { parseAgent, loadAgents, loadCases } = require('./lib/agent-loader');
|
|
9
|
+
const { extractJson } = require('./lib/extract-json');
|
|
10
|
+
const text = require('./lib/text');
|
|
11
|
+
const { staticSuite } = require('./suites/static');
|
|
12
|
+
const { schemaSuite, fenceAudit } = require('./suites/schema');
|
|
13
|
+
const { routingSuite, overlapAudit } = require('./suites/routing');
|
|
14
|
+
const { spawnSuite, validateAgainst, loadSchemas, loadFixture } = require('./suites/spawn');
|
|
15
|
+
|
|
16
|
+
module.exports = {
|
|
17
|
+
loadConfig,
|
|
18
|
+
DEFAULTS,
|
|
19
|
+
DEFAULT_VALID_TOOLS,
|
|
20
|
+
parseAgent,
|
|
21
|
+
loadAgents,
|
|
22
|
+
loadCases,
|
|
23
|
+
extractJson,
|
|
24
|
+
text,
|
|
25
|
+
staticSuite,
|
|
26
|
+
schemaSuite,
|
|
27
|
+
fenceAudit,
|
|
28
|
+
routingSuite,
|
|
29
|
+
overlapAudit,
|
|
30
|
+
spawnSuite,
|
|
31
|
+
validateAgainst,
|
|
32
|
+
loadSchemas,
|
|
33
|
+
loadFixture,
|
|
34
|
+
};
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
{
|
|
2
|
+
"sample-agent": {
|
|
3
|
+
"required": ["result", "widget_count", "by_type", "summary"],
|
|
4
|
+
"types": {
|
|
5
|
+
"result": "string",
|
|
6
|
+
"widget_count": "number",
|
|
7
|
+
"by_type": "object",
|
|
8
|
+
"summary": "string"
|
|
9
|
+
},
|
|
10
|
+
"enums": {
|
|
11
|
+
"result": ["ok", "rejected"]
|
|
12
|
+
}
|
|
13
|
+
}
|
|
14
|
+
}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
{
|
|
2
|
+
"agentSourceDir": "./agents",
|
|
3
|
+
"fixturesDir": "./_evals/fixtures",
|
|
4
|
+
"schemasFile": "./_evals/schemas.json",
|
|
5
|
+
"casesFile": "./_evals/cases.jsonl",
|
|
6
|
+
"validTools": ["Read", "Write", "Edit", "Bash", "Grep", "Glob", "WebSearch", "WebFetch", "NotebookEdit", "Task"],
|
|
7
|
+
"minDescriptionChars": 40,
|
|
8
|
+
"maxTools": 5,
|
|
9
|
+
"defaultThreshold": 0.85
|
|
10
|
+
}
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: sample-agent
|
|
3
|
+
description: Counts widgets in a hypothetical inventory dump and returns a JSON summary. Use when you need a quick, deterministic widget-count report for a small inventory dump (under 1000 lines). Not for live inventory queries (no DB access) and not for forecasting.
|
|
4
|
+
tools: Read, Grep
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
You are sample-agent. Given an inventory dump on stdin or in a file, count widgets by type and return a structured JSON summary.
|
|
8
|
+
|
|
9
|
+
## Scope
|
|
10
|
+
|
|
11
|
+
- IN: counting widgets in a static dump file under 1000 lines
|
|
12
|
+
- OUT: live inventory queries, forecasting, sales prediction, anything requiring writes
|
|
13
|
+
|
|
14
|
+
## Hard rules
|
|
15
|
+
|
|
16
|
+
- Never modify the dump file.
|
|
17
|
+
- Reject dumps over 1000 lines with `result: "rejected", reason: "too large"`.
|
|
18
|
+
|
|
19
|
+
## Return contract
|
|
20
|
+
|
|
21
|
+
Schema fields:
|
|
22
|
+
|
|
23
|
+
- `result` — one of `"ok"`, `"rejected"` (string)
|
|
24
|
+
- `widget_count` — total widgets across all types (number)
|
|
25
|
+
- `by_type` — object mapping widget type to count (object)
|
|
26
|
+
- `summary` — one-sentence human-readable summary (string)
|
|
27
|
+
|
|
28
|
+
Example (single line, no fence):
|
|
29
|
+
|
|
30
|
+
`{"result":"ok","widget_count":42,"by_type":{"red":10,"blue":32},"summary":"42 widgets across 2 types"}`
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
// Loads *.md agents from a directory and parses YAML-frontmatter + body.
|
|
4
|
+
// Agents must have a `---\n...\n---\n` frontmatter block at the top with at
|
|
5
|
+
// minimum a `name:` field. `description:` and `tools:` are commonly required
|
|
6
|
+
// downstream but missing fields are non-fatal here — the static suite reports
|
|
7
|
+
// them as failures.
|
|
8
|
+
|
|
9
|
+
const fs = require('node:fs');
|
|
10
|
+
const path = require('node:path');
|
|
11
|
+
|
|
12
|
+
/**
 * Reads one agent markdown file and splits it into frontmatter fields and body.
 *
 * The file must start with a `---` line, followed by `key: value` lines and a
 * closing `---` line. Both LF and CRLF line endings are accepted, a leading
 * UTF-8 BOM is ignored, and the closing `---` may be the last line of the file
 * (previously all three cases were reported as "no frontmatter").
 *
 * @param {string} filePath - Path to the `.md` agent definition.
 * @returns {{file: string, name: (string|undefined), description: string,
 *   tools: string[], body: string} | {error: string, file: string}}
 *   The parsed agent, or an `{ error, file }` record when no frontmatter block
 *   is found. Missing `description` / `tools` become `''` / `[]`.
 * @throws {Error} Propagates filesystem errors from `fs.readFileSync`.
 */
function parseAgent(filePath) {
  // Strip a BOM so the `^---` anchor still matches files saved by Windows editors.
  const raw = fs.readFileSync(filePath, 'utf8').replace(/^\uFEFF/, '');
  const m = raw.match(/^---\r?\n([\s\S]*?)\r?\n---(?:\r?\n([\s\S]*))?$/);
  if (!m) return { error: 'no frontmatter', file: filePath };

  const fm = {};
  // Split on \r?\n: `.` does not match `\r`, so a stray CR would otherwise
  // stop every `key: value` line from matching.
  for (const line of m[1].split(/\r?\n/)) {
    const kv = line.match(/^(\w+):\s*(.+)$/);
    if (kv) fm[kv[1]] = kv[2].trim();
  }

  return {
    file: filePath,
    name: fm.name,
    description: fm.description || '',
    tools: (fm.tools || '').split(',').map(t => t.trim()).filter(Boolean),
    // The body group is absent when the closing `---` ends the file.
    body: m[2] ?? '',
  };
}
|
|
29
|
+
|
|
30
|
+
/**
 * Parses every `*.md` file directly inside `agentsDir`.
 *
 * Files for which `parseAgent` returns an `error` record (no frontmatter) are
 * left out of the result rather than reported.
 *
 * @param {string} agentsDir - Directory containing agent definitions.
 * @returns {object[]} Parsed agents, in directory-listing order.
 * @throws {Error} When `agentsDir` does not exist.
 */
function loadAgents(agentsDir) {
  const dirExists = fs.existsSync(agentsDir);
  if (!dirExists) {
    throw new Error(`agentSourceDir not found: ${agentsDir}`);
  }

  const parsedAgents = [];
  for (const entry of fs.readdirSync(agentsDir)) {
    if (!entry.endsWith('.md')) continue;
    const agent = parseAgent(path.join(agentsDir, entry));
    if (agent.error) continue;
    parsedAgents.push(agent);
  }
  return parsedAgents;
}
|
|
39
|
+
|
|
40
|
+
/**
 * Loads routing cases from a JSON Lines file (one JSON object per line).
 *
 * Blank and whitespace-only lines are skipped, so CRLF files and files with
 * spacer lines load cleanly (a lone `\r` line used to crash `JSON.parse`).
 *
 * @param {string} [casesFile] - Path to the `.jsonl` file.
 * @returns {object[]} Parsed cases in file order; `[]` when `casesFile` is
 *   falsy or does not exist.
 * @throws {SyntaxError} When a non-blank line is not valid JSON. The message
 *   names the file and 1-based line number; the parser error is in `cause`.
 */
function loadCases(casesFile) {
  if (!casesFile || !fs.existsSync(casesFile)) return [];

  const cases = [];
  const lines = fs.readFileSync(casesFile, 'utf8').split(/\r?\n/);
  lines.forEach((line, idx) => {
    const entry = line.trim();
    if (entry === '') return;
    try {
      cases.push(JSON.parse(entry));
    } catch (err) {
      // Keep the SyntaxError type callers may already catch, but say where it happened.
      throw new SyntaxError(
        `${casesFile}:${idx + 1}: invalid JSON in cases file (${err.message})`,
        { cause: err },
      );
    }
  });
  return cases;
}
|
|
46
|
+
|
|
47
|
+
module.exports = { parseAgent, loadAgents, loadCases };
|
package/lib/config.js
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
// Loads agent-eval.config.json with sensible defaults and env-var overrides.
|
|
4
|
+
//
|
|
5
|
+
// Resolution order (highest precedence wins):
|
|
6
|
+
// 1. CLI --config=<path>
|
|
7
|
+
// 2. process.env.AGENT_EVAL_CONFIG
|
|
8
|
+
// 3. ./agent-eval.config.json in cwd
|
|
9
|
+
// 4. Built-in defaults
|
|
10
|
+
//
|
|
11
|
+
// Paths in the config are resolved relative to the config file's directory
|
|
12
|
+
// (or cwd if no file was loaded), so a config can be checked into a repo and
|
|
13
|
+
// stay valid wherever the eval is run from.
|
|
14
|
+
|
|
15
|
+
const fs = require('node:fs');
|
|
16
|
+
const path = require('node:path');
|
|
17
|
+
|
|
18
|
+
// Tool names accepted by default; used as DEFAULTS.validTools below and
// exported separately so callers can extend the list.
const DEFAULT_VALID_TOOLS = [
  'Read', 'Write', 'Edit', 'Bash', 'Grep', 'Glob',
  'WebSearch', 'WebFetch', 'NotebookEdit', 'Task',
];

// Built-in configuration. loadConfig shallow-merges the loaded JSON over this
// object, so any key present in the config file replaces the default wholesale.
const DEFAULTS = {
  // The four path keys below are resolved to absolute paths by loadConfig.
  agentSourceDir: './agents',
  fixturesDir: './_evals/fixtures',
  schemasFile: './_evals/schemas.json',
  casesFile: './_evals/cases.jsonl',
  // NOTE: this is the same array instance as DEFAULT_VALID_TOOLS, not a copy.
  validTools: DEFAULT_VALID_TOOLS,
  // Regex source strings (not RegExp objects). Presumably compiled by the
  // static suite for its scope-section and trigger-phrase checks — the
  // consumer is not visible in this file; verify there.
  scopeSectionPattern: '## (Hard rules|Scope|When to refuse)',
  triggerPattern: "\\b[Uu]se\\b(?:\\s+(?:this|proactively|always|only|never|right))?\\s+(?:when|whenever|before|after|during|on)\\b",
  // Limits presumably consumed by the static suite's description-length and
  // tools-count checks (see README); verify against suites/static.js.
  minDescriptionChars: 40,
  maxTools: 5,
  // Score threshold the CLI uses when --threshold is not given.
  defaultThreshold: 0.85,
};
|
|
35
|
+
|
|
36
|
+
/**
 * Loads the harness configuration and merges it over the built-in DEFAULTS.
 *
 * Lookup order: the explicit `configPath`, then `process.env.AGENT_EVAL_CONFIG`,
 * then `agent-eval.config.json` in `cwd`. If none exists and nothing was named
 * explicitly, the defaults are used as-is. Path-valued keys are resolved
 * against the config file's directory (or `cwd` when no file was loaded).
 *
 * @param {object} [options]
 * @param {string} [options.configPath] - Config file path, relative to `cwd`.
 * @param {string} [options.cwd=process.cwd()] - Base directory for lookups.
 * @returns {object} Merged config plus `__configPath` (absolute path of the
 *   loaded file, or `null`) and `__baseDir`.
 * @throws {Error} When an explicitly named config file does not exist.
 * @throws {SyntaxError} When the config file is not valid JSON; the message
 *   names the file.
 * @throws {TypeError} When the config file's top-level value is not an object.
 */
function loadConfig({ configPath, cwd = process.cwd() } = {}) {
  const explicit = configPath || process.env.AGENT_EVAL_CONFIG;
  const candidate = path.resolve(cwd, explicit || 'agent-eval.config.json');

  let loaded = {};
  let baseDir = cwd;
  // Set only after a successful read, so existence is checked once.
  let loadedPath = null;
  if (fs.existsSync(candidate)) {
    const rawText = fs.readFileSync(candidate, 'utf8');
    try {
      loaded = JSON.parse(rawText);
    } catch (err) {
      // A bare "Unexpected token" message does not say which file is broken.
      throw new SyntaxError(`invalid JSON in config file ${candidate}: ${err.message}`, { cause: err });
    }
    if (loaded === null || typeof loaded !== 'object' || Array.isArray(loaded)) {
      throw new TypeError(`config file must contain a JSON object: ${candidate}`);
    }
    baseDir = path.dirname(candidate);
    loadedPath = candidate;
  } else if (explicit) {
    throw new Error(`config file not found: ${candidate}`);
  }

  const merged = { ...DEFAULTS, ...loaded };
  // Copy the array so a caller mutating config.validTools cannot alter the
  // shared defaults seen by later loadConfig calls.
  if (Array.isArray(merged.validTools)) merged.validTools = [...merged.validTools];
  for (const key of ['agentSourceDir', 'fixturesDir', 'schemasFile', 'casesFile']) {
    if (merged[key]) merged[key] = path.resolve(baseDir, merged[key]);
  }
  merged.__configPath = loadedPath;
  merged.__baseDir = baseDir;
  return merged;
}
|
|
59
|
+
|
|
60
|
+
module.exports = { loadConfig, DEFAULTS, DEFAULT_VALID_TOOLS };
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
// Extracts the canonical JSON object from a subagent response.
|
|
4
|
+
//
|
|
5
|
+
// Subagents are instructed to emit raw JSON only, but the underlying LLMs
|
|
6
|
+
// sometimes still wrap the JSON in markdown fences or add explanatory prose
|
|
7
|
+
// around it. Callers should run this defensively rather than calling
|
|
8
|
+
// JSON.parse directly.
|
|
9
|
+
//
|
|
10
|
+
// Strategy: find the first top-level balanced {...} block in the response and parse
|
|
11
|
+
// it. We balance braces while ignoring brace-like characters inside strings.
|
|
12
|
+
|
|
13
|
+
/**
 * Extracts and parses the JSON object embedded in a subagent response.
 *
 * Tried in order:
 *   1. the whole trimmed text, when it starts with `{` and ends with `}`;
 *   2. the contents of the first ``` / ```json fenced block;
 *   3. each top-level balanced `{...}` block, left to right, returning the
 *      first one that parses.
 *
 * Step 3 used to give up on the first `{` it saw, so a non-JSON brace in the
 * surrounding prose (e.g. "use {braces}: {...}") made extraction fail even
 * though a valid object followed. Inputs that parsed before return the same
 * value; inputs that still cannot be parsed raise the same error as before.
 *
 * @param {string} text - Raw response text.
 * @returns {*} The parsed JSON value.
 * @throws {TypeError} When `text` is not a string.
 * @throws {Error} When the text has no `{`, or the first candidate is unbalanced
 *   and no later candidate parses.
 * @throws {SyntaxError} When the first candidate is balanced but invalid JSON
 *   and no later candidate parses.
 */
function extractJson(text) {
  if (typeof text !== 'string') {
    throw new TypeError('extractJson expects a string');
  }

  // Fast path: response is already pure JSON.
  const trimmed = text.trim();
  if (trimmed.startsWith('{') && trimmed.endsWith('}')) {
    try { return JSON.parse(trimmed); } catch (_) { /* fall through */ }
  }

  // Strip a single ```json ... ``` fence if present (common LLM habit).
  const fence = trimmed.match(/```(?:json)?\s*\n([\s\S]*?)\n```/i);
  if (fence) {
    try { return JSON.parse(fence[1].trim()); } catch (_) { /* fall through */ }
  }

  // General case: walk the top-level balanced {...} candidates in order.
  let start = text.indexOf('{');
  if (start === -1) throw new Error('extractJson: no `{` in response');

  // Remember only the FIRST failure so the thrown error matches what the
  // single-candidate implementation reported.
  let firstError = null;
  while (start !== -1) {
    const end = findBalancedEnd(text, start);
    if (end === -1) {
      if (firstError === null) firstError = new Error('extractJson: unbalanced braces');
      // This `{` never closes; a later one might still open a valid object.
      start = text.indexOf('{', start + 1);
      continue;
    }
    try {
      return JSON.parse(text.slice(start, end + 1));
    } catch (err) {
      if (firstError === null) firstError = err;
      // Skip past the rejected block instead of descending into it.
      start = text.indexOf('{', end + 1);
    }
  }
  throw firstError;
}

// Returns the index of the `}` that balances the `{` at `start`, ignoring
// braces inside double-quoted strings, or -1 if the block never closes.
function findBalancedEnd(text, start) {
  let depth = 0;
  let inString = false;
  let escape = false;
  for (let i = start; i < text.length; i++) {
    const ch = text[i];
    if (escape) { escape = false; continue; }
    if (ch === '\\' && inString) { escape = true; continue; }
    if (ch === '"') { inString = !inString; continue; }
    if (inString) continue;
    if (ch === '{') {
      depth++;
    } else if (ch === '}') {
      depth--;
      if (depth === 0) return i;
    }
  }
  return -1;
}
|
|
54
|
+
|
|
55
|
+
module.exports = { extractJson };
|
|
56
|
+
|
|
57
|
+
// Self-test, run only when this file is executed directly. Each fixture pairs
// an input string with the object extractJson should return; the process
// exits 0 when every fixture passes and 1 otherwise.
if (require.main === module) {
  const fixtures = [
    ['{"a":1}', { a: 1 }],
    [' {"a":1} ', { a: 1 }],
    ['Here you go:\n```json\n{"a":1}\n```\nDone.', { a: 1 }],
    ['noise\n{"nested":{"b":2}}\nmore noise', { nested: { b: 2 } }],
    ['{"s":"with } brace inside"}', { s: 'with } brace inside' }],
    ['prose {"a":1} trailing', { a: 1 }],
  ];

  let passed = 0;
  fixtures.forEach(([input, want]) => {
    try {
      const got = extractJson(input);
      // Compare by serialized form.
      if (JSON.stringify(got) === JSON.stringify(want)) {
        passed += 1;
      } else {
        console.log('FAIL:', JSON.stringify(input), '→', JSON.stringify(got));
      }
    } catch (e) {
      console.log('FAIL:', JSON.stringify(input), '→', e.message);
    }
  });

  console.log(`extract-json self-test: ${passed}/${fixtures.length}`);
  const allPassed = passed === fixtures.length;
  process.exit(allPassed ? 0 : 1);
}
|
package/lib/text.js
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
// Tokenization, stemming, Jaccard, IDF, weighted-recall — English/ASCII-oriented
|
|
4
|
+
// helpers used by the routing and overlap suites. No filesystem access here;
|
|
5
|
+
// callers pass in plain strings and lists of agents.
|
|
6
|
+
|
|
7
|
+
const DEFAULT_STOPWORDS = new Set([
|
|
8
|
+
'a','an','the','is','are','was','were','be','been','being','do','does','did',
|
|
9
|
+
'have','has','had','this','that','these','those','of','in','on','to','for',
|
|
10
|
+
'with','at','by','from','about','as','it','its','and','or','but','if','then',
|
|
11
|
+
'use','when','i','you','your','my','me','we','us','our','make','need','want',
|
|
12
|
+
'show','give','get','asked','ask','please','can','could','would','should','will',
|
|
13
|
+
'one','any','some','all','no','not','only','also','just','still','very','more',
|
|
14
|
+
]);
|
|
15
|
+
|
|
16
|
+
// Pragmatic English morphology stemmer. Approximates what an LLM classifier
// handles implicitly; it is not a full Porter stemmer. First matching rule wins:
//   ies → y     (length > 5)
//   ing → drop  (length > 5), then undouble a trailing consonant
//   ed  → drop  (length > 4), then undouble a trailing consonant
//   es  → drop "es" after s/x/z/h/o, otherwise drop only the "s" (length > 4)
//   s   → drop  (length > 3), except words ending in "ss"
// The goal is that inflections of one word collapse to the same token, so
// -ing and -ed share one code path and never leave a stem with no vowel.

// Strip an -ing/-ed suffix of `suffixLen` chars from `w` and tidy the stem.
// Returns `w` unchanged when stripping would leave no vowel ("string" is a
// noun, not "str" + "ing"; it must keep matching "strings" → "string").
function stripVerbSuffix(w, suffixLen) {
  const base = w.slice(0, -suffixLen);
  if (!/[aeiouy]/.test(base)) return w;
  const last = base[base.length - 1];
  // Only undouble stems longer than three letters so "add", "egg", "err" survive.
  const isDoubled = base.length > 3 && last === base[base.length - 2];
  // l/s/z are exempt (as in Porter step 1b): "call", "process" and "buzz"
  // legitimately end in a double letter, so "calling" must yield "call".
  if (isDoubled && !'aeiou'.includes(last) && !'lsz'.includes(last)) {
    return base.slice(0, -1);
  }
  return base;
}

// Reduce one lowercase word to its approximate stem. `w` is expected to be a
// single token as produced by tokenize(); the return value is always a string.
function stem(w) {
  if (w.length > 5 && w.endsWith('ies')) return w.slice(0, -3) + 'y';
  if (w.length > 5 && w.endsWith('ing')) return stripVerbSuffix(w, 3);
  if (w.length > 4 && w.endsWith('ed')) return stripVerbSuffix(w, 2);
  if (w.length > 4 && w.endsWith('es')) {
    const prev = w[w.length - 3];
    // "boxes" → "box", "classes" → "class", "heroes" → "hero"; but "routes" → "route".
    if ('sxzh'.includes(prev) || prev === 'o') return w.slice(0, -2);
    return w.slice(0, -1);
  }
  if (w.length > 3 && w.endsWith('s') && !w.endsWith('ss')) return w.slice(0, -1);
  return w;
}
|
|
37
|
+
|
|
38
|
+
// Turn free text into a list of stemmed content tokens.
//   s         - input text; null/undefined/empty is treated as ''.
//   stopwords - Set of words to drop (checked before stemming).
// Text is lowercased, every run of characters outside [a-z0-9 ] becomes one
// space, words of two characters or fewer are dropped, and duplicates are kept.
function tokenize(s, stopwords = DEFAULT_STOPWORDS) {
  const normalized = (s || '').toLowerCase().replace(/[^a-z0-9 ]+/g, ' ');
  const tokens = [];
  for (const word of normalized.split(/\s+/)) {
    if (word.length <= 2 || stopwords.has(word)) continue;
    tokens.push(stem(word));
  }
  return tokens;
}
|
|
46
|
+
|
|
47
|
+
// Jaccard similarity of two token lists, compared as sets:
// |A ∩ B| / |A ∪ B|. Duplicates are ignored; two empty inputs score 0.
function jaccard(a, b) {
  const left = new Set(a);
  const right = new Set(b);
  const shared = [...left].filter(item => right.has(item)).length;
  // Inclusion–exclusion gives the union size without building a third set.
  const union = left.size + right.size - shared;
  if (union === 0) return 0;
  return shared / union;
}
|
|
55
|
+
|
|
56
|
+
// Inverse document frequency over a corpus of agent descriptions. A token that
// shows up in many descriptions is common (low weight); a token that shows up
// in few is discriminating (high weight).
//   agents  - array of objects with a `description` string.
//   returns - Map from stemmed token to ln((N + 1) / (df + 1)) + 1, where N is
//             the number of agents and df the number of descriptions
//             containing the token.
function buildIDF(agents) {
  const total = agents.length;
  const docFreq = new Map();
  agents.forEach(agent => {
    // A Set counts each token at most once per description.
    new Set(tokenize(agent.description)).forEach(token => {
      docFreq.set(token, (docFreq.get(token) || 0) + 1);
    });
  });
  return new Map(
    [...docFreq].map(([token, count]) => [token, Math.log((total + 1) / (count + 1)) + 1]),
  );
}
|
|
71
|
+
|
|
72
|
+
// IDF-weighted recall: the share of the prompt's total token weight that the
// agent description covers. Unlike plain Jaccard, a long description is not
// penalized for containing many tokens the prompt never mentions.
//   promptTokens - tokens from the prompt (duplicates ignored).
//   agentTokens  - tokens from the agent description.
//   idf          - Map of token → weight; tokens missing from it weigh 1.
//   returns      - number in [0, 1]; 0 when the prompt has no weight at all.
function weightedRecall(promptTokens, agentTokens, idf) {
  const wanted = new Set(promptTokens);
  const covered = new Set(agentTokens);
  let matchedWeight = 0;
  let totalWeight = 0;
  wanted.forEach(token => {
    const weight = idf.get(token) ?? 1;
    totalWeight += weight;
    if (covered.has(token)) matchedWeight += weight;
  });
  return totalWeight === 0 ? 0 : matchedWeight / totalWeight;
}
|
|
86
|
+
|
|
87
|
+
module.exports = { stem, tokenize, jaccard, buildIDF, weightedRecall, DEFAULT_STOPWORDS };
|
package/package.json
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "agent-eval-harness",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "Static + schema + routing + spawn-fixture eval harness for *.md subagents (Claude Code, etc.). Catches description bloat, fence-mimicry, low routing margin, and schema regressions before they ship.",
|
|
5
|
+
"main": "index.js",
|
|
6
|
+
"bin": {
|
|
7
|
+
"agent-eval": "./cli.js"
|
|
8
|
+
},
|
|
9
|
+
"files": [
|
|
10
|
+
"index.js",
|
|
11
|
+
"cli.js",
|
|
12
|
+
"lib/",
|
|
13
|
+
"suites/",
|
|
14
|
+
"init/",
|
|
15
|
+
"README.md",
|
|
16
|
+
"CHANGELOG.md",
|
|
17
|
+
"LICENSE"
|
|
18
|
+
],
|
|
19
|
+
"scripts": {
|
|
20
|
+
"test": "node test.js"
|
|
21
|
+
},
|
|
22
|
+
"engines": {
|
|
23
|
+
"node": ">=18"
|
|
24
|
+
},
|
|
25
|
+
"keywords": [
|
|
26
|
+
"agents",
|
|
27
|
+
"subagents",
|
|
28
|
+
"claude-code",
|
|
29
|
+
"claude",
|
|
30
|
+
"anthropic",
|
|
31
|
+
"eval",
|
|
32
|
+
"lint",
|
|
33
|
+
"linter",
|
|
34
|
+
"ai",
|
|
35
|
+
"llm",
|
|
36
|
+
"routing"
|
|
37
|
+
],
|
|
38
|
+
"author": "Yorkis Estevez",
|
|
39
|
+
"license": "MIT",
|
|
40
|
+
"homepage": "https://github.com/yorkisestevez/agent-eval-harness#readme",
|
|
41
|
+
"bugs": {
|
|
42
|
+
"url": "https://github.com/yorkisestevez/agent-eval-harness/issues"
|
|
43
|
+
},
|
|
44
|
+
"repository": {
|
|
45
|
+
"type": "git",
|
|
46
|
+
"url": "git+https://github.com/yorkisestevez/agent-eval-harness.git"
|
|
47
|
+
},
|
|
48
|
+
"publishConfig": {
|
|
49
|
+
"access": "public"
|
|
50
|
+
}
|
|
51
|
+
}
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
// Routing suite: scoring proxy for the LLM-based agent picker.
|
|
4
|
+
// For each case (prompt + expected agent), score every agent's description
|
|
5
|
+
// with IDF-weighted recall against the prompt. Pass = expected agent ranks
|
|
6
|
+
// first AND has strictly positive margin over runner-up (a tie at the top
|
|
7
|
+
// could go either way through the LLM picker, so it's treated as a fail).
|
|
8
|
+
//
|
|
9
|
+
// Plus an overlap audit: any pair of agents whose descriptions exceed
|
|
10
|
+
// Jaccard 0.20 is informational by default, BLOCKING under --strict.
|
|
11
|
+
|
|
12
|
+
const { tokenize, jaccard, buildIDF, weightedRecall } = require('../lib/text');
|
|
13
|
+
|
|
14
|
+
// Score every routing case against every agent description.
//   agents  - array of { name, description }.
//   cases   - array of { id, prompt, expect_agent }.
//   returns - one result per case: { id, expected, got, score, margin, ratio,
//             runner_up, pass }. `pass` requires the expected agent to rank
//             first with a strictly positive margin over the runner-up.
// With an empty agent list every case is reported as a failure (got and
// runner_up undefined, score 0) instead of throwing.
function routingSuite(agents, cases) {
  const agentTokens = new Map(agents.map(a => [a.name, tokenize(a.description)]));
  const idf = buildIDF(agents);
  const results = [];
  for (const c of cases) {
    const promptTokens = tokenize(c.prompt);
    const scored = agents
      .map(a => ({
        agent: a.name,
        score: weightedRecall(promptTokens, agentTokens.get(a.name), idf),
      }))
      .sort((x, y) => y.score - x.score);
    // No agents loaded → no winner; use a zero-score placeholder so the case
    // fails cleanly rather than dereferencing undefined.
    const top = scored[0] || { agent: undefined, score: 0 };
    // A lone agent has no competitor; treat the runner-up score as 0.
    const second = scored[1] || { score: 0 };
    const absMargin = top.score - second.score;
    const relRatio = second.score > 0 ? top.score / second.score : Infinity;
    // A tie at the top could go either way through the LLM picker, so a zero
    // margin is a failure even when the expected agent happens to sort first.
    const pass = top.agent === c.expect_agent && absMargin > 0;
    results.push({
      id: c.id,
      expected: c.expect_agent,
      got: top.agent,
      score: top.score,
      margin: absMargin,
      ratio: relRatio,
      runner_up: second.agent,
      pass,
    });
  }
  return results;
}
|
|
42
|
+
|
|
43
|
+
// Report every pair of agents whose descriptions overlap too much.
//   agents    - array of { name, description }.
//   threshold - minimum Jaccard similarity (inclusive) for a pair to be listed.
//   returns   - array of { a, b, score }, highest score first.
function overlapAudit(agents, threshold = 0.20) {
  // Tokenize each description once, keyed by agent name.
  const tokensByName = new Map();
  for (const agent of agents) {
    tokensByName.set(agent.name, tokenize(agent.description));
  }

  const flagged = [];
  agents.forEach((first, i) => {
    // Only look at agents after `first` so each unordered pair is scored once.
    agents.slice(i + 1).forEach(second => {
      const score = jaccard(tokensByName.get(first.name), tokensByName.get(second.name));
      if (score >= threshold) flagged.push({ a: first.name, b: second.name, score });
    });
  });

  flagged.sort((x, y) => y.score - x.score);
  return flagged;
}
|
|
54
|
+
|
|
55
|
+
module.exports = { routingSuite, overlapAudit };
|
package/suites/schema.js
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
// Schema suite: agents declare a structured-output return contract.
|
|
4
|
+
// - body has a "## Return contract" section OR a "must be JSON" instruction
|
|
5
|
+
// - body declares the JSON shape in one of three forms: ```json fenced block,
|
|
6
|
+
// a single-line backticked example with balanced braces, or a Schema-fields
|
|
7
|
+
// bullet list.
|
|
8
|
+
//
|
|
9
|
+
// Plus a fence audit: agents using ```json INSIDE the Return contract section
|
|
10
|
+
// can provoke fence-mimicry in some LLMs (response wraps the JSON in fences,
|
|
11
|
+
// breaking naive JSON.parse). Informational by default; promoted to BLOCKING
|
|
12
|
+
// under --strict / threshold=1.0.
|
|
13
|
+
|
|
14
|
+
// True when `text` contains an inline code span that sits on a single line,
// starts with "{", ends with "}", and has balanced braces, e.g. `{"ok": true}`.
// Stray `{` ... `}` spans in prose and multi-line matches do not count.
function hasInlineJsonExample(text) {
  for (const [, span] of text.matchAll(/`(\{[^`\n]*\})`/g)) {
    let depth = 0;
    let balanced = true;
    for (const ch of span) {
      if (ch === '{') {
        depth++;
      } else if (ch === '}') {
        depth--;
        if (depth < 0) {
          balanced = false;
          break;
        }
      }
    }
    if (balanced && depth === 0) return true;
  }
  return false;
}

// Check that each agent declares a structured-output return contract.
//   agents  - array of { name, body } where body is the markdown after the
//             frontmatter (a missing body is treated as empty).
//   returns - one { agent, pass, detail } per agent. `detail` is 'ok', or names
//             the first missing piece: the return-contract instruction, or the
//             JSON shape declaration.
function schemaSuite(agents) {
  const results = [];
  for (const a of agents) {
    const body = String(a.body ?? '');
    // Any one of these phrasings counts as "the agent was told to return JSON".
    const hasReturnContract =
      /## Return contract/i.test(body) ||
      /response MUST be (a |the )?JSON/i.test(body) ||
      /Return JSON only:?/i.test(body) ||
      /your final response must be/i.test(body) ||
      /STRICT OUTPUT MODE/i.test(body);
    // The shape may be declared as a ```json fence, a single-line backticked
    // example with balanced braces, or a "Schema fields" list.
    const hasJsonBlock =
      /```json[\s\S]*?```/.test(body) ||
      hasInlineJsonExample(body) ||
      /## Schema fields|Schema fields:/i.test(body);
    results.push({
      agent: a.name,
      pass: hasReturnContract && hasJsonBlock,
      detail: !hasReturnContract ? 'no return-contract section'
        : !hasJsonBlock ? 'no JSON shape declaration'
        : 'ok',
    });
  }
  return results;
}
|
|
37
|
+
|
|
38
|
+
// List agents that use a ```json fence inside their "Return contract" section.
// Such fences can provoke fence-mimicry (the model wraps its reply in fences).
//   agents  - array of { name, body }; a missing body is treated as empty.
//   returns - names of offending agents, in input order.
// The section runs from the "Return contract" heading to the next heading of
// the same or a higher level; deeper sub-headings stay part of the section.
// Fences in later, unrelated sections are not reported.
function fenceAudit(agents) {
  const offenders = [];
  for (const a of agents) {
    const body = String(a.body ?? '');
    const heading = /(#{2,})\s*Return contract/i.exec(body);
    if (!heading) continue;
    const level = heading[1].length;

    const section = [];
    let inFence = false;
    for (const line of body.slice(heading.index + heading[0].length).split('\n')) {
      if (/^\s*```/.test(line)) {
        inFence = !inFence;
      } else if (!inFence) {
        // A "#" line inside a code fence is content (e.g. a shell comment),
        // so headings are only recognised outside fences.
        const next = /^(#+)\s/.exec(line);
        if (next && next[1].length <= level) break;
      }
      section.push(line);
    }

    if (/```json[\s\S]*?```/.test(section.join('\n'))) offenders.push(a.name);
  }
  return offenders;
}
|
|
47
|
+
|
|
48
|
+
module.exports = { schemaSuite, fenceAudit };
|
package/suites/spawn.js
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
// Spawn suite: fixture-driven contract test.
|
|
4
|
+
// For each agent in schemas.json, look for a fixture at fixtures/<name>.txt.
|
|
5
|
+
// Extract JSON, validate every required field, type, and enum against the
|
|
6
|
+
// declared schema. Missing fixture is informational by default; under
|
|
7
|
+
// --strict / threshold=1.0 it counts as a failed check.
|
|
8
|
+
|
|
9
|
+
const fs = require('node:fs');
|
|
10
|
+
const path = require('node:path');
|
|
11
|
+
const { extractJson } = require('../lib/extract-json');
|
|
12
|
+
|
|
13
|
+
// Read the per-agent schema map from `schemasFile`.
// Returns the parsed JSON, or {} when the file does not exist. A file that
// exists but holds invalid JSON makes JSON.parse throw.
function loadSchemas(schemasFile) {
  const present = fs.existsSync(schemasFile);
  return present ? JSON.parse(fs.readFileSync(schemasFile, 'utf8')) : {};
}
|
|
17
|
+
|
|
18
|
+
// Read the recorded output for one agent from <fixturesDir>/<agentName>.txt.
// Returns the file contents as a UTF-8 string, or null when no such file exists.
function loadFixture(fixturesDir, agentName) {
  const fixturePath = path.join(fixturesDir, `${agentName}.txt`);
  return fs.existsSync(fixturePath) ? fs.readFileSync(fixturePath, 'utf8') : null;
}
|
|
23
|
+
|
|
24
|
+
// Name the JSON-level type of a value: like typeof, except that arrays report
// 'array' and null reports 'null' instead of both being 'object'.
function jsType(v) {
  if (v === null) return 'null';
  return Array.isArray(v) ? 'array' : typeof v;
}
|
|
29
|
+
|
|
30
|
+
// Validate a parsed JSON value against one agent schema.
//   obj     - the value to check; expected to be a JSON object.
//   schema  - { required?: string[], types?: {field: typeName},
//               enums?: {field: allowed[]}, nested?: {field: subSchema} }.
//   prefix  - dotted path of `obj` within the root value, used in messages.
//   returns - array of human-readable error strings; empty when valid.
// Type names are those produced by jsType(). A field typed 'string' may also
// be null. Type, enum and nested checks only apply to fields that are present;
// absence is reported solely through `required`.
function validateAgainst(obj, schema, prefix = '') {
  // The `in` operator throws on null and primitives, so report a wrong-typed
  // root as a validation error instead of crashing the caller.
  if (obj === null || typeof obj !== 'object') {
    const where = prefix.replace(/\.$/, '') || '(root)';
    return [`type mismatch: ${where} expected object, got ${jsType(obj)}`];
  }

  // Own properties only: inherited names such as "constructor" or "toString"
  // must not satisfy a required field.
  const has = k => Object.hasOwn(obj, k);
  const errors = [];

  for (const k of schema.required || []) {
    if (!has(k)) errors.push(`missing required field: ${prefix}${k}`);
  }
  for (const [k, expected] of Object.entries(schema.types || {})) {
    if (!has(k)) continue;
    const actual = jsType(obj[k]);
    // null is tolerated where a string is expected (nullable string fields).
    if (actual !== expected && !(expected === 'string' && actual === 'null')) {
      errors.push(`type mismatch: ${prefix}${k} expected ${expected}, got ${actual}`);
    }
  }
  for (const [k, allowed] of Object.entries(schema.enums || {})) {
    if (!has(k)) continue;
    if (!allowed.includes(obj[k])) {
      errors.push(`enum violation: ${prefix}${k}="${obj[k]}" not in [${allowed.join('|')}]`);
    }
  }
  for (const [k, sub] of Object.entries(schema.nested || {})) {
    if (!has(k)) continue;
    // A nested field of the wrong type is left to the `types` check.
    if (jsType(obj[k]) !== 'object') continue;
    errors.push(...validateAgainst(obj[k], sub, `${prefix}${k}.`));
  }
  return errors;
}
|
|
55
|
+
|
|
56
|
+
// Run the fixture-driven contract test for every agent listed in the schemas file.
//   config  - { schemasFile, fixturesDir }.
//   strict  - when true, an agent with no fixture file counts as a failed check.
//   returns - { results, noFixture, totalChecks, totalPass, coveredAgents }.
//             `results` holds one entry per checked agent; `noFixture` lists
//             agents whose fixture file does not exist.
// A fixture file that exists but is empty is a broken fixture, not a missing
// one: it is counted as covered and reported as an 'extract-json' failure.
function spawnSuite(config, { strict = false } = {}) {
  const schemas = loadSchemas(config.schemasFile);
  const targets = Object.keys(schemas);

  const out = { results: [], noFixture: [], totalChecks: 0, totalPass: 0, coveredAgents: 0 };
  for (const name of targets) {
    const schema = schemas[name];
    const fixture = loadFixture(config.fixturesDir, name);
    // loadFixture signals absence with null; '' means the file is there but empty.
    if (fixture === null) {
      out.noFixture.push(name);
      if (strict) {
        out.totalChecks++;
        out.results.push({ agent: name, pass: false, reason: 'no-fixture' });
      }
      continue;
    }
    out.coveredAgents++;
    out.totalChecks++;

    if (fixture.trim() === '') {
      out.results.push({ agent: name, pass: false, reason: 'extract-json', detail: 'fixture is empty' });
      continue;
    }

    let parsed;
    try {
      parsed = extractJson(fixture);
    } catch (e) {
      out.results.push({ agent: name, pass: false, reason: 'extract-json', detail: e.message });
      continue;
    }

    const errors = validateAgainst(parsed, schema);
    if (errors.length === 0) {
      out.totalPass++;
      out.results.push({ agent: name, pass: true });
    } else {
      out.results.push({ agent: name, pass: false, reason: 'schema', errors });
    }
  }
  return out;
}
|
|
93
|
+
|
|
94
|
+
module.exports = { spawnSuite, validateAgainst, loadSchemas, loadFixture };
|
package/suites/static.js
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
// Static suite: per-agent linting of the *.md frontmatter + body.
|
|
4
|
+
// - has-name frontmatter has `name:`
|
|
5
|
+
// - has-description description >= minDescriptionChars
|
|
6
|
+
// - tools<=N tools count between 1 and config.maxTools
|
|
7
|
+
// - tools-valid every tool in config.validTools[]
|
|
8
|
+
// - filename-matches-name basename(file, '.md') === name
|
|
9
|
+
// - description-uses-when description matches config.triggerPattern (a "use when X" condition)
|
|
10
|
+
// - has-scope-discipline body has a Scope/Hard rules/When to refuse section
|
|
11
|
+
// - no-recursion body doesn't itself call subagents
|
|
12
|
+
|
|
13
|
+
const path = require('node:path');
|
|
14
|
+
|
|
15
|
+
// Lint each agent's frontmatter and body.
//   agents  - array of { name, description, tools, file, body }. Missing
//             description/tools/body are treated as empty, so the matching
//             checks fail instead of the suite throwing.
//   config  - { validTools, triggerPattern, scopeSectionPattern,
//               minDescriptionChars, maxTools }; the two patterns are regex
//             sources compiled case-insensitively.
//   returns - one { agent, checks } per agent, where checks is an ordered list
//             of { name, pass, detail? }.
function staticSuite(agents, config) {
  const validTools = new Set(config.validTools);
  const triggerRe = new RegExp(config.triggerPattern, 'i');
  const scopeRe = new RegExp(config.scopeSectionPattern, 'i');
  const minDesc = config.minDescriptionChars;
  const maxTools = config.maxTools;

  const results = [];
  for (const a of agents) {
    const description = a.description ?? '';
    const tools = Array.isArray(a.tools) ? a.tools : [];
    const body = a.body ?? '';

    // Delegation markers: the `subagent_type` argument in any casing, or a call
    // to the Task / Agent tool. The tool names are matched case-sensitively at
    // a word boundary so ordinary prose such as "this agent (read-only)" or
    // "subtask(s)" is not mistaken for a tool call.
    const delegates = /subagent_type/i.test(body) || /\b(?:Task|Agent)\s*\(/.test(body);

    const checks = [];
    checks.push({ name: 'has-name', pass: !!a.name });
    checks.push({ name: 'has-description', pass: description.length >= minDesc });
    checks.push({ name: `tools<=${maxTools}`, pass: tools.length <= maxTools && tools.length >= 1 });
    checks.push({
      name: 'tools-valid',
      pass: tools.every(t => validTools.has(t)),
      detail: tools.filter(t => !validTools.has(t)).join(','),
    });
    checks.push({
      name: 'filename-matches-name',
      pass: path.basename(a.file, '.md') === a.name,
    });
    checks.push({ name: 'description-uses-when', pass: triggerRe.test(description) });
    checks.push({ name: 'has-scope-discipline', pass: scopeRe.test(body) });
    checks.push({ name: 'no-recursion', pass: !delegates });
    results.push({ agent: a.name, checks });
  }
  return results;
}
|
|
44
|
+
|
|
45
|
+
module.exports = { staticSuite };
|