@hone-ai/cli 1.4.0 → 1.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/hone-cli.js +239 -2
- package/lib/doctor-bind-default.js +4 -0
- package/lib/eval-contracts.js +256 -0
- package/lib/eval-graders.js +99 -0
- package/lib/eval-regression.js +202 -0
- package/lib/eval-runner.js +183 -0
- package/package.json +3 -2
package/hone-cli.js
CHANGED
|
@@ -1299,6 +1299,138 @@ program
|
|
|
1299
1299
|
console.log('Verification complete.');
|
|
1300
1300
|
});
|
|
1301
1301
|
|
|
1302
|
+
// ── USAGE command (#251) ─────────────────────────────────────────────────────
|
|
1303
|
+
program
  .command('usage')
  .description('Show current month token usage and budget status')
  .option('--format <fmt>', 'Output format: pretty | json', 'pretty')
  // Fetches GET /usage/me and prints it either as raw JSON (--format json)
  // or as a human-readable report. Exits with status 1 on any failure.
  .action(async (opts) => {
    const config = getConfig();
    const client = api(config);

    try {
      const { data } = await client.get('/usage/me');

      if (opts.format === 'json') {
        console.log(JSON.stringify(data, null, 2));
        return;
      }

      console.log('');
      console.log('Hone AI — Token Usage');
      console.log('================================');
      console.log(`Org: ${data.org}`);
      console.log(`Month: ${data.current_month}`);
      console.log(`Tokens used: ${data.used_tokens.toLocaleString()}`);
      // Coerce before formatting, exactly as the per-job rows below do, so a
      // cost delivered as a numeric string does not throw on .toFixed().
      console.log(`Cost (est): $${Number(data.used_cost_usd).toFixed(2)}`);

      if (data.monthly_budget != null) {
        console.log(`Budget: ${data.monthly_budget.toLocaleString()} tokens`);
        console.log(`Used: ${data.budget_pct}%`);
        console.log(`Remaining: ${data.remaining.toLocaleString()}`);
        if (data.exceeded) {
          // NOTE(review): every other field read here is snake_case; confirm the
          // API really sends `resetsAt`. Guarded so a missing value does not
          // surface as a misleading "Failed to fetch usage" error.
          const resetDate = typeof data.resetsAt === 'string' ? data.resetsAt.split('T')[0] : 'unknown';
          console.log(`Status: EXCEEDED — resets ${resetDate}`);
        } else if (data.budget_pct >= data.budget_alert_pct) {
          console.log(`Status: WARNING — approaching budget (${data.budget_pct}% of ${data.budget_alert_pct}% alert threshold)`);
        } else {
          console.log(`Status: OK`);
        }
      } else {
        console.log(`Budget: unlimited`);
      }

      if (data.by_job && data.by_job.length > 0) {
        console.log('');
        console.log('Recent derive jobs:');
        // Only the first ten entries are listed.
        for (const j of data.by_job.slice(0, 10)) {
          console.log(` ${j.date} | ${j.job_id} | ${j.tokens.toLocaleString()} tokens | $${Number(j.cost_usd).toFixed(2)}`);
        }
      }

      console.log('');
    } catch (e) {
      if (e.response?.status === 401) {
        console.error('Not authenticated. Run: hone init');
      } else {
        console.error(`Failed to fetch usage: ${e.message}`);
      }
      process.exit(1);
    }
  });
|
|
1360
|
+
|
|
1361
|
+
// ── ADMIN-USAGE command ──────────────────────────────────────────────────────
|
|
1362
|
+
program
  .command('admin-usage')
  .description('Admin dashboard: cross-org token usage, budgets, alerts, trends')
  .option('--format <fmt>', 'Output format: pretty | json', 'pretty')
  // Calls GET <api>/admin/usage with the admin key and prints platform totals,
  // alerts and a per-org table (or raw JSON with --format json).
  .action(async (opts) => {
    const rc = readRc();
    // Environment variables take precedence over the rc file.
    const adminKey = process.env.HONE_ADMIN_KEY || rc.admin_key;
    const apiUrl = process.env.HONE_API || rc.api || 'https://api.hone.ai';

    if (!adminKey) {
      console.error('Error: Admin key not found.');
      console.error('Set HONE_ADMIN_KEY env var, or add "admin_key" to ~/.honerc');
      process.exit(1);
    }

    try {
      const { data } = await axios.get(`${apiUrl}/admin/usage`, {
        headers: { 'x-admin-key': adminKey, 'User-Agent': `@hone-ai/cli/${pkg.version}` },
        timeout: 15000,
      });

      if (opts.format === 'json') {
        console.log(JSON.stringify(data, null, 2));
        return;
      }

      console.log('');
      console.log('Hone AI — Admin Dashboard');
      console.log('================================');
      console.log(`Month: ${data.current_month}`);
      console.log(`Total orgs: ${data.platform_totals.total_orgs} (${data.platform_totals.active_orgs} active)`);
      console.log(`Total tokens: ${data.platform_totals.total_tokens.toLocaleString()}`);
      // Coerced like the per-job cost in the `usage` command, so a numeric
      // string does not throw on .toFixed().
      console.log(`Total cost: $${Number(data.platform_totals.total_cost_usd).toFixed(2)}`);
      console.log(`Total calls: ${data.platform_totals.total_calls}`);

      // A response without these lists renders as "nothing to show" rather than crashing.
      const alerts = Array.isArray(data.alerts) ? data.alerts : [];
      const orgs = Array.isArray(data.orgs) ? data.orgs : [];

      if (alerts.length > 0) {
        console.log('');
        console.log('Alerts:');
        for (const a of alerts) {
          const icon = a.level === 'critical' ? '!!' : a.level === 'warning' ? ' !' : ' i';
          console.log(` [${icon}] ${a.org}: ${a.message}`);
        }
      }

      if (orgs.length > 0) {
        // Header, rule and data rows all go through the same cell widths and
        // joiner, so the columns line up regardless of content.
        const formatRow = (cells) => ` ${cells.join(' ')}`;
        const padCells = ([org, tier, tokens, cost, pct, trend, fails]) => [
          org.padEnd(20).slice(0, 20),
          tier.padEnd(10).slice(0, 10),
          tokens.padStart(12),
          cost.padStart(8),
          pct.padStart(8),
          trend.padStart(6),
          fails.padStart(5),
        ];

        console.log('');
        console.log('Per-org usage:');
        console.log(formatRow(padCells(['Org', 'Tier', 'Tokens', 'Cost', 'Budget%', 'Trend', 'Fails'])));
        console.log(formatRow(padCells(['---', '----', '------', '----', '-------', '-----', '-----'])));
        for (const o of orgs) {
          const pct = o.monthly_budget != null ? `${o.budget_pct}%` : 'n/a';
          // Non-negative trends get an explicit "+" sign; negatives already carry "-".
          const trend = o.trend_pct != null ? `${o.trend_pct >= 0 ? '+' : ''}${o.trend_pct}%` : 'n/a';
          console.log(formatRow(padCells([
            o.org,
            o.tier || '',
            String(o.total_tokens.toLocaleString()),
            `$${Number(o.total_cost_usd).toFixed(2)}`,
            pct,
            trend,
            String(o.failed_jobs),
          ])));
        }
      }

      console.log('');
    } catch (e) {
      if (e.response?.status === 401) {
        console.error('Invalid admin key. Check HONE_ADMIN_KEY or ~/.honerc admin_key.');
      } else {
        console.error(`Failed to fetch admin dashboard: ${e.message}`);
      }
      process.exit(1);
    }
  });
|
|
1433
|
+
|
|
1302
1434
|
// ── SYNC command ──────────────────────────────────────────────────────────────
|
|
1303
1435
|
program
|
|
1304
1436
|
.command('sync')
|
|
@@ -2618,8 +2750,9 @@ program
|
|
|
2618
2750
|
}
|
|
2619
2751
|
const storyId = ps.extractStoryIdFromBranch(branch);
|
|
2620
2752
|
if (!storyId) {
|
|
2621
|
-
console.
|
|
2622
|
-
|
|
2753
|
+
console.log(`No story branch active (current: ${branch}).`);
|
|
2754
|
+
console.log('Switch to a story branch (e.g., feat/HC-001-description) to use hone next.');
|
|
2755
|
+
process.exit(0);
|
|
2623
2756
|
}
|
|
2624
2757
|
const storyDir = path.join(pipelineRoot, storyId);
|
|
2625
2758
|
const metaPath = path.join(storyDir, 'metadata.yml');
|
|
@@ -3997,6 +4130,110 @@ program
|
|
|
3997
4130
|
}, null, 2));
|
|
3998
4131
|
});
|
|
3999
4132
|
|
|
4133
|
+
// ── HC-019d: Agent Eval Runner ────────────────────────────────────────────────
|
|
4134
|
+
program
  .command('eval')
  .description('Run eval scenarios against agent prompts (deterministic, zero LLM tokens)')
  .option('--agent <name>', 'Run evals for a specific agent only')
  .option('--tag <tag>', 'Filter scenarios by tag (e.g., smoke, regression)')
  .option('--scenario <id>', 'Run a single scenario by ID')
  .option('--format <fmt>', 'Output format: pretty | json', 'pretty')
  .option('--evals-dir <path>', 'Override eval scenarios directory')
  .option('--fail-fast', 'Stop on first failure')
  .option('--contracts', 'Run contract validation between pipeline agents')
  .option('--snapshot', 'Save current eval + contract results as regression baseline')
  .option('--regression', 'Compare current results against saved baseline (detect drift)')
  // Modes are checked in this order and each one terminates the process:
  // --snapshot, --regression, --contracts, then plain scenario evaluation.
  .action(async (opts) => {
    const path = require('path');
    const fs = require('fs');
    const yaml = require('js-yaml');

    // Load agent prompts from seed-agent-prompts.js
    const seedPath = path.resolve(__dirname, '..', 'scripts', 'seed-agent-prompts.js');
    let AGENT_PROMPTS;
    try {
      ({ AGENT_PROMPTS } = require(seedPath));
    } catch (e) {
      // Report a missing or broken seed file as a CLI error instead of a stack trace.
      console.error(`Failed to load agent prompts from ${seedPath}: ${e.message}`);
      process.exit(1);
    }
    const evalDir = opts.evalsDir || path.resolve(__dirname, '..', 'evals');

    // File-system and YAML adapters injected into loadScenarios (shared by all modes).
    const scenarioIo = {
      readFile: (p) => fs.readFileSync(p, 'utf8'),
      listDir: (p) => fs.readdirSync(p),
      isDir: (p) => fs.statSync(p).isDirectory(),
      parseYaml: (text) => yaml.load(text),
    };

    // Every mode that reads scenarios needs the directory to exist.
    const requireEvalDir = () => {
      if (!fs.existsSync(evalDir)) {
        console.error(`Eval directory not found: ${evalDir}`);
        process.exit(1);
      }
    };

    // Snapshot mode (HC-019h): save baseline
    if (opts.snapshot) {
      const { loadScenarios, runAllScenarios } = require('./lib/eval-runner');
      const { validateAllContracts } = require('./lib/eval-contracts');
      const { saveBaseline } = require('./lib/eval-regression');

      requireEvalDir();
      // The baseline always covers every scenario; --agent/--tag/--scenario are not applied.
      const scenarios = loadScenarios({ evalDir, ...scenarioIo });
      const evalResults = runAllScenarios(scenarios, AGENT_PROMPTS);
      const contractResults = validateAllContracts(AGENT_PROMPTS);
      const { saved } = saveBaseline(evalResults, contractResults,
        (p, c) => fs.writeFileSync(p, c, 'utf8'), evalDir);
      // Report the directory actually written to, which differs from the
      // default when --evals-dir is given.
      console.log(`Baseline saved: ${saved} entries → ${path.join(evalDir, '.baseline.json')}`);
      process.exit(0);
    }

    // Regression mode (HC-019h): compare against baseline
    if (opts.regression) {
      const { loadScenarios, runAllScenarios } = require('./lib/eval-runner');
      const { validateAllContracts } = require('./lib/eval-contracts');
      const { loadBaseline, detectRegressions, formatRegressionResults } = require('./lib/eval-regression');

      requireEvalDir();
      const baseline = loadBaseline((p) => fs.readFileSync(p, 'utf8'), evalDir);
      if (!baseline) {
        console.error('No baseline found. Run: hone eval --snapshot');
        process.exit(1);
      }

      const scenarios = loadScenarios({ evalDir, ...scenarioIo });
      const evalResults = runAllScenarios(scenarios, AGENT_PROMPTS);
      const contractResults = validateAllContracts(AGENT_PROMPTS);
      const results = detectRegressions(evalResults, contractResults, baseline);
      console.log(formatRegressionResults(results, opts.format));
      // Non-zero exit only when something that used to pass now fails.
      process.exit(results.summary.regressions > 0 ? 1 : 0);
    }

    // Contract validation mode (HC-019g) — reads prompts only, no eval directory needed.
    if (opts.contracts) {
      const { validateAllContracts, formatContractResults } = require('./lib/eval-contracts');
      const results = validateAllContracts(AGENT_PROMPTS);
      console.log(formatContractResults(results, opts.format));
      process.exit(results.failed > 0 ? 1 : 0);
    }

    // Scenario evaluation mode
    const { loadScenarios, runAllScenarios, formatResults } = require('./lib/eval-runner');

    requireEvalDir();

    const scenarios = loadScenarios({
      evalDir,
      agent: opts.agent,
      tag: opts.tag,
      scenarioId: opts.scenario,
      ...scenarioIo,
    });

    if (scenarios.length === 0) {
      console.log('No eval scenarios found matching filters.');
      process.exit(0);
    }

    const results = runAllScenarios(scenarios, AGENT_PROMPTS, { failFast: opts.failFast });
    console.log(formatResults(results, opts.format));

    process.exit(results.failed + results.errors > 0 ? 1 : 0);
  });
|
|
4236
|
+
|
|
4000
4237
|
// ── CLI setup ─────────────────────────────────────────────────────────────────
|
|
4001
4238
|
program
|
|
4002
4239
|
.name('hone')
|
|
@@ -90,6 +90,10 @@ function checkBindDefault(args) {
|
|
|
90
90
|
if (/\bos\.getenv\b|\bprocess\.env\b|\bgetenv\(/.test(content)) continue;
|
|
91
91
|
// Filter out lines that are clearly comments mentioning the anti-pattern
|
|
92
92
|
if (/^\s*(#|\/\/|--).*0\.0\.0\.0/.test(content)) continue;
|
|
93
|
+
// Filter out docstrings/multiline strings (Python """, JS template literals, block comments)
|
|
94
|
+
if (/^\s*("""|'''|\/\*|\*)/.test(content)) continue;
|
|
95
|
+
// Filter out lines that describe/document the pattern rather than use it
|
|
96
|
+
if (/override|example|usage|default.*is|should|must|can\s/i.test(content) && !/=\s*['"]0\.0\.0\.0/.test(content)) continue;
|
|
93
97
|
|
|
94
98
|
offenders.push({ file, line: lineNo, content: content.trim().slice(0, 100) });
|
|
95
99
|
}
|
|
@@ -0,0 +1,256 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
/**
|
|
3
|
+
* eval-contracts.js — HC-019g contract testing between pipeline agents.
|
|
4
|
+
*
|
|
5
|
+
* Validates that each agent's prompt correctly references:
|
|
6
|
+
* 1. The prior step's artifact (input dependency)
|
|
7
|
+
* 2. The metadata.yml gate check (validation contract)
|
|
8
|
+
* 3. The output artifact it produces (output contract)
|
|
9
|
+
*
|
|
10
|
+
* This is a deterministic structural check — zero LLM tokens.
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
/**
 * Contract table for the pipeline agents.
 *
 * Each entry names the artifact and gate an agent's prompt must reference as
 * input, the artifact it must reference as output, and optional agent-specific
 * `extraChecks` (literal strings that must appear in the prompt).
 */

// Contract for a numbered step that consumes the previous step's artifact and
// checks the previous step's gate_result.
const gatedStepContract = (agent, step, inputArtifact, outputArtifact) => ({
  agent,
  step,
  inputArtifact,
  inputGate: `step_${step - 1}.gate_result`,
  outputArtifact,
  outputGate: `step_${step}`,
  metadataField: `step_${step}.gate_result`,
});

// Contract for an agent that reads step-5-review.md, is gated on step_5.status
// and has no output gate or metadata field of its own.
const reviewFollowUpContract = (agent, step, outputArtifact) => ({
  agent,
  step,
  inputArtifact: 'step-5-review.md',
  inputGate: 'step_5.status',
  outputArtifact,
  outputGate: null,
  metadataField: null,
});

const PIPELINE_CONTRACTS = [
  {
    agent: 'story-groomer',
    step: 0,
    // First in the pipeline: nothing to consume, no gate to check.
    inputArtifact: null,
    inputGate: null,
    outputArtifact: 'step-0-grooming.md',
    outputGate: 'step_0',
    metadataField: 'step_0.gate_result',
    extraChecks: [
      { text: 'metadata.yml', check: 'creates_metadata', detail: 'creates metadata.yml' },
      { text: 'Initialize Pipeline', check: 'pipeline_init', detail: 'initializes pipeline directory' },
      { text: '.github/pipeline/', check: 'pipeline_dir', detail: 'creates .github/pipeline/ directory' },
    ],
  },
  gatedStepContract('implementation-planner', 1, 'step-0-grooming.md', 'step-1-plan.md'),
  gatedStepContract('unit-test-case-writer', 2, 'step-1-plan.md', 'step-2-tests.md'),
  gatedStepContract('e2e-qa-planner', 3, 'step-2-tests.md', 'step-3-e2e-plan.md'),
  gatedStepContract('code-builder', 4, 'step-3-e2e-plan.md', 'step-4-implementation.md'),
  gatedStepContract('code-reviewer', 5, 'step-4-implementation.md', 'step-5-review.md'),
  reviewFollowUpContract('security-agent', '5d', 'step-5d-security.md'),
  reviewFollowUpContract('performance-agent', '5e', 'step-5e-performance.md'),
  {
    agent: 'delivery-architect',
    step: 'independent',
    inputArtifact: null,
    inputGate: null,
    outputArtifact: 'EXECUTION_PLAN.yml',
    outputGate: null,
    metadataField: null,
    extraChecks: [
      { text: 'Checklist A', check: 'checklist_a', detail: 'has Checklist A (epic decomposition)' },
      { text: 'Checklist B', check: 'checklist_b', detail: 'has Checklist B (architecture validation)' },
      { text: 'test_strategy', check: 'test_strategy', detail: 'includes test_strategy in plan' },
    ],
  },
];
|
|
110
|
+
|
|
111
|
+
/**
 * Check one agent prompt against its contract using plain substring tests.
 *
 * @param {string} promptText — agent prompt content
 * @param {object} contract — an entry of PIPELINE_CONTRACTS
 * @returns {{ agent, step, checks: Array<{ check, passed, detail }>, passed: boolean, failures: Array }}
 *   `passed` is true only when every emitted check passed; `failures` is the
 *   subset of `checks` that did not.
 */
function validateContract(promptText, contract) {
  const { inputArtifact, inputGate, outputArtifact, metadataField } = contract;

  // Standard checks, in reporting order. `when` decides whether the check is
  // emitted at all; `needle` is the literal text the prompt must contain.
  const standardChecks = [
    {
      when: inputArtifact, // absent for agents with no prior step
      check: 'input_artifact',
      needle: inputArtifact,
      ok: `references prior step "${inputArtifact}"`,
      missing: `MISSING reference to prior step "${inputArtifact}"`,
    },
    {
      when: inputGate,
      check: 'input_gate',
      needle: inputGate,
      ok: `validates "${inputGate}" before proceeding`,
      missing: `MISSING validation of "${inputGate}"`,
    },
    {
      when: outputArtifact,
      check: 'output_artifact',
      needle: outputArtifact,
      ok: `produces "${outputArtifact}"`,
      missing: `MISSING output artifact "${outputArtifact}"`,
    },
    {
      // Only the file name is searched for; the field merely enables the check.
      when: metadataField,
      check: 'metadata_update',
      needle: 'metadata.yml',
      ok: `updates metadata.yml`,
      missing: `MISSING metadata.yml update`,
    },
    {
      // Required of every agent that consumes a prior artifact.
      when: inputArtifact,
      check: 'pipeline_validation_block',
      needle: 'PIPELINE VALIDATION',
      ok: `has PIPELINE VALIDATION block`,
      missing: `MISSING PIPELINE VALIDATION block`,
    },
  ];

  const checks = standardChecks
    .filter((spec) => spec.when)
    .map(({ check, needle, ok, missing }) => {
      const passed = promptText.includes(needle);
      return { check, passed, detail: passed ? ok : missing };
    });

  // Agent-specific extras follow the standard checks.
  if (contract.extraChecks) {
    for (const extra of contract.extraChecks) {
      const passed = promptText.includes(extra.text);
      checks.push({
        check: extra.check,
        passed,
        detail: passed ? extra.detail : `MISSING: ${extra.detail}`,
      });
    }
  }

  const failures = checks.filter((entry) => !entry.passed);
  return {
    agent: contract.agent,
    step: contract.step,
    checks,
    passed: failures.length === 0,
    failures,
  };
}
|
|
200
|
+
|
|
201
|
+
/**
 * Validate every contract in PIPELINE_CONTRACTS against the supplied prompts.
 *
 * An agent whose prompt is missing (or empty) yields a failed result carrying a
 * single `agent_exists` failure and no checks.
 *
 * @param {object} agentPrompts — { agentName: promptText }
 * @returns {{ total, passed, failed, results }}
 */
function validateAllContracts(agentPrompts) {
  const results = PIPELINE_CONTRACTS.map((contract) => {
    const promptText = agentPrompts[contract.agent];
    if (promptText) {
      return validateContract(promptText, contract);
    }
    return {
      agent: contract.agent,
      step: contract.step,
      checks: [],
      passed: false,
      failures: [{ check: 'agent_exists', passed: false, detail: `agent "${contract.agent}" not found` }],
    };
  });

  const passedCount = results.reduce((count, entry) => (entry.passed ? count + 1 : count), 0);

  return {
    total: results.length,
    passed: passedCount,
    failed: results.length - passedCount,
    results,
  };
}
|
|
231
|
+
|
|
232
|
+
/**
 * Render the output of validateAllContracts.
 *
 * @param {{ total, passed, failed, results }} results
 * @param {string} [format='pretty'] — 'json' returns the object pretty-printed as JSON;
 *   any other value returns the text report.
 * @returns {string}
 */
function formatContractResults(results, format = 'pretty') {
  if (format === 'json') {
    return JSON.stringify(results, null, 2);
  }

  // One headline per agent, followed by one line per failed check.
  const agentLines = results.results.flatMap((entry) => [
    `[${entry.passed ? 'PASS' : 'FAIL'}] Step ${entry.step}: ${entry.agent} (${entry.checks.length} checks)`,
    ...entry.failures.map((failure) => ` x ${failure.check}: ${failure.detail}`),
  ]);

  return [
    '',
    'Hone AI — Agent Contract Validation',
    '====================================',
    '',
    ...agentLines,
    '',
    '----------------------------------',
    `Summary: ${results.total} agents | ${results.passed} passed | ${results.failed} failed`,
    '',
  ].join('\n');
}
|
|
255
|
+
|
|
256
|
+
module.exports = { PIPELINE_CONTRACTS, validateContract, validateAllContracts, formatContractResults };
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
/**
|
|
3
|
+
* eval-graders.js — HC-019d deterministic grading checks for agent eval scenarios.
|
|
4
|
+
*
|
|
5
|
+
* Each grader is a pure function: (text, config) => { passed, detail }
|
|
6
|
+
* Zero LLM tokens — string/regex/structural checks only.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
/**
 * Pass when `text` contains `value`.
 *
 * @param {string} text — text being graded
 * @param {{ value: *, case_insensitive?: boolean }} config — `value` is coerced
 *   to a string, so a numeric YAML scalar behaves the same with and without
 *   `case_insensitive`
 * @returns {{ passed: boolean, detail: string }}
 */
function contains(text, { value, case_insensitive = false }) {
  // A check without a value is misconfigured; fail it explicitly instead of
  // silently searching for the literal text "undefined".
  if (value == null) {
    return { passed: false, detail: 'check is missing required "value"' };
  }
  const wanted = String(value);
  const haystack = case_insensitive ? text.toLowerCase() : text;
  const needle = case_insensitive ? wanted.toLowerCase() : wanted;
  const found = haystack.includes(needle);
  return { passed: found, detail: found ? `found "${value}"` : `"${value}" NOT FOUND` };
}
|
|
15
|
+
|
|
16
|
+
/**
 * Pass when `text` does NOT contain `value`.
 *
 * @param {string} text — text being graded
 * @param {{ value: *, case_insensitive?: boolean }} config — `value` is coerced
 *   to a string before searching
 * @returns {{ passed: boolean, detail: string }}
 */
function notContains(text, { value, case_insensitive = false }) {
  // Without a value there is nothing to look for; failing here stops a
  // misconfigured check from passing vacuously.
  if (value == null) {
    return { passed: false, detail: 'check is missing required "value"' };
  }
  const unwanted = String(value);
  const haystack = case_insensitive ? text.toLowerCase() : text;
  const needle = case_insensitive ? unwanted.toLowerCase() : unwanted;
  const found = haystack.includes(needle);
  return { passed: !found, detail: found ? `"${value}" FOUND (should be absent)` : `"${value}" correctly absent` };
}
|
|
22
|
+
|
|
23
|
+
/**
 * Pass when `pattern` (compiled with `flags`) matches somewhere in `text`.
 *
 * @param {string} text — text being graded
 * @param {{ pattern: string, flags?: string }} config
 * @returns {{ passed: boolean, detail: string }} a pattern that is missing or
 *   does not compile yields a failed result rather than an exception
 */
function regex(text, { pattern, flags = '' }) {
  // new RegExp(undefined) compiles to /(?:)/, which matches every input, so a
  // check that forgot its pattern would otherwise always pass.
  if (pattern == null) {
    return { passed: false, detail: 'check is missing required "pattern"' };
  }
  try {
    const re = new RegExp(pattern, flags);
    const match = re.test(text);
    return { passed: match, detail: match ? `matched /${pattern}/` : `/${pattern}/ did NOT match` };
  } catch (e) {
    return { passed: false, detail: `invalid regex /${pattern}/: ${e.message}` };
  }
}
|
|
32
|
+
|
|
33
|
+
/**
 * Pass when some line of `text` starts with one to four `#` characters,
 * whitespace, and then `heading` (compared case-insensitively, as a prefix).
 *
 * @param {string} text — text being graded
 * @param {{ heading: string }} config
 * @returns {{ passed: boolean, detail: string }}
 */
function sectionExists(text, { heading }) {
  // The heading is escaped so it is matched literally.
  const headingPattern = String.raw`^#{1,4}\s+${escapeRegex(heading)}`;
  const isPresent = new RegExp(headingPattern, 'mi').test(text);
  const verdict = isPresent ? 'found' : 'NOT FOUND';
  return { passed: isPresent, detail: `section "${heading}" ${verdict}` };
}
|
|
38
|
+
|
|
39
|
+
/**
 * Pass when the number of whitespace-separated words in `text` lies within
 * [min, max] (both inclusive).
 *
 * @param {string} text — text being graded
 * @param {{ min?: number, max?: number }} config — defaults to 0 and Infinity
 * @returns {{ passed: boolean, detail: string }}
 */
function wordCount(text, { min = 0, max = Infinity }) {
  const words = text.match(/\S+/g);
  const count = words === null ? 0 : words.length;
  const upperLabel = max === Infinity ? '∞' : max;
  return {
    passed: min <= count && count <= max,
    detail: `${count} words (expected ${min}-${upperLabel})`,
  };
}
|
|
44
|
+
|
|
45
|
+
/**
 * Pass when `text` parses as JSON.
 *
 * @param {string} text — text being graded
 * @returns {{ passed: boolean, detail: string }} the parser's message is
 *   included in `detail` on failure
 */
function jsonValid(text) {
  let problem;
  try {
    JSON.parse(text);
  } catch (err) {
    problem = `invalid JSON: ${err.message}`;
  }
  return problem === undefined
    ? { passed: true, detail: 'valid JSON' }
    : { passed: false, detail: problem };
}
|
|
53
|
+
|
|
54
|
+
function yamlValid(text) {
|
|
55
|
+
try {
|
|
56
|
+
require('js-yaml').load(text);
|
|
57
|
+
return { passed: true, detail: 'valid YAML' };
|
|
58
|
+
} catch (e) {
|
|
59
|
+
return { passed: false, detail: `invalid YAML: ${e.message}` };
|
|
60
|
+
}
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
/**
 * Pass when the number of `\n`-separated lines in `text` lies within
 * [min, max] (both inclusive). An empty string counts as one line, and a
 * trailing newline adds a final empty line.
 *
 * @param {string} text — text being graded
 * @param {{ min?: number, max?: number }} config — defaults to 0 and Infinity
 * @returns {{ passed: boolean, detail: string }}
 */
function lineCount(text, { min = 0, max = Infinity }) {
  // Lines = newline characters + 1.
  let count = 1;
  for (let at = text.indexOf('\n'); at !== -1; at = text.indexOf('\n', at + 1)) {
    count += 1;
  }
  const upperLabel = max === Infinity ? '∞' : max;
  return {
    passed: min <= count && count <= max,
    detail: `${count} lines (expected ${min}-${upperLabel})`,
  };
}
|
|
68
|
+
|
|
69
|
+
// ── Dispatch ─────────────────────────────────────────────────────
|
|
70
|
+
|
|
71
|
+
// Dispatch table: the `type` field of a check (snake_case) → grader function.
const GRADERS = {
  contains: contains,
  not_contains: notContains,
  regex: regex,
  section_exists: sectionExists,
  word_count: wordCount,
  line_count: lineCount,
  json_valid: jsonValid,
  yaml_valid: yamlValid,
};
|
|
81
|
+
|
|
82
|
+
/**
 * Run a single grading check.
 *
 * @param {string} text — the text to grade (prompt content or LLM output)
 * @param {{ type: string, ...config }} check — `type` selects the grader; the
 *   whole object is passed to it as configuration
 * @returns {{ type, passed, detail }} an unrecognised `type` yields a failed
 *   result rather than an exception
 */
function runCheck(text, check) {
  const type = check?.type;
  // Own-property lookup only: names such as "toString" or "constructor" would
  // otherwise resolve to Object.prototype members and be called as graders.
  const grader = Object.prototype.hasOwnProperty.call(GRADERS, type) ? GRADERS[type] : undefined;
  if (typeof grader !== 'function') {
    return { type, passed: false, detail: `unknown grader type "${type}"` };
  }
  return { type, ...grader(text, check) };
}
|
|
94
|
+
|
|
95
|
+
/**
 * Backslash-escape the regex metacharacters in `str` so it can be embedded in
 * a pattern and matched literally.
 *
 * @param {string} str
 * @returns {string}
 */
function escapeRegex(str) {
  const metacharacters = /[.*+?^${}()|[\]\\]/g;
  return str.replace(metacharacters, (ch) => `\\${ch}`);
}
|
|
98
|
+
|
|
99
|
+
module.exports = { runCheck, GRADERS, contains, notContains, regex, sectionExists, wordCount, jsonValid, yamlValid, lineCount };
|
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
/**
|
|
3
|
+
* eval-regression.js — HC-019h prompt regression testing.
|
|
4
|
+
*
|
|
5
|
+
* Saves eval results as a baseline snapshot, then detects drift
|
|
6
|
+
* when a prompt change causes a previously-passing eval to fail.
|
|
7
|
+
*
|
|
8
|
+
* Baseline file: evals/.baseline.json
|
|
9
|
+
* Pure helper with injected I/O.
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
const BASELINE_FILENAME = '.baseline.json';
|
|
13
|
+
|
|
14
|
+
/**
 * Write the current eval and contract outcomes to the baseline file inside
 * `evalDir`.
 *
 * Per scenario the baseline keeps agent, result, checks and checks_passed; per
 * contract it keeps step, passed and the number of checks.
 *
 * @param {object} results — from runAllScenarios
 * @param {object} contractResults — from validateAllContracts
 * @param {(path, content) => void} writeFile — injected writer
 * @param {string} evalDir
 * @returns {{ saved: number }} number of scenario entries plus contract entries written
 */
function saveBaseline(results, contractResults, writeFile, evalDir) {
  const scenarioEntries = {};
  results.scenarios.forEach(({ id, agent, result, checks, checks_passed }) => {
    scenarioEntries[id] = { agent, result, checks, checks_passed };
  });

  const contractEntries = {};
  contractResults.results.forEach((entry) => {
    contractEntries[entry.agent] = {
      step: entry.step,
      passed: entry.passed,
      checks: entry.checks.length,
    };
  });

  const baseline = {
    created_at: new Date().toISOString(),
    eval_scenarios: scenarioEntries,
    contracts: contractEntries,
  };
  const serialized = JSON.stringify(baseline, null, 2);
  writeFile(`${evalDir}/${BASELINE_FILENAME}`, serialized);

  const scenarioCount = Object.keys(scenarioEntries).length;
  const contractCount = Object.keys(contractEntries).length;
  return { saved: scenarioCount + contractCount };
}
|
|
51
|
+
|
|
52
|
+
/**
 * Load the baseline file from `evalDir`.
 *
 * @param {(path) => string} readFile — injected reader
 * @param {string} evalDir
 * @returns {object|null} the parsed baseline, or null when the file cannot be
 *   read, is not valid JSON, or does not contain a JSON object
 */
function loadBaseline(readFile, evalDir) {
  let parsed;
  try {
    parsed = JSON.parse(readFile(`${evalDir}/${BASELINE_FILENAME}`));
  } catch {
    // Unreadable or unparsable: callers treat this as "no baseline".
    return null;
  }
  // Valid JSON that is not an object (a number, string, array, ...) cannot be a
  // baseline; returning it would only fail later when its maps are accessed.
  const isRecord = parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed);
  return isRecord ? parsed : null;
}
|
|
66
|
+
|
|
67
|
+
/**
 * Compare current results against a baseline.
 *
 * - regression: passed in the baseline, does not pass now
 * - improvement: did not pass in the baseline, passes now
 * - new: present now, absent from the baseline
 * - removed: present in the baseline, absent now
 *
 * @param {object} currentResults — from runAllScenarios
 * @param {object} currentContracts — from validateAllContracts
 * @param {object} baseline — from loadBaseline
 * @returns {{ regressions: Array, improvements: Array, new_scenarios: Array, removed: Array, summary: object }}
 */
function detectRegressions(currentResults, currentContracts, baseline) {
  // Own-property lookup, so an id such as "constructor" is not mistaken for a
  // baseline entry through Object.prototype.
  const lookup = (table, key) =>
    (Object.prototype.hasOwnProperty.call(table, key) ? table[key] : undefined);
  // A non-passing entry may carry no `failures` list; report it with no details
  // instead of throwing.
  const failureDetails = (entry) => (entry.failures || []).map((f) => f.detail);

  // A baseline lacking either section is treated as having no entries there.
  const baselineScenarios = baseline.eval_scenarios || {};
  const baselineContracts = baseline.contracts || {};

  const regressions = [];
  const improvements = [];
  const newScenarios = [];

  // Check eval scenarios
  for (const s of currentResults.scenarios) {
    const prev = lookup(baselineScenarios, s.id);
    if (!prev) {
      newScenarios.push({ type: 'scenario', id: s.id, agent: s.agent, result: s.result });
      continue;
    }
    if (prev.result === 'pass' && s.result !== 'pass') {
      regressions.push({
        type: 'scenario',
        id: s.id,
        agent: s.agent,
        was: 'pass',
        now: s.result,
        failures: failureDetails(s),
      });
    }
    if (prev.result !== 'pass' && s.result === 'pass') {
      improvements.push({ type: 'scenario', id: s.id, agent: s.agent, was: prev.result, now: 'pass' });
    }
  }

  // Check contracts
  for (const r of currentContracts.results) {
    const prev = lookup(baselineContracts, r.agent);
    if (!prev) {
      newScenarios.push({ type: 'contract', agent: r.agent, passed: r.passed });
      continue;
    }
    if (prev.passed && !r.passed) {
      regressions.push({
        type: 'contract',
        agent: r.agent,
        was: 'pass',
        now: 'fail',
        failures: failureDetails(r),
      });
    }
    if (!prev.passed && r.passed) {
      improvements.push({ type: 'contract', agent: r.agent, was: 'fail', now: 'pass' });
    }
  }

  // Check for removed scenarios (in baseline but not in current)
  const removedScenarios = [];
  const currentIds = new Set(currentResults.scenarios.map((s) => s.id));
  for (const [id, prev] of Object.entries(baselineScenarios)) {
    if (!currentIds.has(id)) {
      removedScenarios.push({ type: 'scenario', id, agent: prev.agent, was: prev.result });
    }
  }
  const currentAgents = new Set(currentContracts.results.map((r) => r.agent));
  for (const [agent, prev] of Object.entries(baselineContracts)) {
    if (!currentAgents.has(agent)) {
      removedScenarios.push({ type: 'contract', agent, was: prev.passed ? 'pass' : 'fail' });
    }
  }

  return {
    regressions,
    improvements,
    new_scenarios: newScenarios,
    removed: removedScenarios,
    summary: {
      regressions: regressions.length,
      improvements: improvements.length,
      new: newScenarios.length,
      removed: removedScenarios.length,
      baseline_date: baseline.created_at,
    },
  };
}
|
|
151
|
+
|
|
152
|
+
/**
 * Render a regression report as text.
 *
 * @param {object} results — the object produced by detectRegressions
 * @param {string} [format] — 'json' returns the results serialized with
 *   2-space indentation; any other value produces the plain-text report
 * @returns {string}
 */
function formatRegressionResults(results, format = 'pretty') {
  if (format === 'json') {
    return JSON.stringify(results, null, 2);
  }

  // Scenario entries are labelled by id, contract entries by agent.
  const label = (entry) => (entry.type === 'scenario' ? entry.id : entry.agent);

  const out = ['', 'Hone AI — Prompt Regression Check', '==================================', ''];
  out.push(`Baseline: ${results.summary.baseline_date}`, '');

  if (results.regressions.length > 0) {
    out.push('REGRESSIONS (previously passing, now failing):');
    for (const entry of results.regressions) {
      out.push(` [!!] ${label(entry)}: ${entry.was} → ${entry.now}`);
      for (const detail of entry.failures) {
        out.push(` ${detail}`);
      }
    }
    out.push('');
  }

  if (results.improvements.length > 0) {
    out.push('Improvements (previously failing, now passing):');
    for (const entry of results.improvements) {
      out.push(` [ok] ${label(entry)}: ${entry.was} → ${entry.now}`);
    }
    out.push('');
  }

  const newCount = results.new_scenarios.length;
  if (newCount > 0) {
    out.push(`New (${newCount} scenarios/contracts not in baseline — run --snapshot to update)`, '');
  }

  // `removed` may be absent from the results object, hence the guard.
  if (results.removed && results.removed.length > 0) {
    out.push('Removed (in baseline but no longer exist — coverage gap):');
    for (const entry of results.removed) {
      out.push(` [??] ${label(entry)}: was ${entry.was}, now deleted`);
    }
    out.push('');
  }

  const { summary } = results;
  const exitCode = summary.regressions > 0 ? 1 : 0;
  out.push('----------------------------------');
  out.push(`Summary: ${summary.regressions} regressions | ${summary.improvements} improvements | ${summary.new} new | ${summary.removed || 0} removed`);
  out.push(`Exit code: ${exitCode}`);
  out.push('');

  return out.join('\n');
}
|
|
201
|
+
|
|
202
|
+
module.exports = { saveBaseline, loadBaseline, detectRegressions, formatRegressionResults, BASELINE_FILENAME };
|
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
/**
|
|
3
|
+
* eval-runner.js — HC-019d eval runner for agent prompt quality.
|
|
4
|
+
*
|
|
5
|
+
* Loads eval scenarios from evals/<agent>/*.eval.yml, runs deterministic
|
|
6
|
+
* grading checks against agent prompt text (zero LLM tokens).
|
|
7
|
+
*
|
|
8
|
+
* Pure helper with injected I/O (readFile, listDir).
|
|
9
|
+
*/
|
|
10
|
+
const { runCheck } = require('./eval-graders');
|
|
11
|
+
|
|
12
|
+
/**
 * Load eval scenarios from the evals directory.
 *
 * Every entry of `evalDir` whose name does not start with `_` and for which
 * `isDir` is true is scanned for `*.eval.yml` files. Each parsed scenario is
 * tagged with `evalAgent` (directory name) and `evalFile` (file name).
 *
 * Files that cannot be read or parsed, that do not contain a YAML mapping,
 * or that reuse an already-seen scenario id are not dropped: they are
 * returned as placeholder entries carrying a `loadError` message. Read,
 * parse and non-mapping errors are recorded before the `scenarioId`/`tag`
 * filters run, so they are reported regardless of those filters.
 *
 * @param {object} opts
 * @param {string} opts.evalDir — path to evals/ directory
 * @param {string} [opts.agent] — filter by agent name
 * @param {string} [opts.tag] — filter by tag
 * @param {string} [opts.scenarioId] — run single scenario by ID
 * @param {(path: string) => string} opts.readFile
 * @param {(path: string) => string[]} opts.listDir
 * @param {(path: string) => boolean} opts.isDir
 * @param {(content: string) => object} opts.parseYaml — injected YAML parser
 * @returns {Array<object>} scenarios
 */
function loadScenarios({ evalDir, agent, tag, scenarioId, readFile, listDir, isDir, parseYaml }) {
  const scenarios = [];
  const seenIds = new Set();
  const agentDirs = listDir(evalDir).filter((d) => !d.startsWith('_'));

  for (const dir of agentDirs) {
    if (agent && dir !== agent) continue;
    const dirPath = `${evalDir}/${dir}`;
    if (!isDir(dirPath)) continue;

    const files = listDir(dirPath).filter((f) => f.endsWith('.eval.yml'));
    for (const file of files) {
      try {
        const content = readFile(`${dirPath}/${file}`);
        const scenario = parseYaml(content);

        // An empty file parses to null and a list/scalar document is not a
        // scenario; report that explicitly instead of relying on whatever
        // TypeError the property assignments below would raise.
        if (scenario === null || typeof scenario !== 'object' || Array.isArray(scenario)) {
          throw new Error('eval file is empty or not a YAML mapping');
        }

        scenario.evalAgent = dir;
        scenario.evalFile = file;

        if (scenarioId && scenario.id !== scenarioId) continue;
        if (tag && !(scenario.tags || []).includes(tag)) continue;

        if (scenario.id && seenIds.has(scenario.id)) {
          scenarios.push({
            id: scenario.id, evalAgent: dir, evalFile: file,
            loadError: `duplicate scenario ID "${scenario.id}" (first seen in another file)`,
          });
          continue;
        }
        if (scenario.id) seenIds.add(scenario.id);

        scenarios.push(scenario);
      } catch (e) {
        // Keep the broken file visible to the runner as an error entry.
        scenarios.push({
          id: file, evalAgent: dir, evalFile: file,
          loadError: e.message,
        });
      }
    }
  }

  return scenarios;
}
|
|
66
|
+
|
|
67
|
+
/**
 * Grade one scenario against an agent's prompt text.
 *
 * A scenario that carries a `loadError` is not graded; it yields an 'error'
 * result whose single failure holds the load error message. Otherwise every
 * entry of `scenario.grading.checks` (none when absent) is passed to
 * runCheck, and the scenario passes only when no check fails.
 *
 * @param {object} scenario — parsed eval scenario
 * @param {string} promptText — agent prompt content
 * @returns {{ id, agent, name, result, checks, checks_passed, failures }}
 */
function runScenario(scenario, promptText) {
  const { id, evalAgent: agent, loadError } = scenario;

  if (loadError) {
    const loadFailure = { type: 'load', passed: false, detail: loadError };
    return {
      id,
      agent,
      name: scenario.evalFile,
      result: 'error',
      checks: 0,
      checks_passed: 0,
      failures: [loadFailure],
    };
  }

  const checkList = scenario.grading?.checks || [];
  const outcomes = checkList.map((check) => runCheck(promptText, check));

  // Partition the outcomes into a passed count and the list of failures.
  const failures = [];
  let passedCount = 0;
  outcomes.forEach((outcome) => {
    if (outcome.passed) {
      passedCount += 1;
    } else {
      failures.push(outcome);
    }
  });

  const verdict = failures.length === 0 ? 'pass' : 'fail';
  return {
    id,
    agent,
    name: scenario.name || id,
    result: verdict,
    checks: checkList.length,
    checks_passed: passedCount,
    failures,
  };
}
|
|
101
|
+
|
|
102
|
+
/**
 * Run all scenarios against their agent prompts.
 *
 * The prompt for a scenario is looked up in `agentPrompts` under
 * `scenario.evalAgent` (falling back to `scenario.agent`). A scenario with
 * no prompt text and no load error is recorded as an 'error' result of type
 * `missing_prompt`; everything else is delegated to runScenario.
 *
 * With `opts.failFast`, processing stops after the first result that is not
 * 'pass' — including missing-prompt errors, which previously slipped past
 * the fail-fast check.
 *
 * @param {Array<object>} scenarios
 * @param {object} agentPrompts — { agentName: promptText }
 * @param {object} [opts]
 * @param {boolean} [opts.failFast] — stop on first failure
 * @returns {{ total, passed, failed, errors, scenarios: Array }}
 */
function runAllScenarios(scenarios, agentPrompts, opts = {}) {
  const results = [];

  for (const scenario of scenarios) {
    const agentName = scenario.evalAgent || scenario.agent;
    const promptText = agentPrompts[agentName];

    let result;
    if (!promptText && !scenario.loadError) {
      result = {
        id: scenario.id,
        agent: agentName,
        name: scenario.name || scenario.id,
        result: 'error',
        checks: 0,
        checks_passed: 0,
        failures: [{ type: 'missing_prompt', passed: false, detail: `agent "${agentName}" not found in AGENT_PROMPTS` }],
      };
    } else {
      // Load-error scenarios may have no prompt; runScenario ignores the text for them.
      result = runScenario(scenario, promptText || '');
    }
    results.push(result);

    // Single exit point so every non-pass outcome honours failFast.
    if (opts.failFast && result.result !== 'pass') break;
  }

  return {
    total: results.length,
    passed: results.filter((r) => r.result === 'pass').length,
    failed: results.filter((r) => r.result === 'fail').length,
    errors: results.filter((r) => r.result === 'error').length,
    scenarios: results,
  };
}
|
|
144
|
+
|
|
145
|
+
/**
 * Format results for display.
 *
 * 'json' returns the results serialized with 2-space indentation. Every
 * other format value (including 'pretty' and 'ci') produces the same
 * plain-text report: scenarios grouped by agent, one line per scenario,
 * followed by its failures, then a summary and the suggested exit code.
 *
 * @param {object} results — from runAllScenarios
 * @param {'pretty'|'json'|'ci'} format
 * @returns {string}
 */
function formatResults(results, format = 'pretty') {
  if (format === 'json') return JSON.stringify(results, null, 2);

  const lines = ['', 'Hone AI — Agent Eval Runner', '================================', ''];

  // Group by agent. A Map is used instead of a plain object so that agent
  // names colliding with Object.prototype members (e.g. "constructor")
  // cannot break the grouping.
  const byAgent = new Map();
  for (const s of results.scenarios) {
    if (!byAgent.has(s.agent)) byAgent.set(s.agent, []);
    byAgent.get(s.agent).push(s);
  }

  for (const [agent, scenarios] of byAgent) {
    lines.push(`${agent} (${scenarios.length} scenarios)`);
    for (const s of scenarios) {
      // 'ERR ' is padded to the same width as PASS/FAIL.
      const icon = s.result === 'pass' ? 'PASS' : s.result === 'fail' ? 'FAIL' : 'ERR ';
      lines.push(` [${icon}] ${s.id}: ${s.name} (${s.checks_passed}/${s.checks} checks)`);
      for (const f of s.failures) {
        lines.push(` x ${f.type}: ${f.detail}`);
      }
    }
    lines.push('');
  }

  lines.push('----------------------------------');
  lines.push(`Summary: ${results.total} scenarios | ${results.passed} passed | ${results.failed} failed | ${results.errors} errors`);
  lines.push(`Exit code: ${results.failed + results.errors > 0 ? 1 : 0}`);
  lines.push('');

  return lines.join('\n');
}
|
|
182
|
+
|
|
183
|
+
module.exports = { loadScenarios, runScenario, runAllScenarios, formatResults };
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@hone-ai/cli",
|
|
3
|
-
"version": "1.4.0",
|
|
3
|
+
"version": "1.6.0",
|
|
4
4
|
"description": "Hone AI — Enterprise SDLC Pipeline CLI",
|
|
5
5
|
"main": "hone-cli.js",
|
|
6
6
|
"bin": {
|
|
@@ -14,7 +14,8 @@
|
|
|
14
14
|
],
|
|
15
15
|
"scripts": {
|
|
16
16
|
"test": "echo \"No tests yet\" && exit 0",
|
|
17
|
-
"link": "npm link"
|
|
17
|
+
"link": "npm link",
|
|
18
|
+
"postinstall": "echo '\\n Hone AI CLI installed successfully.\\n Next: run `hone init --token <YOUR_TOKEN>` to configure.\\n Docs: https://github.com/subbareddyvani/hone-server\\n'"
|
|
18
19
|
},
|
|
19
20
|
"dependencies": {
|
|
20
21
|
"ajv": "^8.20.0",
|