@hone-ai/cli 1.4.0 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/hone-cli.js CHANGED
@@ -1299,6 +1299,138 @@ program
1299
1299
  console.log('Verification complete.');
1300
1300
  });
1301
1301
 
1302
// ── USAGE command (#251) ─────────────────────────────────────────────────────
// `hone usage` — fetches the caller's usage summary from GET /usage/me and
// prints it either verbatim as JSON (--format json) or as a short report.
program
  .command('usage')
  .description('Show current month token usage and budget status')
  .option('--format <fmt>', 'Output format: pretty | json', 'pretty')
  .action(async (opts) => {
    const config = getConfig();
    const client = api(config);

    // Render helpers. Values are coerced with Number() so a numeric field that
    // arrives as a string is formatted instead of throwing; the per-job cost
    // line already coerced this way, the totals did not.
    const count = (value) => (value == null ? 'n/a' : Number(value).toLocaleString());
    const usd = (value) => `$${Number(value).toFixed(2)}`;

    try {
      const { data } = await client.get('/usage/me');

      if (opts.format === 'json') {
        console.log(JSON.stringify(data, null, 2));
        return;
      }

      console.log('');
      console.log('Hone AI — Token Usage');
      console.log('================================');
      console.log(`Org: ${data.org}`);
      console.log(`Month: ${data.current_month}`);
      console.log(`Tokens used: ${count(data.used_tokens)}`);
      console.log(`Cost (est): ${usd(data.used_cost_usd)}`);

      if (data.monthly_budget != null) {
        console.log(`Budget: ${count(data.monthly_budget)} tokens`);
        console.log(`Used: ${data.budget_pct}%`);
        console.log(`Remaining: ${count(data.remaining)}`);
        if (data.exceeded) {
          // NOTE(review): every other field in this payload is snake_case, so
          // the reset timestamp may really be `resets_at` — confirm against the
          // API. Both spellings are accepted, and a missing value no longer
          // throws (which used to surface as "Failed to fetch usage").
          const resetsAt = data.resetsAt ?? data.resets_at;
          const resetDay = typeof resetsAt === 'string' ? resetsAt.split('T')[0] : 'unknown';
          console.log(`Status: EXCEEDED — resets ${resetDay}`);
        } else if (data.budget_pct >= data.budget_alert_pct) {
          console.log(`Status: WARNING — approaching budget (${data.budget_pct}% of ${data.budget_alert_pct}% alert threshold)`);
        } else {
          console.log(`Status: OK`);
        }
      } else {
        console.log(`Budget: unlimited`);
      }

      if (Array.isArray(data.by_job) && data.by_job.length > 0) {
        console.log('');
        console.log('Recent derive jobs:');
        // Only the first ten jobs are listed to keep the report short.
        for (const j of data.by_job.slice(0, 10)) {
          console.log(` ${j.date} | ${j.job_id} | ${count(j.tokens)} tokens | ${usd(j.cost_usd)}`);
        }
      }

      console.log('');
    } catch (e) {
      if (e.response?.status === 401) {
        console.error('Not authenticated. Run: hone init');
      } else {
        console.error(`Failed to fetch usage: ${e.message}`);
      }
      process.exit(1);
    }
  });
1360
+
1361
// ── ADMIN-USAGE command ──────────────────────────────────────────────────────
// `hone admin-usage` — calls GET <api>/admin/usage with the admin key and
// prints platform totals, alerts and a per-org table (or raw JSON).
program
  .command('admin-usage')
  .description('Admin dashboard: cross-org token usage, budgets, alerts, trends')
  .option('--format <fmt>', 'Output format: pretty | json', 'pretty')
  .action(async (opts) => {
    const rc = readRc();
    const adminKey = process.env.HONE_ADMIN_KEY || rc.admin_key;
    const apiUrl = process.env.HONE_API || rc.api || 'https://api.hone.ai';

    if (!adminKey) {
      console.error('Error: Admin key not found.');
      console.error('Set HONE_ADMIN_KEY env var, or add "admin_key" to ~/.honerc');
      process.exit(1);
    }

    // Coercing helpers so string-typed numerics format instead of throwing.
    const count = (value) => (value == null ? 'n/a' : Number(value).toLocaleString());
    const usd = (value) => `$${Number(value).toFixed(2)}`;

    try {
      // Strip trailing slashes so a configured base like "https://host/" does
      // not produce "https://host//admin/usage".
      const baseUrl = String(apiUrl).replace(/\/+$/, '');
      const { data } = await axios.get(`${baseUrl}/admin/usage`, {
        headers: { 'x-admin-key': adminKey, 'User-Agent': `@hone-ai/cli/${pkg.version}` },
        timeout: 15000,
      });

      if (opts.format === 'json') {
        console.log(JSON.stringify(data, null, 2));
        return;
      }

      const totals = data.platform_totals ?? {};
      console.log('');
      console.log('Hone AI — Admin Dashboard');
      console.log('================================');
      console.log(`Month: ${data.current_month}`);
      console.log(`Total orgs: ${totals.total_orgs} (${totals.active_orgs} active)`);
      console.log(`Total tokens: ${count(totals.total_tokens)}`);
      console.log(`Total cost: ${usd(totals.total_cost_usd)}`);
      console.log(`Total calls: ${totals.total_calls}`);

      // A response without alerts/orgs is rendered as "none" rather than
      // crashing on `.length` of undefined.
      const alerts = data.alerts ?? [];
      if (alerts.length > 0) {
        console.log('');
        console.log('Alerts:');
        for (const a of alerts) {
          const icon = a.level === 'critical' ? '!!' : a.level === 'warning' ? ' !' : ' i';
          console.log(` [${icon}] ${a.org}: ${a.message}`);
        }
      }

      const orgs = data.orgs ?? [];
      if (orgs.length > 0) {
        // One layout drives the header, the underline and every data row, so
        // titles and placeholders always line up with the padded values.
        const columns = [
          { title: 'Org', rule: '---', width: 20, left: true },
          { title: 'Tier', rule: '----', width: 10, left: true },
          { title: 'Tokens', rule: '------', width: 12 },
          { title: 'Cost', rule: '----', width: 8 },
          { title: 'Budget%', rule: '-------', width: 8 },
          { title: 'Trend', rule: '-----', width: 6 },
          { title: 'Fails', rule: '-----', width: 5 },
        ];
        const renderRow = (cells) => {
          const padded = cells.map((cell, i) => {
            const { width, left } = columns[i];
            const textCell = String(cell);
            // Left-aligned text columns are truncated to width; numeric
            // columns are right-aligned and never truncated.
            return left ? textCell.padEnd(width).slice(0, width) : textCell.padStart(width);
          });
          return ` ${padded.join(' ')}`;
        };

        console.log('');
        console.log('Per-org usage:');
        console.log(renderRow(columns.map((c) => c.title)));
        console.log(renderRow(columns.map((c) => c.rule)));
        for (const o of orgs) {
          const pct = o.monthly_budget != null ? `${o.budget_pct}%` : 'n/a';
          const trend = o.trend_pct != null ? `${o.trend_pct >= 0 ? '+' : ''}${o.trend_pct}%` : 'n/a';
          console.log(renderRow([
            o.org ?? '',
            o.tier || '',
            count(o.total_tokens),
            usd(o.total_cost_usd),
            pct,
            trend,
            o.failed_jobs,
          ]));
        }
      }

      console.log('');
    } catch (e) {
      const status = e.response?.status;
      // 403 is treated like 401: in both cases the key was rejected.
      if (status === 401 || status === 403) {
        console.error('Invalid admin key. Check HONE_ADMIN_KEY or ~/.honerc admin_key.');
      } else {
        console.error(`Failed to fetch admin dashboard: ${e.message}`);
      }
      process.exit(1);
    }
  });
1433
+
1302
1434
  // ── SYNC command ──────────────────────────────────────────────────────────────
1303
1435
  program
1304
1436
  .command('sync')
@@ -2618,8 +2750,9 @@ program
2618
2750
  }
2619
2751
  const storyId = ps.extractStoryIdFromBranch(branch);
2620
2752
  if (!storyId) {
2621
- console.error(`Could not derive STORY-ID from branch '${branch}'.`);
2622
- process.exit(1);
2753
+ console.log(`No story branch active (current: ${branch}).`);
2754
+ console.log('Switch to a story branch (e.g., feat/HC-001-description) to use hone next.');
2755
+ process.exit(0);
2623
2756
  }
2624
2757
  const storyDir = path.join(pipelineRoot, storyId);
2625
2758
  const metaPath = path.join(storyDir, 'metadata.yml');
@@ -3997,6 +4130,110 @@ program
3997
4130
  }, null, 2));
3998
4131
  });
3999
4132
 
4133
// ── HC-019d: Agent Eval Runner ────────────────────────────────────────────────
// `hone eval` — runs deterministic checks against the agent prompts. Modes are
// evaluated in this order and each one terminates the process:
//   --snapshot   save eval + contract results as the regression baseline
//   --regression compare current results with the saved baseline
//   --contracts  validate inter-agent contracts only
//   (default)    run eval scenarios, optionally filtered
program
  .command('eval')
  .description('Run eval scenarios against agent prompts (deterministic, zero LLM tokens)')
  .option('--agent <name>', 'Run evals for a specific agent only')
  .option('--tag <tag>', 'Filter scenarios by tag (e.g., smoke, regression)')
  .option('--scenario <id>', 'Run a single scenario by ID')
  .option('--format <fmt>', 'Output format: pretty | json', 'pretty')
  .option('--evals-dir <path>', 'Override eval scenarios directory')
  .option('--fail-fast', 'Stop on first failure')
  .option('--contracts', 'Run contract validation between pipeline agents')
  .option('--snapshot', 'Save current eval + contract results as regression baseline')
  .option('--regression', 'Compare current results against saved baseline (detect drift)')
  .action(async (opts) => {
    const path = require('path');
    const fs = require('fs');
    const yaml = require('js-yaml');

    // Load agent prompts from seed-agent-prompts.js
    const seedPath = path.resolve(__dirname, '..', 'scripts', 'seed-agent-prompts.js');
    const { AGENT_PROMPTS } = require(seedPath);
    const evalDir = opts.evalsDir || path.resolve(__dirname, '..', 'evals');

    // Filesystem/YAML adapters injected into the pure eval helpers. Defined
    // once instead of being repeated in every mode.
    const io = {
      readFile: (p) => fs.readFileSync(p, 'utf8'),
      listDir: (p) => fs.readdirSync(p),
      isDir: (p) => fs.statSync(p).isDirectory(),
      parseYaml: (text) => yaml.load(text),
    };

    // Exit with a readable message when the scenarios directory is missing;
    // otherwise readdirSync would reject this async action with a raw ENOENT.
    const requireEvalDir = () => {
      if (!fs.existsSync(evalDir)) {
        console.error(`Eval directory not found: ${evalDir}`);
        process.exit(1);
      }
    };

    // Snapshot mode (HC-019h): save baseline
    if (opts.snapshot) {
      const { loadScenarios, runAllScenarios } = require('./lib/eval-runner');
      const { validateAllContracts } = require('./lib/eval-contracts');
      const { saveBaseline } = require('./lib/eval-regression');

      requireEvalDir();
      const scenarios = loadScenarios({ evalDir, ...io });
      const evalResults = runAllScenarios(scenarios, AGENT_PROMPTS);
      const contractResults = validateAllContracts(AGENT_PROMPTS);
      const { saved } = saveBaseline(evalResults, contractResults,
        (p, c) => fs.writeFileSync(p, c, 'utf8'), evalDir);
      // Report where the file actually went when --evals-dir overrides the
      // default location; the default wording is unchanged.
      const baselineLabel = opts.evalsDir ? path.join(evalDir, '.baseline.json') : 'evals/.baseline.json';
      console.log(`Baseline saved: ${saved} entries → ${baselineLabel}`);
      process.exit(0);
    }

    // Regression mode (HC-019h): compare against baseline
    if (opts.regression) {
      const { loadScenarios, runAllScenarios } = require('./lib/eval-runner');
      const { validateAllContracts } = require('./lib/eval-contracts');
      const { loadBaseline, detectRegressions, formatRegressionResults } = require('./lib/eval-regression');

      // A missing directory also yields no baseline, so no separate
      // existence check is needed in this mode.
      const baseline = loadBaseline(io.readFile, evalDir);
      if (!baseline) {
        console.error('No baseline found. Run: hone eval --snapshot');
        process.exit(1);
      }

      const scenarios = loadScenarios({ evalDir, ...io });
      const evalResults = runAllScenarios(scenarios, AGENT_PROMPTS);
      const contractResults = validateAllContracts(AGENT_PROMPTS);
      const results = detectRegressions(evalResults, contractResults, baseline);
      console.log(formatRegressionResults(results, opts.format));
      process.exit(results.summary.regressions > 0 ? 1 : 0);
    }

    // Contract validation mode (HC-019g)
    if (opts.contracts) {
      const { validateAllContracts, formatContractResults } = require('./lib/eval-contracts');
      const results = validateAllContracts(AGENT_PROMPTS);
      console.log(formatContractResults(results, opts.format));
      process.exit(results.failed > 0 ? 1 : 0);
    }

    // Scenario evaluation mode
    const { loadScenarios, runAllScenarios, formatResults } = require('./lib/eval-runner');

    requireEvalDir();

    const scenarios = loadScenarios({
      evalDir,
      agent: opts.agent,
      tag: opts.tag,
      scenarioId: opts.scenario,
      ...io,
    });

    if (scenarios.length === 0) {
      console.log('No eval scenarios found matching filters.');
      process.exit(0);
    }

    const results = runAllScenarios(scenarios, AGENT_PROMPTS, { failFast: opts.failFast });
    console.log(formatResults(results, opts.format));

    process.exit(results.failed + results.errors > 0 ? 1 : 0);
  });
4236
+
4000
4237
  // ── CLI setup ─────────────────────────────────────────────────────────────────
4001
4238
  program
4002
4239
  .name('hone')
@@ -90,6 +90,10 @@ function checkBindDefault(args) {
90
90
  if (/\bos\.getenv\b|\bprocess\.env\b|\bgetenv\(/.test(content)) continue;
91
91
  // Filter out lines that are clearly comments mentioning the anti-pattern
92
92
  if (/^\s*(#|\/\/|--).*0\.0\.0\.0/.test(content)) continue;
93
+ // Filter out docstrings/multiline strings (Python """, JS template literals, block comments)
94
+ if (/^\s*("""|'''|\/\*|\*)/.test(content)) continue;
95
+ // Filter out lines that describe/document the pattern rather than use it
96
+ if (/override|example|usage|default.*is|should|must|can\s/i.test(content) && !/=\s*['"]0\.0\.0\.0/.test(content)) continue;
93
97
 
94
98
  offenders.push({ file, line: lineNo, content: content.trim().slice(0, 100) });
95
99
  }
@@ -0,0 +1,256 @@
1
+ 'use strict';
2
+ /**
3
+ * eval-contracts.js — HC-019g contract testing between pipeline agents.
4
+ *
5
+ * Validates that each agent's prompt correctly references:
6
+ * 1. The prior step's artifact (input dependency)
7
+ * 2. The metadata.yml gate check (validation contract)
8
+ * 3. The output artifact it produces (output contract)
9
+ *
10
+ * This is a deterministic structural check — zero LLM tokens.
11
+ */
12
+
13
/**
 * Contract table for the pipeline agents, in pipeline order.
 *
 * Every record names the agent, its step label, the artifact and gate it must
 * read (both null for agents with no prior step), the artifact it writes, and
 * the gate / metadata field it reports through (null when it has none).
 * `extraChecks` is present only on agents with agent-specific requirements;
 * each extra check is a literal `text` to look for plus its check id and a
 * human-readable `detail`.
 */
const PIPELINE_CONTRACTS = (() => {
  // Build one record. `extraChecks` is attached only when supplied so that
  // records without agent-specific checks do not carry the key at all.
  const contract = (agent, step, [inputArtifact, inputGate], [outputArtifact, outputGate, metadataField], extraChecks) => {
    const record = { agent, step, inputArtifact, inputGate, outputArtifact, outputGate, metadataField };
    if (extraChecks) record.extraChecks = extraChecks;
    return record;
  };
  const extra = (text, check, detail) => ({ text, check, detail });
  const NO_INPUT = [null, null];

  return [
    // first in pipeline — no prior step
    contract('story-groomer', 0, NO_INPUT, ['step-0-grooming.md', 'step_0', 'step_0.gate_result'], [
      extra('metadata.yml', 'creates_metadata', 'creates metadata.yml'),
      extra('Initialize Pipeline', 'pipeline_init', 'initializes pipeline directory'),
      extra('.github/pipeline/', 'pipeline_dir', 'creates .github/pipeline/ directory'),
    ]),
    contract('implementation-planner', 1,
      ['step-0-grooming.md', 'step_0.gate_result'],
      ['step-1-plan.md', 'step_1', 'step_1.gate_result']),
    contract('unit-test-case-writer', 2,
      ['step-1-plan.md', 'step_1.gate_result'],
      ['step-2-tests.md', 'step_2', 'step_2.gate_result']),
    contract('e2e-qa-planner', 3,
      ['step-2-tests.md', 'step_2.gate_result'],
      ['step-3-e2e-plan.md', 'step_3', 'step_3.gate_result']),
    contract('code-builder', 4,
      ['step-3-e2e-plan.md', 'step_3.gate_result'],
      ['step-4-implementation.md', 'step_4', 'step_4.gate_result']),
    contract('code-reviewer', 5,
      ['step-4-implementation.md', 'step_4.gate_result'],
      ['step-5-review.md', 'step_5', 'step_5.gate_result']),
    // Both 5d and 5e read the step-5 review and have no gate of their own.
    contract('security-agent', '5d',
      ['step-5-review.md', 'step_5.status'],
      ['step-5d-security.md', null, null]),
    contract('performance-agent', '5e',
      ['step-5-review.md', 'step_5.status'],
      ['step-5e-performance.md', null, null]),
    contract('delivery-architect', 'independent', NO_INPUT, ['EXECUTION_PLAN.yml', null, null], [
      extra('Checklist A', 'checklist_a', 'has Checklist A (epic decomposition)'),
      extra('Checklist B', 'checklist_b', 'has Checklist B (architecture validation)'),
      extra('test_strategy', 'test_strategy', 'includes test_strategy in plan'),
    ]),
  ];
})();
110
+
111
/**
 * Check one agent prompt against its pipeline contract using plain substring
 * searches.
 *
 * Checks are emitted in a fixed order, each only when the contract field that
 * triggers it is set: input artifact, input gate, output artifact,
 * metadata.yml mention (triggered by `metadataField`), PIPELINE VALIDATION
 * block (triggered by `inputArtifact`), then any `extraChecks`.
 *
 * @param {string} promptText — agent prompt content
 * @param {object} contract — from PIPELINE_CONTRACTS
 * @returns {{ agent, step, checks: Array<{ check, passed, detail }>, passed: boolean, failures: Array<{ check, passed, detail }> }}
 */
function validateContract(promptText, contract) {
  const { inputArtifact, inputGate, outputArtifact, metadataField, extraChecks } = contract;
  const checks = [];

  // Record a single substring probe with its success / failure wording.
  const probe = (check, needle, whenFound, whenMissing) => {
    const passed = promptText.includes(needle);
    checks.push({ check, passed, detail: passed ? whenFound : whenMissing });
  };

  if (inputArtifact) {
    probe('input_artifact', inputArtifact,
      `references prior step "${inputArtifact}"`,
      `MISSING reference to prior step "${inputArtifact}"`);
  }
  if (inputGate) {
    probe('input_gate', inputGate,
      `validates "${inputGate}" before proceeding`,
      `MISSING validation of "${inputGate}"`);
  }
  if (outputArtifact) {
    probe('output_artifact', outputArtifact,
      `produces "${outputArtifact}"`,
      `MISSING output artifact "${outputArtifact}"`);
  }
  // The metadata check only looks for the file name, not the field itself.
  if (metadataField) {
    probe('metadata_update', 'metadata.yml', `updates metadata.yml`, `MISSING metadata.yml update`);
  }
  // Only agents that consume a prior artifact need the validation block.
  if (inputArtifact) {
    probe('pipeline_validation_block', 'PIPELINE VALIDATION',
      `has PIPELINE VALIDATION block`,
      `MISSING PIPELINE VALIDATION block`);
  }
  if (extraChecks) {
    for (const { text, check, detail } of extraChecks) {
      probe(check, text, detail, `MISSING: ${detail}`);
    }
  }

  const failures = checks.filter((entry) => !entry.passed);
  return {
    agent: contract.agent,
    step: contract.step,
    checks,
    passed: failures.length === 0,
    failures,
  };
}
200
+
201
/**
 * Validate every entry of PIPELINE_CONTRACTS against the supplied prompts.
 *
 * An agent whose prompt is absent (or empty) is reported as failed with a
 * single `agent_exists` failure and no checks.
 *
 * @param {object} agentPrompts — { agentName: promptText }
 * @returns {{ total, passed, failed, results }}
 */
function validateAllContracts(agentPrompts) {
  const results = PIPELINE_CONTRACTS.map((contract) => {
    const { agent, step } = contract;
    const promptText = agentPrompts[agent];
    if (promptText) {
      return validateContract(promptText, contract);
    }
    return {
      agent,
      step,
      checks: [],
      passed: false,
      failures: [{ check: 'agent_exists', passed: false, detail: `agent "${agent}" not found` }],
    };
  });

  const total = results.length;
  const passed = results.filter((entry) => entry.passed).length;
  return { total, passed, failed: total - passed, results };
}
231
+
232
+ /**
233
+ * Format contract results.
234
+ */
235
+ function formatContractResults(results, format = 'pretty') {
236
+ if (format === 'json') return JSON.stringify(results, null, 2);
237
+
238
+ const lines = ['', 'Hone AI — Agent Contract Validation', '====================================', ''];
239
+
240
+ for (const r of results.results) {
241
+ const icon = r.passed ? 'PASS' : 'FAIL';
242
+ lines.push(`[${icon}] Step ${r.step}: ${r.agent} (${r.checks.length} checks)`);
243
+ for (const f of r.failures) {
244
+ lines.push(` x ${f.check}: ${f.detail}`);
245
+ }
246
+ }
247
+
248
+ lines.push('');
249
+ lines.push('----------------------------------');
250
+ lines.push(`Summary: ${results.total} agents | ${results.passed} passed | ${results.failed} failed`);
251
+ lines.push('');
252
+
253
+ return lines.join('\n');
254
+ }
255
+
256
+ module.exports = { PIPELINE_CONTRACTS, validateContract, validateAllContracts, formatContractResults };
@@ -0,0 +1,99 @@
1
+ 'use strict';
2
+ /**
3
+ * eval-graders.js — HC-019d deterministic grading checks for agent eval scenarios.
4
+ *
5
+ * Each grader is a pure function: (text, config) => { passed, detail }
6
+ * Zero LLM tokens — string/regex/structural checks only.
7
+ */
8
+
9
/**
 * Grader: passes when `text` contains `value` as a substring.
 *
 * `value` is coerced to a string, so scalar YAML values such as `value: 42`
 * work in both case modes (previously the case-insensitive path threw on
 * non-strings). A check with no `value` fails with an explanatory detail
 * instead of searching for the literal text "undefined".
 *
 * @param {string} text — the text to grade
 * @param {{ value: *, case_insensitive?: boolean }} config
 * @returns {{ passed: boolean, detail: string }}
 */
function contains(text, { value, case_insensitive = false }) {
  if (value == null) {
    return { passed: false, detail: 'check is missing required "value"' };
  }
  const needle = String(value);
  const found = case_insensitive
    ? text.toLowerCase().includes(needle.toLowerCase())
    : text.includes(needle);
  return { passed: found, detail: found ? `found "${value}"` : `"${value}" NOT FOUND` };
}
15
+
16
/**
 * Grader: passes when `text` does NOT contain `value` as a substring.
 *
 * `value` is coerced to a string so scalar YAML values work in both case
 * modes. A check with no `value` now fails: previously it searched for the
 * literal text "undefined" and therefore passed almost every time, hiding
 * the misconfigured scenario.
 *
 * @param {string} text — the text to grade
 * @param {{ value: *, case_insensitive?: boolean }} config
 * @returns {{ passed: boolean, detail: string }}
 */
function notContains(text, { value, case_insensitive = false }) {
  if (value == null) {
    return { passed: false, detail: 'check is missing required "value"' };
  }
  const needle = String(value);
  const found = case_insensitive
    ? text.toLowerCase().includes(needle.toLowerCase())
    : text.includes(needle);
  return { passed: !found, detail: found ? `"${value}" FOUND (should be absent)` : `"${value}" correctly absent` };
}
22
+
23
/**
 * Grader: passes when `pattern` (compiled with `flags`) matches `text`.
 *
 * A check with no `pattern` fails explicitly. Previously `new RegExp(undefined)`
 * compiled to the empty pattern, which matches any text, so a misconfigured
 * check always passed. An invalid pattern or flag string is reported as a
 * failed check rather than thrown.
 *
 * @param {string} text — the text to grade
 * @param {{ pattern: string, flags?: string }} config
 * @returns {{ passed: boolean, detail: string }}
 */
function regex(text, { pattern, flags = '' }) {
  if (pattern == null) {
    return { passed: false, detail: 'check is missing required "pattern"' };
  }
  try {
    // A fresh RegExp per call, so a stateful `g`/`y` flag cannot leak
    // lastIndex between checks.
    const re = new RegExp(pattern, flags);
    const match = re.test(text);
    return { passed: match, detail: match ? `matched /${pattern}/` : `/${pattern}/ did NOT match` };
  } catch (e) {
    return { passed: false, detail: `invalid regex /${pattern}/: ${e.message}` };
  }
}
32
+
33
/**
 * Grader: passes when `text` has a markdown heading (levels 1–4) whose title
 * starts with `heading`, compared case-insensitively.
 *
 * A check with no `heading` fails explicitly instead of throwing inside
 * escapeRegex. The gap between the `#` marks and the title must be spaces or
 * tabs on the same line; with `\s+` a bare "##" line followed by matching
 * text on the next line was wrongly accepted as a heading.
 *
 * @param {string} text — the text to grade
 * @param {{ heading: string }} config
 * @returns {{ passed: boolean, detail: string }}
 */
function sectionExists(text, { heading }) {
  if (heading == null) {
    return { passed: false, detail: 'check is missing required "heading"' };
  }
  const re = new RegExp(`^#{1,4}[ \\t]+${escapeRegex(String(heading))}`, 'mi');
  const found = re.test(text);
  return { passed: found, detail: found ? `section "${heading}" found` : `section "${heading}" NOT FOUND` };
}
38
+
39
/**
 * Grader: passes when the number of whitespace-separated words in `text` lies
 * within [min, max] (inclusive). Both bounds are optional.
 *
 * @param {string} text — the text to grade
 * @param {{ min?: number, max?: number }} config
 * @returns {{ passed: boolean, detail: string }}
 */
function wordCount(text, { min = 0, max = Infinity }) {
  // Splitting on whitespace runs leaves empty strings at the edges; drop them.
  const words = text.split(/\s+/).filter((word) => Boolean(word));
  const count = words.length;
  const upperLabel = max === Infinity ? '∞' : max;
  const withinRange = count >= min && count <= max;
  return { passed: withinRange, detail: `${count} words (expected ${min}-${upperLabel})` };
}
44
+
45
/**
 * Grader: passes when `text` parses as JSON.
 *
 * @param {string} text — the text to grade
 * @returns {{ passed: boolean, detail: string }}
 */
function jsonValid(text) {
  let parseError = null;
  try {
    JSON.parse(text);
  } catch (e) {
    parseError = e;
  }
  if (parseError === null) {
    return { passed: true, detail: 'valid JSON' };
  }
  return { passed: false, detail: `invalid JSON: ${parseError.message}` };
}
53
+
54
/**
 * Grader: passes when js-yaml can load `text` without throwing.
 *
 * js-yaml is required lazily inside the try block, so any error raised while
 * loading the module is reported the same way as a parse error.
 *
 * @param {string} text — the text to grade
 * @returns {{ passed: boolean, detail: string }}
 */
function yamlValid(text) {
  let outcome;
  try {
    const yaml = require('js-yaml');
    yaml.load(text);
    outcome = { passed: true, detail: 'valid YAML' };
  } catch (e) {
    outcome = { passed: false, detail: `invalid YAML: ${e.message}` };
  }
  return outcome;
}
62
+
63
/**
 * Grader: passes when the number of '\n'-separated lines in `text` lies within
 * [min, max] (inclusive). The count is the number of pieces produced by
 * splitting on '\n', so empty text counts as one line and a trailing newline
 * adds one.
 *
 * @param {string} text — the text to grade
 * @param {{ min?: number, max?: number }} config
 * @returns {{ passed: boolean, detail: string }}
 */
function lineCount(text, { min = 0, max = Infinity }) {
  const pieces = text.split('\n');
  const count = pieces.length;
  const upperLabel = max === Infinity ? '∞' : max;
  const withinRange = count >= min && count <= max;
  return { passed: withinRange, detail: `${count} lines (expected ${min}-${upperLabel})` };
}
68
+
69
+ // ── Dispatch ─────────────────────────────────────────────────────
70
+
71
// Registry used by runCheck: maps a check's `type` string to the grader
// function that implements it.
const GRADERS = {
  // substring presence / absence
  contains: contains,
  not_contains: notContains,
  // pattern and structure
  regex: regex,
  section_exists: sectionExists,
  word_count: wordCount,
  // format validity
  json_valid: jsonValid,
  yaml_valid: yamlValid,
  line_count: lineCount,
};
81
+
82
/**
 * Run a single grading check.
 *
 * The grader is looked up as an OWN property of GRADERS. A plain
 * `GRADERS[type]` lookup also finds inherited members such as `toString` or
 * `constructor`, which would then be invoked as if they were graders. A
 * missing or non-object `check` is reported as an unknown grader type rather
 * than throwing.
 *
 * @param {string} text — the text to grade (prompt content or LLM output)
 * @param {{ type: string, ...config }} check
 * @returns {{ type, passed, detail }}
 */
function runCheck(text, check) {
  const type = check?.type;
  const isKnown = typeof type === 'string' && Object.prototype.hasOwnProperty.call(GRADERS, type);
  if (!isKnown) {
    return { type, passed: false, detail: `unknown grader type "${type}"` };
  }
  const result = GRADERS[type](text, check);
  return { type, ...result };
}
94
+
95
/**
 * Backslash-escape every regular-expression metacharacter in `str` so the
 * result can be embedded in a RegExp source and match `str` literally.
 *
 * @param {string} str
 * @returns {string}
 */
function escapeRegex(str) {
  const metaChars = /[.*+?^${}()|[\]\\]/g;
  return str.replace(metaChars, (ch) => `\\${ch}`);
}
98
+
99
+ module.exports = { runCheck, GRADERS, contains, notContains, regex, sectionExists, wordCount, jsonValid, yamlValid, lineCount };
@@ -0,0 +1,202 @@
1
+ 'use strict';
2
+ /**
3
+ * eval-regression.js — HC-019h prompt regression testing.
4
+ *
5
+ * Saves eval results as a baseline snapshot, then detects drift
6
+ * when a prompt change causes a previously-passing eval to fail.
7
+ *
8
+ * Baseline file: evals/.baseline.json
9
+ * Pure helper with injected I/O.
10
+ */
11
+
12
+ const BASELINE_FILENAME = '.baseline.json';
13
+
14
/**
 * Persist the current eval and contract results as the regression baseline.
 *
 * The baseline is written as pretty-printed JSON to
 * `<evalDir>/<BASELINE_FILENAME>` through the injected `writeFile`. Scenarios
 * are keyed by id and contracts by agent name; for contracts only the number
 * of checks is stored, not the checks themselves.
 *
 * @param {object} results — from runAllScenarios
 * @param {object} contractResults — from validateAllContracts
 * @param {(path, content) => void} writeFile
 * @param {string} evalDir
 * @returns {{ saved: number }} total number of scenario + contract entries written
 */
function saveBaseline(results, contractResults, writeFile, evalDir) {
  const createdAt = new Date().toISOString();

  const scenarioEntries = {};
  results.scenarios.forEach((scenario) => {
    const { id, agent, result, checks, checks_passed } = scenario;
    scenarioEntries[id] = { agent, result, checks, checks_passed };
  });

  const contractEntries = {};
  contractResults.results.forEach((outcome) => {
    const { agent, step, passed, checks } = outcome;
    contractEntries[agent] = { step, passed, checks: checks.length };
  });

  const baseline = {
    created_at: createdAt,
    eval_scenarios: scenarioEntries,
    contracts: contractEntries,
  };
  writeFile(`${evalDir}/${BASELINE_FILENAME}`, JSON.stringify(baseline, null, 2));

  const saved = Object.keys(scenarioEntries).length + Object.keys(contractEntries).length;
  return { saved };
}
51
+
52
/**
 * Load the regression baseline from `<evalDir>/<BASELINE_FILENAME>`.
 *
 * Returns null when no usable baseline is available: the file cannot be read,
 * is not valid JSON, or does not have the expected structure (an object with
 * `eval_scenarios` and `contracts` objects). Rejecting malformed content here
 * keeps callers from crashing later on a half-valid baseline; the remedy in
 * every case is to re-create it with a snapshot.
 *
 * @param {(path) => string} readFile
 * @param {string} evalDir
 * @returns {object|null}
 */
function loadBaseline(readFile, evalDir) {
  let parsed;
  try {
    parsed = JSON.parse(readFile(`${evalDir}/${BASELINE_FILENAME}`));
  } catch {
    // Deliberate best-effort: an unreadable or corrupt file means "no baseline".
    return null;
  }

  const isRecord = (value) => value !== null && typeof value === 'object' && !Array.isArray(value);
  if (!isRecord(parsed) || !isRecord(parsed.eval_scenarios) || !isRecord(parsed.contracts)) {
    return null;
  }
  return parsed;
}
66
+
67
/**
 * Compare current results against a baseline and classify every entry.
 *
 * - regression:  passed in the baseline, does not pass now
 * - improvement: did not pass in the baseline, passes now
 * - new:         present now, absent from the baseline
 * - removed:     present in the baseline, absent now
 *
 * Baseline lookups use own properties only, so an id such as "constructor" is
 * not mistaken for an existing entry. A baseline missing a section is treated
 * as having no entries in it, and a regressed result without a `failures`
 * array yields an empty failure list instead of throwing.
 *
 * @param {object} currentResults — from runAllScenarios
 * @param {object} currentContracts — from validateAllContracts
 * @param {object} baseline — from loadBaseline
 * @returns {{ regressions: Array, improvements: Array, new_scenarios: Array, removed: Array, summary: { regressions: number, improvements: number, new: number, removed: number, baseline_date: string } }}
 */
function detectRegressions(currentResults, currentContracts, baseline) {
  const baselineScenarios = baseline.eval_scenarios || {};
  const baselineContracts = baseline.contracts || {};
  const lookup = (table, key) =>
    (Object.prototype.hasOwnProperty.call(table, key) ? table[key] : undefined);
  const failureDetails = (failures) => (failures || []).map((f) => f.detail);

  const regressions = [];
  const improvements = [];
  const newScenarios = [];

  // Check eval scenarios
  for (const s of currentResults.scenarios) {
    const prev = lookup(baselineScenarios, s.id);
    if (!prev) {
      newScenarios.push({ type: 'scenario', id: s.id, agent: s.agent, result: s.result });
      continue;
    }
    if (prev.result === 'pass' && s.result !== 'pass') {
      regressions.push({
        type: 'scenario',
        id: s.id,
        agent: s.agent,
        was: 'pass',
        now: s.result,
        failures: failureDetails(s.failures),
      });
    }
    if (prev.result !== 'pass' && s.result === 'pass') {
      improvements.push({ type: 'scenario', id: s.id, agent: s.agent, was: prev.result, now: 'pass' });
    }
  }

  // Check contracts
  for (const r of currentContracts.results) {
    const prev = lookup(baselineContracts, r.agent);
    if (!prev) {
      newScenarios.push({ type: 'contract', agent: r.agent, passed: r.passed });
      continue;
    }
    if (prev.passed && !r.passed) {
      regressions.push({
        type: 'contract',
        agent: r.agent,
        was: 'pass',
        now: 'fail',
        failures: failureDetails(r.failures),
      });
    }
    if (!prev.passed && r.passed) {
      improvements.push({ type: 'contract', agent: r.agent, was: 'fail', now: 'pass' });
    }
  }

  // Check for removed entries (in baseline but not in current). These are
  // reported but do not count as regressions.
  const removedScenarios = [];
  const currentIds = new Set(currentResults.scenarios.map((s) => s.id));
  for (const [id, prev] of Object.entries(baselineScenarios)) {
    if (!currentIds.has(id)) {
      removedScenarios.push({ type: 'scenario', id, agent: prev.agent, was: prev.result });
    }
  }
  const currentAgents = new Set(currentContracts.results.map((r) => r.agent));
  for (const [agent, prev] of Object.entries(baselineContracts)) {
    if (!currentAgents.has(agent)) {
      removedScenarios.push({ type: 'contract', agent, was: prev.passed ? 'pass' : 'fail' });
    }
  }

  return {
    regressions,
    improvements,
    new_scenarios: newScenarios,
    removed: removedScenarios,
    summary: {
      regressions: regressions.length,
      improvements: improvements.length,
      new: newScenarios.length,
      removed: removedScenarios.length,
      baseline_date: baseline.created_at,
    },
  };
}
151
+
152
/**
 * Render the output of detectRegressions.
 *
 * @param {object} results — regressions / improvements / new_scenarios /
 *   removed lists plus the `summary` counts
 * @param {string} [format='pretty'] — 'json' returns the pretty-printed JSON of
 *   `results`; any other value returns the text report
 * @returns {string}
 */
function formatRegressionResults(results, format = 'pretty') {
  if (format === 'json') return JSON.stringify(results, null, 2);

  const { summary } = results;
  // Scenarios are identified by id, contracts by agent name.
  const labelOf = (entry) => (entry.type === 'scenario' ? entry.id : entry.agent);

  const out = [
    '',
    'Hone AI — Prompt Regression Check',
    '==================================',
    '',
    `Baseline: ${summary.baseline_date}`,
    '',
  ];

  if (results.regressions.length > 0) {
    out.push('REGRESSIONS (previously passing, now failing):');
    results.regressions.forEach((reg) => {
      out.push(` [!!] ${labelOf(reg)}: ${reg.was} → ${reg.now}`);
      for (const detail of reg.failures) {
        out.push(` ${detail}`);
      }
    });
    out.push('');
  }

  if (results.improvements.length > 0) {
    out.push('Improvements (previously failing, now passing):');
    results.improvements.forEach((imp) => {
      out.push(` [ok] ${labelOf(imp)}: ${imp.was} → ${imp.now}`);
    });
    out.push('');
  }

  // New entries are only counted, not listed individually.
  if (results.new_scenarios.length > 0) {
    out.push(`New (${results.new_scenarios.length} scenarios/contracts not in baseline — run --snapshot to update)`);
    out.push('');
  }

  // `removed` is optional on the results object.
  const removed = results.removed;
  if (removed && removed.length > 0) {
    out.push('Removed (in baseline but no longer exist — coverage gap):');
    removed.forEach((gone) => {
      out.push(` [??] ${labelOf(gone)}: was ${gone.was}, now deleted`);
    });
    out.push('');
  }

  const exitCode = summary.regressions > 0 ? 1 : 0;
  out.push('----------------------------------');
  out.push(`Summary: ${summary.regressions} regressions | ${summary.improvements} improvements | ${summary.new} new | ${summary.removed || 0} removed`);
  out.push(`Exit code: ${exitCode}`);
  out.push('');

  return out.join('\n');
}
201
+
202
+ module.exports = { saveBaseline, loadBaseline, detectRegressions, formatRegressionResults, BASELINE_FILENAME };
@@ -0,0 +1,183 @@
1
+ 'use strict';
2
+ /**
3
+ * eval-runner.js — HC-019d eval runner for agent prompt quality.
4
+ *
5
+ * Loads eval scenarios from evals/<agent>/*.eval.yml, runs deterministic
6
+ * grading checks against agent prompt text (zero LLM tokens).
7
+ *
8
+ * Pure helper with injected I/O and parsing (readFile, listDir, isDir, parseYaml).
9
+ */
10
+ const { runCheck } = require('./eval-graders');
11
+
12
/**
 * Load eval scenarios from `<evalDir>/<agentDir>/*.eval.yml`.
 *
 * Directories whose name starts with `_` are skipped. Each parsed scenario is
 * tagged with `evalAgent` (its directory name) and `evalFile` (its file name).
 * A file that cannot be read or parsed, that does not contain a top-level
 * mapping, or that repeats an already-seen scenario ID is returned as a
 * placeholder `{ id, evalAgent, evalFile, loadError }` instead of throwing.
 * Read/parse/shape errors are reported regardless of the `tag` and
 * `scenarioId` filters, because they occur before the scenario can be
 * matched against those filters.
 *
 * @param {object} opts
 * @param {string} opts.evalDir — path to evals/ directory
 * @param {string} [opts.agent] — filter by agent name
 * @param {string} [opts.tag] — filter by tag
 * @param {string} [opts.scenarioId] — run single scenario by ID
 * @param {(path: string) => string} opts.readFile
 * @param {(path: string) => string[]} opts.listDir
 * @param {(path: string) => boolean} opts.isDir
 * @param {(content: string) => any} opts.parseYaml — turns file content into a scenario object
 * @returns {Array<object>} scenarios
 */
function loadScenarios({ evalDir, agent, tag, scenarioId, readFile, listDir, isDir, parseYaml }) {
  const scenarios = [];
  const seenIds = new Set();
  const agentDirs = listDir(evalDir).filter((d) => !d.startsWith('_'));

  for (const dir of agentDirs) {
    if (agent && dir !== agent) continue;
    const dirPath = `${evalDir}/${dir}`;
    if (!isDir(dirPath)) continue;

    const files = listDir(dirPath).filter((f) => f.endsWith('.eval.yml'));
    for (const file of files) {
      try {
        const scenario = parseYaml(readFile(`${dirPath}/${file}`));

        // An empty file, a scalar or a list is not a scenario. Reject it here
        // so it cannot slip through as a scenario with no ID and no checks.
        if (scenario === null || typeof scenario !== 'object' || Array.isArray(scenario)) {
          throw new Error('eval file must contain a YAML mapping (object) at the top level');
        }

        scenario.evalAgent = dir;
        scenario.evalFile = file;

        if (scenarioId && scenario.id !== scenarioId) continue;
        if (tag && !(scenario.tags || []).includes(tag)) continue;

        if (scenario.id && seenIds.has(scenario.id)) {
          scenarios.push({
            id: scenario.id,
            evalAgent: dir,
            evalFile: file,
            loadError: `duplicate scenario ID "${scenario.id}" (first seen in another file)`,
          });
          continue;
        }
        if (scenario.id) seenIds.add(scenario.id);

        scenarios.push(scenario);
      } catch (e) {
        // loadError must be a non-empty string: consumers detect a failed
        // load by its truthiness.
        const message = (e && e.message) || String(e) || 'unknown load error';
        scenarios.push({
          id: file,
          evalAgent: dir,
          evalFile: file,
          loadError: message,
        });
      }
    }
  }

  return scenarios;
}
66
+
67
/**
 * Run grading checks for a single scenario against prompt text.
 *
 * Returns an `error` result (with zero checks run) when the scenario carries
 * a `loadError`, or when `grading.checks` is present but is not a list.
 * A scenario with no checks yields `pass` with `checks: 0`.
 *
 * @param {object} scenario — parsed eval scenario
 * @param {string} promptText — agent prompt content
 * @returns {{ id, agent, name, result, checks, checks_passed, failures }}
 */
function runScenario(scenario, promptText) {
  if (scenario.loadError) {
    return {
      id: scenario.id,
      agent: scenario.evalAgent,
      name: scenario.evalFile,
      result: 'error',
      checks: 0,
      checks_passed: 0,
      failures: [{ type: 'load', passed: false, detail: scenario.loadError }],
    };
  }

  const checks = scenario.grading?.checks || [];

  // A scalar or mapping under grading.checks would otherwise throw at .map()
  // and abort the whole run; report it as an error for this scenario only.
  if (!Array.isArray(checks)) {
    return {
      id: scenario.id,
      agent: scenario.evalAgent,
      name: scenario.name || scenario.id,
      result: 'error',
      checks: 0,
      checks_passed: 0,
      failures: [{ type: 'invalid_checks', passed: false, detail: 'grading.checks must be a list of checks' }],
    };
  }

  const outcomes = checks.map((check) => runCheck(promptText, check));
  const failures = outcomes.filter((outcome) => !outcome.passed);

  return {
    id: scenario.id,
    agent: scenario.evalAgent,
    name: scenario.name || scenario.id,
    result: failures.length === 0 ? 'pass' : 'fail',
    checks: checks.length,
    checks_passed: outcomes.length - failures.length,
    failures,
  };
}
101
+
102
/**
 * Run all scenarios against their agent prompts.
 *
 * A scenario whose agent has no (non-empty) prompt in `agentPrompts` is
 * recorded as an `error` result of type `missing_prompt` without running any
 * checks. Scenarios carrying a `loadError` are passed on to runScenario even
 * when no prompt is available.
 *
 * @param {Array<object>} scenarios
 * @param {object} agentPrompts — { agentName: promptText }
 * @param {object} [opts]
 * @param {boolean} [opts.failFast] — stop after the first result that is not 'pass' (fail or error)
 * @returns {{ total, passed, failed, errors, scenarios: Array }}
 */
function runAllScenarios(scenarios, agentPrompts, opts = {}) {
  const results = [];

  // Own-property lookup only: an agent named e.g. "constructor" must not
  // resolve to a member inherited from Object.prototype.
  const promptFor = (name) =>
    (Object.prototype.hasOwnProperty.call(agentPrompts, name) ? agentPrompts[name] : undefined);

  for (const scenario of scenarios) {
    const agentName = scenario.evalAgent || scenario.agent;
    const promptText = promptFor(agentName);

    let result;
    if (!promptText && !scenario.loadError) {
      result = {
        id: scenario.id,
        agent: agentName,
        name: scenario.name || scenario.id,
        result: 'error',
        checks: 0,
        checks_passed: 0,
        failures: [{ type: 'missing_prompt', passed: false, detail: `agent "${agentName}" not found in AGENT_PROMPTS` }],
      };
    } else {
      result = runScenario(scenario, promptText || '');
    }
    results.push(result);

    // Applied to every result, including missing-prompt errors, so failFast
    // stops on the first non-passing scenario regardless of its cause.
    if (opts.failFast && result.result !== 'pass') break;
  }

  const countOf = (status) => results.filter((r) => r.result === status).length;

  return {
    total: results.length,
    passed: countOf('pass'),
    failed: countOf('fail'),
    errors: countOf('error'),
    scenarios: results,
  };
}
144
+
145
/**
 * Format results for display.
 *
 * 'json' returns the results object serialized with 2-space indentation.
 * Every other format value (including 'ci', which has no dedicated branch
 * here) produces the plain-text report grouped by agent.
 *
 * @param {object} results — from runAllScenarios
 * @param {'pretty'|'json'|'ci'} format
 * @returns {string}
 */
function formatResults(results, format = 'pretty') {
  if (format === 'json') return JSON.stringify(results, null, 2);

  const lines = ['', 'Hone AI — Agent Eval Runner', '================================', ''];

  // Group by agent, keeping first-seen order. A Map is used instead of a
  // plain object so agent names such as "constructor" or "__proto__" cannot
  // collide with inherited Object.prototype members.
  const byAgent = new Map();
  for (const s of results.scenarios) {
    const group = byAgent.get(s.agent);
    if (group) {
      group.push(s);
    } else {
      byAgent.set(s.agent, [s]);
    }
  }

  for (const [agent, scenarios] of byAgent) {
    lines.push(`${agent} (${scenarios.length} scenarios)`);
    for (const s of scenarios) {
      // Anything that is neither pass nor fail is shown as an error.
      const icon = s.result === 'pass' ? 'PASS' : s.result === 'fail' ? 'FAIL' : 'ERR ';
      lines.push(` [${icon}] ${s.id}: ${s.name} (${s.checks_passed}/${s.checks} checks)`);
      for (const f of s.failures) {
        lines.push(` x ${f.type}: ${f.detail}`);
      }
    }
    lines.push('');
  }

  lines.push('----------------------------------');
  lines.push(`Summary: ${results.total} scenarios | ${results.passed} passed | ${results.failed} failed | ${results.errors} errors`);
  // Both failures and errors count towards a non-zero exit code.
  lines.push(`Exit code: ${results.failed + results.errors > 0 ? 1 : 0}`);
  lines.push('');

  return lines.join('\n');
}
182
+
183
+ module.exports = { loadScenarios, runScenario, runAllScenarios, formatResults };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@hone-ai/cli",
3
- "version": "1.4.0",
3
+ "version": "1.6.0",
4
4
  "description": "Hone AI — Enterprise SDLC Pipeline CLI",
5
5
  "main": "hone-cli.js",
6
6
  "bin": {
@@ -14,7 +14,8 @@
14
14
  ],
15
15
  "scripts": {
16
16
  "test": "echo \"No tests yet\" && exit 0",
17
- "link": "npm link"
17
+ "link": "npm link",
18
+ "postinstall": "echo '\\n Hone AI CLI installed successfully.\\n Next: run `hone init --token <YOUR_TOKEN>` to configure.\\n Docs: https://github.com/subbareddyvani/hone-server\\n'"
18
19
  },
19
20
  "dependencies": {
20
21
  "ajv": "^8.20.0",