@hone-ai/cli 1.5.0 → 1.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/hone-cli.js +64 -7
- package/lib/doctor-bind-default.js +4 -0
- package/lib/eval-contracts.js +256 -0
- package/lib/eval-llm-judge.js +213 -0
- package/lib/eval-regression.js +202 -0
- package/lib/eval-runner.js +8 -5
- package/lib/eval-three-valued.js +158 -0
- package/package.json +3 -2
package/hone-cli.js
CHANGED
|
@@ -2750,8 +2750,9 @@ program
|
|
|
2750
2750
|
}
|
|
2751
2751
|
const storyId = ps.extractStoryIdFromBranch(branch);
|
|
2752
2752
|
if (!storyId) {
|
|
2753
|
-
console.
|
|
2754
|
-
|
|
2753
|
+
console.log(`No story branch active (current: ${branch}).`);
|
|
2754
|
+
console.log('Switch to a story branch (e.g., feat/HC-001-description) to use hone next.');
|
|
2755
|
+
process.exit(0);
|
|
2755
2756
|
}
|
|
2756
2757
|
const storyDir = path.join(pipelineRoot, storyId);
|
|
2757
2758
|
const metaPath = path.join(storyDir, 'metadata.yml');
|
|
@@ -4139,22 +4140,78 @@ program
|
|
|
4139
4140
|
.option('--format <fmt>', 'Output format: pretty | json', 'pretty')
|
|
4140
4141
|
.option('--evals-dir <path>', 'Override eval scenarios directory')
|
|
4141
4142
|
.option('--fail-fast', 'Stop on first failure')
|
|
4143
|
+
.option('--contracts', 'Run contract validation between pipeline agents')
|
|
4144
|
+
.option('--snapshot', 'Save current eval + contract results as regression baseline')
|
|
4145
|
+
.option('--regression', 'Compare current results against saved baseline (detect drift)')
|
|
4142
4146
|
.action(async (opts) => {
|
|
4143
4147
|
const path = require('path');
|
|
4144
4148
|
const fs = require('fs');
|
|
4145
4149
|
const yaml = require('js-yaml');
|
|
4146
|
-
const { loadScenarios, runAllScenarios, formatResults } = require('./lib/eval-runner');
|
|
4147
4150
|
|
|
4151
|
+
// Load agent prompts from seed-agent-prompts.js
|
|
4152
|
+
const seedPath = path.resolve(__dirname, '..', 'scripts', 'seed-agent-prompts.js');
|
|
4153
|
+
const { AGENT_PROMPTS } = require(seedPath);
|
|
4148
4154
|
const evalDir = opts.evalsDir || path.resolve(__dirname, '..', 'evals');
|
|
4155
|
+
|
|
4156
|
+
// Snapshot mode (HC-019h): save baseline
|
|
4157
|
+
if (opts.snapshot) {
|
|
4158
|
+
const { loadScenarios, runAllScenarios } = require('./lib/eval-runner');
|
|
4159
|
+
const { validateAllContracts } = require('./lib/eval-contracts');
|
|
4160
|
+
const { saveBaseline } = require('./lib/eval-regression');
|
|
4161
|
+
|
|
4162
|
+
const scenarios = loadScenarios({
|
|
4163
|
+
evalDir, readFile: (p) => fs.readFileSync(p, 'utf8'),
|
|
4164
|
+
listDir: (p) => fs.readdirSync(p), isDir: (p) => fs.statSync(p).isDirectory(),
|
|
4165
|
+
parseYaml: (text) => yaml.load(text),
|
|
4166
|
+
});
|
|
4167
|
+
const evalResults = runAllScenarios(scenarios, AGENT_PROMPTS);
|
|
4168
|
+
const contractResults = validateAllContracts(AGENT_PROMPTS);
|
|
4169
|
+
const { saved } = saveBaseline(evalResults, contractResults,
|
|
4170
|
+
(p, c) => fs.writeFileSync(p, c, 'utf8'), evalDir);
|
|
4171
|
+
console.log(`Baseline saved: ${saved} entries → evals/.baseline.json`);
|
|
4172
|
+
process.exit(0);
|
|
4173
|
+
}
|
|
4174
|
+
|
|
4175
|
+
// Regression mode (HC-019h): compare against baseline
|
|
4176
|
+
if (opts.regression) {
|
|
4177
|
+
const { loadScenarios, runAllScenarios } = require('./lib/eval-runner');
|
|
4178
|
+
const { validateAllContracts } = require('./lib/eval-contracts');
|
|
4179
|
+
const { loadBaseline, detectRegressions, formatRegressionResults } = require('./lib/eval-regression');
|
|
4180
|
+
|
|
4181
|
+
const baseline = loadBaseline((p) => fs.readFileSync(p, 'utf8'), evalDir);
|
|
4182
|
+
if (!baseline) {
|
|
4183
|
+
console.error('No baseline found. Run: hone eval --snapshot');
|
|
4184
|
+
process.exit(1);
|
|
4185
|
+
}
|
|
4186
|
+
|
|
4187
|
+
const scenarios = loadScenarios({
|
|
4188
|
+
evalDir, readFile: (p) => fs.readFileSync(p, 'utf8'),
|
|
4189
|
+
listDir: (p) => fs.readdirSync(p), isDir: (p) => fs.statSync(p).isDirectory(),
|
|
4190
|
+
parseYaml: (text) => yaml.load(text),
|
|
4191
|
+
});
|
|
4192
|
+
const evalResults = runAllScenarios(scenarios, AGENT_PROMPTS);
|
|
4193
|
+
const contractResults = validateAllContracts(AGENT_PROMPTS);
|
|
4194
|
+
const results = detectRegressions(evalResults, contractResults, baseline);
|
|
4195
|
+
console.log(formatRegressionResults(results, opts.format));
|
|
4196
|
+
process.exit(results.summary.regressions > 0 ? 1 : 0);
|
|
4197
|
+
}
|
|
4198
|
+
|
|
4199
|
+
// Contract validation mode (HC-019g)
|
|
4200
|
+
if (opts.contracts) {
|
|
4201
|
+
const { validateAllContracts, formatContractResults } = require('./lib/eval-contracts');
|
|
4202
|
+
const results = validateAllContracts(AGENT_PROMPTS);
|
|
4203
|
+
console.log(formatContractResults(results, opts.format));
|
|
4204
|
+
process.exit(results.failed > 0 ? 1 : 0);
|
|
4205
|
+
}
|
|
4206
|
+
|
|
4207
|
+
// Scenario evaluation mode
|
|
4208
|
+
const { loadScenarios, runAllScenarios, formatResults } = require('./lib/eval-runner');
|
|
4209
|
+
|
|
4149
4210
|
if (!fs.existsSync(evalDir)) {
|
|
4150
4211
|
console.error(`Eval directory not found: ${evalDir}`);
|
|
4151
4212
|
process.exit(1);
|
|
4152
4213
|
}
|
|
4153
4214
|
|
|
4154
|
-
// Load agent prompts from seed-agent-prompts.js
|
|
4155
|
-
const seedPath = path.resolve(__dirname, '..', 'scripts', 'seed-agent-prompts.js');
|
|
4156
|
-
const { AGENT_PROMPTS } = require(seedPath);
|
|
4157
|
-
|
|
4158
4215
|
const scenarios = loadScenarios({
|
|
4159
4216
|
evalDir,
|
|
4160
4217
|
agent: opts.agent,
|
|
@@ -90,6 +90,10 @@ function checkBindDefault(args) {
|
|
|
90
90
|
if (/\bos\.getenv\b|\bprocess\.env\b|\bgetenv\(/.test(content)) continue;
|
|
91
91
|
// Filter out lines that are clearly comments mentioning the anti-pattern
|
|
92
92
|
if (/^\s*(#|\/\/|--).*0\.0\.0\.0/.test(content)) continue;
|
|
93
|
+
// Filter out docstrings/multiline strings (Python """, JS template literals, block comments)
|
|
94
|
+
if (/^\s*("""|'''|\/\*|\*)/.test(content)) continue;
|
|
95
|
+
// Filter out lines that describe/document the pattern rather than use it
|
|
96
|
+
if (/override|example|usage|default.*is|should|must|can\s/i.test(content) && !/=\s*['"]0\.0\.0\.0/.test(content)) continue;
|
|
93
97
|
|
|
94
98
|
offenders.push({ file, line: lineNo, content: content.trim().slice(0, 100) });
|
|
95
99
|
}
|
|
@@ -0,0 +1,256 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
/**
|
|
3
|
+
* eval-contracts.js — HC-019g contract testing between pipeline agents.
|
|
4
|
+
*
|
|
5
|
+
* Validates that each agent's prompt correctly references:
|
|
6
|
+
* 1. The prior step's artifact (input dependency)
|
|
7
|
+
* 2. The metadata.yml gate check (validation contract)
|
|
8
|
+
* 3. The output artifact it produces (output contract)
|
|
9
|
+
*
|
|
10
|
+
* This is a deterministic structural check — zero LLM tokens.
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
/**
 * Contract table for the pipeline agents.
 *
 * Every entry names the artifact and gate an agent is expected to consume
 * (input) and the artifact and gate it is expected to produce (output).
 * `extraChecks` lists additional literal strings that must appear in the
 * agent's prompt.
 */
const PIPELINE_CONTRACTS = (() => {
  // Artifact written by each numbered step, indexed by step number.
  const stepArtifacts = [
    'step-0-grooming.md',
    'step-1-plan.md',
    'step-2-tests.md',
    'step-3-e2e-plan.md',
    'step-4-implementation.md',
    'step-5-review.md',
  ];

  // A numbered step that consumes the previous step's artifact and gate result.
  const gatedStep = (agent, step) => ({
    agent,
    step,
    inputArtifact: stepArtifacts[step - 1],
    inputGate: `step_${step - 1}.gate_result`,
    outputArtifact: stepArtifacts[step],
    outputGate: `step_${step}`,
    metadataField: `step_${step}.gate_result`,
  });

  // An agent that runs off the step-5 review and has no output gate of its own.
  const reviewFollowUp = (agent, step, outputArtifact) => ({
    agent,
    step,
    inputArtifact: stepArtifacts[5],
    inputGate: 'step_5.status',
    outputArtifact,
    outputGate: null,
    metadataField: null,
  });

  return [
    {
      agent: 'story-groomer',
      step: 0,
      inputArtifact: null, // first in pipeline — no prior step
      inputGate: null,
      outputArtifact: stepArtifacts[0],
      outputGate: 'step_0',
      metadataField: 'step_0.gate_result',
      extraChecks: [
        { text: 'metadata.yml', check: 'creates_metadata', detail: 'creates metadata.yml' },
        { text: 'Initialize Pipeline', check: 'pipeline_init', detail: 'initializes pipeline directory' },
        { text: '.github/pipeline/', check: 'pipeline_dir', detail: 'creates .github/pipeline/ directory' },
      ],
    },
    gatedStep('implementation-planner', 1),
    gatedStep('unit-test-case-writer', 2),
    gatedStep('e2e-qa-planner', 3),
    gatedStep('code-builder', 4),
    gatedStep('code-reviewer', 5),
    reviewFollowUp('security-agent', '5d', 'step-5d-security.md'),
    reviewFollowUp('performance-agent', '5e', 'step-5e-performance.md'),
    {
      agent: 'delivery-architect',
      step: 'independent',
      inputArtifact: null,
      inputGate: null,
      outputArtifact: 'EXECUTION_PLAN.yml',
      outputGate: null,
      metadataField: null,
      extraChecks: [
        { text: 'Checklist A', check: 'checklist_a', detail: 'has Checklist A (epic decomposition)' },
        { text: 'Checklist B', check: 'checklist_b', detail: 'has Checklist B (architecture validation)' },
        { text: 'test_strategy', check: 'test_strategy', detail: 'includes test_strategy in plan' },
      ],
    },
  ];
})();
|
|
110
|
+
|
|
111
|
+
/**
 * Validate one agent prompt against its pipeline contract.
 *
 * Each check is a literal substring search in the prompt text. Checks are
 * emitted in a fixed order: input artifact, input gate, output artifact,
 * metadata.yml update, PIPELINE VALIDATION block, then any agent-specific
 * extra checks. A check is only emitted when the contract defines the
 * corresponding field.
 *
 * @param {string} promptText — agent prompt content; a non-string value is
 *   treated as an empty prompt so every applicable check fails instead of throwing
 * @param {object} contract — an entry from PIPELINE_CONTRACTS
 * @returns {{ agent, step, checks: Array<{ check, passed, detail }>, passed: boolean, failures: Array }}
 */
function validateContract(promptText, contract) {
  const text = typeof promptText === 'string' ? promptText : '';
  const checks = [];

  // Record one substring check with its success / failure wording.
  const record = (check, needle, okDetail, missingDetail) => {
    const found = text.includes(needle);
    checks.push({ check, passed: found, detail: found ? okDetail : missingDetail });
  };

  // Check 1: input artifact reference (absent for the first agent in the pipeline)
  if (contract.inputArtifact) {
    record(
      'input_artifact',
      contract.inputArtifact,
      `references prior step "${contract.inputArtifact}"`,
      `MISSING reference to prior step "${contract.inputArtifact}"`,
    );
  }

  // Check 2: input gate validation
  if (contract.inputGate) {
    record(
      'input_gate',
      contract.inputGate,
      `validates "${contract.inputGate}" before proceeding`,
      `MISSING validation of "${contract.inputGate}"`,
    );
  }

  // Check 3: output artifact reference
  if (contract.outputArtifact) {
    record(
      'output_artifact',
      contract.outputArtifact,
      `produces "${contract.outputArtifact}"`,
      `MISSING output artifact "${contract.outputArtifact}"`,
    );
  }

  // Check 4: metadata.yml update — only the file name is searched for,
  // not the specific metadataField value.
  if (contract.metadataField) {
    record('metadata_update', 'metadata.yml', 'updates metadata.yml', 'MISSING metadata.yml update');
  }

  // Check 5: PIPELINE VALIDATION block — required whenever there is a prior step
  if (contract.inputArtifact) {
    record(
      'pipeline_validation_block',
      'PIPELINE VALIDATION',
      'has PIPELINE VALIDATION block',
      'MISSING PIPELINE VALIDATION block',
    );
  }

  // Check 6: agent-specific extra checks
  for (const extra of contract.extraChecks || []) {
    record(extra.check, extra.text, extra.detail, `MISSING: ${extra.detail}`);
  }

  return {
    agent: contract.agent,
    step: contract.step,
    checks,
    passed: checks.every((c) => c.passed),
    failures: checks.filter((c) => !c.passed),
  };
}
|
|
200
|
+
|
|
201
|
+
/**
 * Run every contract in PIPELINE_CONTRACTS against the supplied prompts.
 *
 * An agent whose prompt is absent, empty, or not a string is reported as a
 * failed result with a single `agent_exists` failure (and no checks) rather
 * than being passed to validateContract.
 *
 * @param {object} agentPrompts — { agentName: promptText }; null/undefined is
 *   treated as an empty map
 * @returns {{ total: number, passed: number, failed: number, results: Array }}
 */
function validateAllContracts(agentPrompts) {
  const prompts = agentPrompts || {};

  const results = PIPELINE_CONTRACTS.map((contract) => {
    const promptText = prompts[contract.agent];

    // Only a non-empty string can be searched; anything else counts as "no prompt".
    if (typeof promptText !== 'string' || promptText === '') {
      return {
        agent: contract.agent,
        step: contract.step,
        checks: [],
        passed: false,
        failures: [{ check: 'agent_exists', passed: false, detail: `agent "${contract.agent}" not found` }],
      };
    }

    return validateContract(promptText, contract);
  });

  const passedCount = results.filter((r) => r.passed).length;

  return {
    total: results.length,
    passed: passedCount,
    failed: results.length - passedCount,
    results,
  };
}
|
|
231
|
+
|
|
232
|
+
/**
 * Render the output of validateAllContracts for display.
 *
 * @param {{ total, passed, failed, results: Array }} results — aggregate contract results
 * @param {string} [format='pretty'] — 'json' returns the results serialized
 *   with two-space indentation; any other value returns the text report
 * @returns {string}
 */
function formatContractResults(results, format = 'pretty') {
  if (format === 'json') {
    return JSON.stringify(results, null, 2);
  }

  // One status line per agent, followed by one line per failed check.
  const agentLines = results.results.flatMap((entry) => [
    `[${entry.passed ? 'PASS' : 'FAIL'}] Step ${entry.step}: ${entry.agent} (${entry.checks.length} checks)`,
    ...entry.failures.map((failure) => ` x ${failure.check}: ${failure.detail}`),
  ]);

  return [
    '',
    'Hone AI — Agent Contract Validation',
    '====================================',
    '',
    ...agentLines,
    '',
    '----------------------------------',
    `Summary: ${results.total} agents | ${results.passed} passed | ${results.failed} failed`,
    '',
  ].join('\n');
}
|
|
255
|
+
|
|
256
|
+
module.exports = { PIPELINE_CONTRACTS, validateContract, validateAllContracts, formatContractResults };
|
|
@@ -0,0 +1,213 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
/**
|
|
3
|
+
* eval-llm-judge.js — HC-019i LLM-as-judge evaluator.
|
|
4
|
+
*
|
|
5
|
+
* Uses an LLM to assess agent prompt quality against criteria that
|
|
6
|
+
* deterministic graders can't check (semantic meaning, completeness,
|
|
7
|
+
* reasoning quality).
|
|
8
|
+
*
|
|
9
|
+
* Pure helper with injected LLM call function.
|
|
10
|
+
* Integrates with HC-019j three-valued outcomes for non-deterministic results.
|
|
11
|
+
*/
|
|
12
|
+
const { classify, wrapDeterministic } = require('./eval-three-valued');
|
|
13
|
+
|
|
14
|
+
/**
|
|
15
|
+
* Judge criteria for LLM evaluation.
|
|
16
|
+
* Each criterion is a question the LLM answers YES/NO about the agent output.
|
|
17
|
+
*
|
|
18
|
+
* Eval scenario format for LLM-judge mode:
|
|
19
|
+
* ```yaml
|
|
20
|
+
* grading:
|
|
21
|
+
* mode: llm-judge
|
|
22
|
+
* criteria:
|
|
23
|
+
* - "Does the prompt clearly define the agent's role and responsibilities?"
|
|
24
|
+
* - "Does the prompt include error handling guidance?"
|
|
25
|
+
* - "Is the output format specification unambiguous?"
|
|
26
|
+
* runs: 3 # optional, default 1 for cost savings
|
|
27
|
+
* ```
|
|
28
|
+
*/
|
|
29
|
+
|
|
30
|
+
// System prompt sent with every judge call. It fixes the
// "CRITERION_N: YES|NO — explanation" answer format that the response
// parser in this module looks for.
const JUDGE_SYSTEM_PROMPT = [
  'You are an eval judge for AI agent prompts. You will be given an agent prompt and a list of criteria. For each criterion, answer YES or NO with a brief explanation.',
  '',
  'Rules:',
  '- Answer ONLY YES or NO for each criterion, followed by a one-sentence explanation',
  '- Be strict — if the criterion is not clearly met, answer NO',
  '- Format each answer on its own line as: "CRITERION_N: YES|NO — explanation"',
  '- Do not add commentary outside the criterion answers',
].join('\n');
|
|
37
|
+
|
|
38
|
+
/**
|
|
39
|
+
* Build the judge prompt.
|
|
40
|
+
* @param {string} agentPrompt — the agent prompt text to evaluate
|
|
41
|
+
* @param {string[]} criteria — list of criteria to judge against
|
|
42
|
+
* @returns {string}
|
|
43
|
+
*/
|
|
44
|
+
function buildJudgePrompt(agentPrompt, criteria) {
|
|
45
|
+
const criteriaList = criteria
|
|
46
|
+
.map((c, i) => `CRITERION_${i + 1}: ${c}`)
|
|
47
|
+
.join('\n');
|
|
48
|
+
|
|
49
|
+
return `## Agent Prompt to Evaluate
|
|
50
|
+
|
|
51
|
+
${agentPrompt.slice(0, 8000)}
|
|
52
|
+
|
|
53
|
+
## Criteria to Judge
|
|
54
|
+
|
|
55
|
+
${criteriaList}
|
|
56
|
+
|
|
57
|
+
## Your Judgement
|
|
58
|
+
|
|
59
|
+
For each criterion, answer YES or NO with a brief explanation:`;
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
/**
 * Parse the LLM judge response into structured per-criterion results.
 *
 * For each criterion number 1..criteriaCount the response is searched first
 * for the strict form "CRITERION_N: YES|NO — explanation" (hyphen, en dash or
 * em dash as separator), then for a looser form ("CRITERION_N YES", "#N YES",
 * "N. YES"). A criterion that matches neither form is reported as not passed.
 *
 * @param {string} response — LLM output
 * @param {number} criteriaCount — expected number of criteria
 * @returns {Array<{ criterion: number, passed: boolean, explanation: string }>}
 */
function parseJudgeResponse(response, criteriaCount) {
  const results = [];

  for (let i = 1; i <= criteriaCount; i++) {
    const strictPattern = new RegExp(`CRITERION_${i}:\\s*(YES|NO)\\s*[-–—]\\s*(.+)`, 'i');
    const strictMatch = response.match(strictPattern);

    if (strictMatch) {
      results.push({
        criterion: i,
        passed: strictMatch[1].toUpperCase() === 'YES',
        explanation: strictMatch[2].trim(),
      });
      continue;
    }

    // Looser fallback. The bare "N." form must not be preceded by a digit,
    // otherwise criterion 1 would match inside "11." or "21."; the trailing
    // \b keeps "NO" from matching as a prefix of words such as "NOT".
    const loosePattern = new RegExp(
      `(?:CRITERION_${i}|#${i}|(?<!\\d)${i}\\.)\\s*:?\\s*(YES|NO)\\b`,
      'i',
    );
    const looseMatch = response.match(loosePattern);

    results.push({
      criterion: i,
      passed: looseMatch ? looseMatch[1].toUpperCase() === 'YES' : false,
      explanation: looseMatch ? 'parsed from loose format' : 'could not parse response',
    });
  }

  return results;
}
|
|
95
|
+
|
|
96
|
+
/**
 * Perform one judge call and grade its answer.
 *
 * Builds the user prompt from the agent prompt and criteria, sends it with
 * JUDGE_SYSTEM_PROMPT through the injected `callLLM`, and parses the reply.
 * The overall verdict passes only when every criterion passed.
 *
 * @param {object} opts
 * @param {string} opts.agentPrompt — prompt text to evaluate
 * @param {string[]} opts.criteria — list of criteria
 * @param {(systemPrompt: string, userPrompt: string) => Promise<string>} opts.callLLM — injected LLM call
 * @returns {Promise<{ passed: boolean, criteriaResults: Array, rawResponse: string }>}
 */
async function runJudge({ agentPrompt, criteria, callLLM }) {
  const rawResponse = await callLLM(
    JUDGE_SYSTEM_PROMPT,
    buildJudgePrompt(agentPrompt, criteria),
  );
  const criteriaResults = parseJudgeResponse(rawResponse, criteria.length);

  return {
    passed: criteriaResults.every((entry) => entry.passed),
    criteriaResults,
    rawResponse,
  };
}
|
|
116
|
+
|
|
117
|
+
/**
 * Evaluate one llm-judge scenario, once or several times.
 *
 * - No criteria: returns an 'error' result wrapped by wrapDeterministic.
 * - `grading.runs` <= 1 (or unset): one judge call, wrapped by
 *   wrapDeterministic; a thrown error becomes an 'error' result.
 * - Otherwise: `runs` sequential judge calls; a thrown error counts as a
 *   failed run. The outcomes are passed to classify() and its verdict and
 *   confidence are returned.
 *
 * @param {object} opts
 * @param {object} opts.scenario — eval scenario with grading.mode = 'llm-judge'
 * @param {string} opts.agentPrompt — prompt text
 * @param {(systemPrompt: string, userPrompt: string) => Promise<string>} opts.callLLM
 * @param {object} [opts.thresholds] — pass/fail thresholds forwarded to classify()
 * @returns {Promise<object>} — scenario result with verdict + confidence
 */
async function runJudgeScenario({ scenario, agentPrompt, callLLM, thresholds }) {
  const criteria = scenario.grading?.criteria || [];
  const runs = scenario.grading?.runs || 1;

  // Fields shared by every result shape this function returns.
  const identity = {
    id: scenario.id,
    agent: scenario.evalAgent || scenario.agent,
    name: scenario.name || scenario.id,
  };

  // Deterministic 'error' result carrying a single failure entry.
  const errorResult = (type, detail) => wrapDeterministic({
    ...identity,
    result: 'error',
    checks: 0,
    checks_passed: 0,
    failures: [{ type, passed: false, detail }],
  });

  if (criteria.length === 0) {
    return errorResult('config', 'no criteria defined for llm-judge');
  }

  // Single run — return deterministic result
  if (runs <= 1) {
    try {
      const { passed, criteriaResults } = await runJudge({ agentPrompt, criteria, callLLM });
      const unmet = criteriaResults.filter((entry) => !entry.passed);
      return wrapDeterministic({
        ...identity,
        result: passed ? 'pass' : 'fail',
        checks: criteria.length,
        checks_passed: criteriaResults.filter((entry) => entry.passed).length,
        failures: unmet.map((entry) => ({
          type: `criterion_${entry.criterion}`,
          passed: false,
          detail: entry.explanation,
        })),
        judgeDetails: criteriaResults,
      });
    } catch (e) {
      return errorResult('llm_error', e.message);
    }
  }

  // Multiple runs — use three-valued classification
  const runDetails = [];
  for (let attempt = 0; attempt < runs; attempt++) {
    try {
      runDetails.push(await runJudge({ agentPrompt, criteria, callLLM }));
    } catch (e) {
      runDetails.push({ passed: false, error: e.message });
    }
  }

  const classification = classify(runDetails.map((detail) => detail.passed), thresholds);
  const { verdict } = classification;

  return {
    ...identity,
    result: verdict,
    verdict,
    confidence: classification.confidence,
    deterministic: false,
    checks: criteria.length,
    checks_passed: verdict === 'pass' ? criteria.length : 0,
    runs_passed: classification.passed,
    failures: verdict === 'fail'
      ? [{ type: 'llm_judge', passed: false, detail: classification.details }]
      : [],
    runs: classification.runs,
    runDetails,
  };
}
|
|
206
|
+
|
|
207
|
+
module.exports = {
|
|
208
|
+
JUDGE_SYSTEM_PROMPT,
|
|
209
|
+
buildJudgePrompt,
|
|
210
|
+
parseJudgeResponse,
|
|
211
|
+
runJudge,
|
|
212
|
+
runJudgeScenario,
|
|
213
|
+
};
|
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
/**
|
|
3
|
+
* eval-regression.js — HC-019h prompt regression testing.
|
|
4
|
+
*
|
|
5
|
+
* Saves eval results as a baseline snapshot, then detects drift
|
|
6
|
+
* when a prompt change causes a previously-passing eval to fail.
|
|
7
|
+
*
|
|
8
|
+
* Baseline file: evals/.baseline.json
|
|
9
|
+
* Pure helper with injected I/O.
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
const BASELINE_FILENAME = '.baseline.json';
|
|
13
|
+
|
|
14
|
+
/**
 * Write the current eval and contract results to the baseline file.
 *
 * The baseline keeps, per scenario id, the agent, result and check counts,
 * and per contract agent, the step, pass flag and number of checks. It is
 * serialized as two-space-indented JSON to `<evalDir>/<BASELINE_FILENAME>`
 * through the injected `writeFile`.
 *
 * @param {object} results — from runAllScenarios (reads `results.scenarios`)
 * @param {object} contractResults — from validateAllContracts (reads `contractResults.results`)
 * @param {(path, content) => void} writeFile
 * @param {string} evalDir
 * @returns {{ saved: number }} — number of scenario plus contract entries stored
 */
function saveBaseline(results, contractResults, writeFile, evalDir) {
  const evalScenarios = {};
  results.scenarios.forEach((scenario) => {
    evalScenarios[scenario.id] = {
      agent: scenario.agent,
      result: scenario.result,
      checks: scenario.checks,
      checks_passed: scenario.checks_passed,
    };
  });

  const contracts = {};
  contractResults.results.forEach((entry) => {
    contracts[entry.agent] = {
      step: entry.step,
      passed: entry.passed,
      checks: entry.checks.length,
    };
  });

  const baseline = {
    created_at: new Date().toISOString(),
    eval_scenarios: evalScenarios,
    contracts,
  };

  writeFile(`${evalDir}/${BASELINE_FILENAME}`, JSON.stringify(baseline, null, 2));

  return { saved: Object.keys(evalScenarios).length + Object.keys(contracts).length };
}
|
|
51
|
+
|
|
52
|
+
/**
 * Load the baseline from `<evalDir>/<BASELINE_FILENAME>`.
 *
 * Deliberately best-effort: any read or JSON parse failure yields null so
 * the caller can treat it as "no baseline". Content that parses but is not a
 * plain object (a number, string, array or null) also yields null, because
 * callers read properties off the returned baseline.
 *
 * @param {(path) => string} readFile
 * @param {string} evalDir
 * @returns {object|null}
 */
function loadBaseline(readFile, evalDir) {
  let parsed;
  try {
    parsed = JSON.parse(readFile(`${evalDir}/${BASELINE_FILENAME}`));
  } catch {
    return null;
  }

  const isPlainObject = parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed);
  return isPlainObject ? parsed : null;
}
|
|
66
|
+
|
|
67
|
+
/**
 * Compare current results against a baseline and classify the differences.
 *
 * - regression: passed in the baseline, does not pass now
 * - improvement: did not pass in the baseline, passes now
 * - new: present now, absent from the baseline
 * - removed: present in the baseline, absent now
 *
 * Baseline sections that are missing are treated as empty, results without
 * a `failures` array contribute no failure details, and baseline lookups
 * only consider the baseline's own keys.
 *
 * @param {object} currentResults — from runAllScenarios
 * @param {object} currentContracts — from validateAllContracts
 * @param {object} baseline — from loadBaseline
 * @returns {{ regressions: Array, improvements: Array, new_scenarios: Array, removed: Array, summary: object }}
 */
function detectRegressions(currentResults, currentContracts, baseline) {
  const source = baseline || {};
  const baselineScenarios = source.eval_scenarios || {};
  const baselineContracts = source.contracts || {};

  // Own-property lookup so ids such as "constructor" are not resolved
  // through Object.prototype.
  const lookup = (table, key) => (
    Object.prototype.hasOwnProperty.call(table, key) ? table[key] : undefined
  );
  const failureDetails = (failures) => (failures || []).map((f) => f.detail);

  const regressions = [];
  const improvements = [];
  const newScenarios = [];

  // Check eval scenarios
  for (const s of currentResults.scenarios) {
    const prev = lookup(baselineScenarios, s.id);
    if (!prev) {
      newScenarios.push({ type: 'scenario', id: s.id, agent: s.agent, result: s.result });
      continue;
    }
    if (prev.result === 'pass' && s.result !== 'pass') {
      regressions.push({
        type: 'scenario',
        id: s.id,
        agent: s.agent,
        was: 'pass',
        now: s.result,
        failures: failureDetails(s.failures),
      });
    }
    if (prev.result !== 'pass' && s.result === 'pass') {
      improvements.push({ type: 'scenario', id: s.id, agent: s.agent, was: prev.result, now: 'pass' });
    }
  }

  // Check contracts
  for (const r of currentContracts.results) {
    const prev = lookup(baselineContracts, r.agent);
    if (!prev) {
      newScenarios.push({ type: 'contract', agent: r.agent, passed: r.passed });
      continue;
    }
    if (prev.passed && !r.passed) {
      regressions.push({
        type: 'contract',
        agent: r.agent,
        was: 'pass',
        now: 'fail',
        failures: failureDetails(r.failures),
      });
    }
    if (!prev.passed && r.passed) {
      improvements.push({ type: 'contract', agent: r.agent, was: 'fail', now: 'pass' });
    }
  }

  // Check for removed entries (in baseline but not in current)
  const removedScenarios = [];
  const currentIds = new Set(currentResults.scenarios.map((s) => s.id));
  for (const [id, prev] of Object.entries(baselineScenarios)) {
    if (!currentIds.has(id)) {
      removedScenarios.push({ type: 'scenario', id, agent: prev.agent, was: prev.result });
    }
  }
  const currentAgents = new Set(currentContracts.results.map((r) => r.agent));
  for (const [agent, prev] of Object.entries(baselineContracts)) {
    if (!currentAgents.has(agent)) {
      removedScenarios.push({ type: 'contract', agent, was: prev.passed ? 'pass' : 'fail' });
    }
  }

  return {
    regressions,
    improvements,
    new_scenarios: newScenarios,
    removed: removedScenarios,
    summary: {
      regressions: regressions.length,
      improvements: improvements.length,
      new: newScenarios.length,
      removed: removedScenarios.length,
      baseline_date: source.created_at,
    },
  };
}
|
|
151
|
+
|
|
152
|
+
/**
 * Render a regression comparison either as JSON or as a plain-text report.
 *
 * @param {object} results — comparison object carrying `regressions`,
 *   `improvements`, `new_scenarios`, an optional `removed` list, and a
 *   `summary` with the matching counts plus `baseline_date`
 * @param {string} [format='pretty'] — 'json' returns
 *   JSON.stringify(results, null, 2); any other value returns the text report
 * @returns {string} the formatted report
 */
function formatRegressionResults(results, format = 'pretty') {
  if (format === 'json') return JSON.stringify(results, null, 2);

  // Scenario entries are labelled by their id, every other entry by its agent.
  const label = (entry) => (entry.type === 'scenario' ? entry.id : entry.agent);

  const lines = ['', 'Hone AI — Prompt Regression Check', '==================================', ''];
  lines.push(`Baseline: ${results.summary.baseline_date}`);
  lines.push('');

  if (results.regressions.length > 0) {
    lines.push('REGRESSIONS (previously passing, now failing):');
    for (const r of results.regressions) {
      lines.push(` [!!] ${label(r)}: ${r.was} → ${r.now}`);
      // A regression entry without a failures list must not abort the report.
      const failures = Array.isArray(r.failures) ? r.failures : [];
      for (const f of failures) {
        lines.push(` ${f}`);
      }
    }
    lines.push('');
  }

  if (results.improvements.length > 0) {
    lines.push('Improvements (previously failing, now passing):');
    for (const i of results.improvements) {
      lines.push(` [ok] ${label(i)}: ${i.was} → ${i.now}`);
    }
    lines.push('');
  }

  if (results.new_scenarios.length > 0) {
    lines.push(`New (${results.new_scenarios.length} scenarios/contracts not in baseline — run --snapshot to update)`);
    lines.push('');
  }

  // `removed` is optional, so result objects that lack it still format.
  if (results.removed && results.removed.length > 0) {
    lines.push('Removed (in baseline but no longer exist — coverage gap):');
    for (const r of results.removed) {
      lines.push(` [??] ${label(r)}: was ${r.was}, now deleted`);
    }
    lines.push('');
  }

  lines.push('----------------------------------');
  lines.push(`Summary: ${results.summary.regressions} regressions | ${results.summary.improvements} improvements | ${results.summary.new} new | ${results.summary.removed || 0} removed`);
  // Only regressions make the reported exit code non-zero.
  lines.push(`Exit code: ${results.summary.regressions > 0 ? 1 : 0}`);
  lines.push('');

  return lines.join('\n');
}
|
|
201
|
+
|
|
202
|
+
module.exports = { saveBaseline, loadBaseline, detectRegressions, formatRegressionResults, BASELINE_FILENAME };
|
package/lib/eval-runner.js
CHANGED
|
@@ -8,6 +8,7 @@
|
|
|
8
8
|
* Pure helper with injected I/O (readFile, listDir).
|
|
9
9
|
*/
|
|
10
10
|
const { runCheck } = require('./eval-graders');
|
|
11
|
+
const { wrapDeterministic } = require('./eval-three-valued');
|
|
11
12
|
|
|
12
13
|
/**
|
|
13
14
|
* Load eval scenarios from the evals directory.
|
|
@@ -115,7 +116,7 @@ function runAllScenarios(scenarios, agentPrompts, opts = {}) {
|
|
|
115
116
|
const promptText = agentPrompts[agentName];
|
|
116
117
|
|
|
117
118
|
if (!promptText && !scenario.loadError) {
|
|
118
|
-
results.push({
|
|
119
|
+
results.push(wrapDeterministic({
|
|
119
120
|
id: scenario.id,
|
|
120
121
|
agent: agentName,
|
|
121
122
|
name: scenario.name || scenario.id,
|
|
@@ -123,12 +124,12 @@ function runAllScenarios(scenarios, agentPrompts, opts = {}) {
|
|
|
123
124
|
checks: 0,
|
|
124
125
|
checks_passed: 0,
|
|
125
126
|
failures: [{ type: 'missing_prompt', passed: false, detail: `agent "${agentName}" not found in AGENT_PROMPTS` }],
|
|
126
|
-
});
|
|
127
|
+
}));
|
|
127
128
|
continue;
|
|
128
129
|
}
|
|
129
130
|
|
|
130
131
|
const result = runScenario(scenario, promptText || '');
|
|
131
|
-
results.push(result);
|
|
132
|
+
results.push(wrapDeterministic(result));
|
|
132
133
|
|
|
133
134
|
if (opts.failFast && result.result !== 'pass') break;
|
|
134
135
|
}
|
|
@@ -163,8 +164,10 @@ function formatResults(results, format = 'pretty') {
|
|
|
163
164
|
for (const [agent, scenarios] of Object.entries(byAgent)) {
|
|
164
165
|
lines.push(`${agent} (${scenarios.length} scenarios)`);
|
|
165
166
|
for (const s of scenarios) {
|
|
166
|
-
const
|
|
167
|
-
|
|
167
|
+
const verdict = s.verdict || s.result;
|
|
168
|
+
const icon = verdict === 'pass' ? 'PASS' : verdict === 'fail' ? 'FAIL' : verdict === 'inconclusive' ? '????' : 'ERR ';
|
|
169
|
+
const conf = s.confidence != null && !s.deterministic ? ` ${s.confidence}%` : '';
|
|
170
|
+
lines.push(` [${icon}] ${s.id}: ${s.name} (${s.checks_passed}/${s.checks} checks${conf})`);
|
|
168
171
|
for (const f of s.failures) {
|
|
169
172
|
lines.push(` x ${f.type}: ${f.detail}`);
|
|
170
173
|
}
|
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
/**
|
|
3
|
+
* eval-three-valued.js — HC-019j three-valued test outcomes.
|
|
4
|
+
*
|
|
5
|
+
* Replaces binary pass/fail with Pass/Fail/Inconclusive for
|
|
6
|
+
* non-deterministic evaluations (LLM-as-judge, HC-019i).
|
|
7
|
+
*
|
|
8
|
+
* For deterministic checks (current graders), results are always
|
|
9
|
+
* definitive — Pass or Fail, never Inconclusive.
|
|
10
|
+
*
|
|
11
|
+
* For non-deterministic checks (future LLM-judge), the same eval
|
|
12
|
+
* is run N times and outcomes are classified statistically:
|
|
13
|
+
* - Pass: >= passThreshold of runs passed (default 80%)
|
|
14
|
+
* - Fail: >= failThreshold of runs failed (default 80%)
|
|
15
|
+
* - Inconclusive: neither threshold met (needs more runs or investigation)
|
|
16
|
+
*
|
|
17
|
+
* Based on AgentAssay (ICLR 2026) three-valued probabilistic outcomes.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
/**
 * Reduce the boolean outcomes of repeated runs to a single three-valued verdict.
 *
 * @param {boolean[]} outcomes — one entry per run; only strict `true` counts as a pass
 * @param {object} [opts]
 * @param {number} [opts.passThreshold=0.8] — pass fraction required for 'pass'
 * @param {number} [opts.failThreshold=0.8] — fail fraction required for 'fail'
 * @param {number} [opts.minRuns=1] — fewer runs than this is always 'inconclusive'
 * @returns {{ verdict: 'pass'|'fail'|'inconclusive', confidence: number, runs, passed, failed, details }}
 */
function classify(outcomes, opts = {}) {
  const { passThreshold = 0.8, failThreshold = 0.8, minRuns = 1 } = opts;

  if (!outcomes || outcomes.length === 0) {
    return { verdict: 'inconclusive', confidence: 0, runs: 0, passed: 0, failed: 0, details: 'no runs' };
  }

  const runs = outcomes.length;
  const passed = outcomes.reduce((count, outcome) => (outcome === true ? count + 1 : count), 0);
  const failed = runs - passed;
  const passRate = passed / runs;
  const failRate = failed / runs;

  // Whole-number percentage used for both confidence and the detail text.
  const pct = (fraction) => Math.round(fraction * 100);
  const outcome = (verdict, confidence, details) => ({ verdict, confidence, runs, passed, failed, details });

  if (runs < minRuns) {
    return outcome('inconclusive', pct(passRate), `insufficient runs (${runs}/${minRuns})`);
  }

  // Pass is tested before fail, so when the thresholds are low enough for
  // both to be satisfied at once (passThreshold + failThreshold <= 1) the
  // optimistic verdict wins.
  if (passRate >= passThreshold) {
    return outcome('pass', pct(passRate), `${passed}/${runs} passed (${pct(passRate)}% >= ${pct(passThreshold)}% threshold)`);
  }

  if (failRate >= failThreshold) {
    return outcome('fail', pct(failRate), `${failed}/${runs} failed (${pct(failRate)}% >= ${pct(failThreshold)}% threshold)`);
  }

  return outcome(
    'inconclusive',
    pct(Math.max(passRate, failRate)),
    `neither threshold met: ${pct(passRate)}% pass, ${pct(failRate)}% fail`,
  );
}
|
|
80
|
+
|
|
81
|
+
/**
 * Wilson score confidence interval for an observed pass rate.
 * Bounds and center are rounded to three decimals and the bounds are
 * clamped to [0, 1].
 *
 * @param {number} passed — number of passing runs
 * @param {number} total — total number of runs
 * @param {number} [z=1.96] — z-score for the confidence level (1.96 = 95%)
 * @returns {{ lower: number, upper: number, center: number }}
 */
function wilsonInterval(passed, total, z = 1.96) {
  // With no observations, report the whole [0, 1] range.
  if (total === 0) {
    return { lower: 0, upper: 1, center: 0.5 };
  }

  const toThousandths = (value) => Math.round(value * 1000) / 1000;

  const zSquared = z * z;
  const rate = passed / total;
  const denominator = 1 + zSquared / total;
  const center = (rate + zSquared / (2 * total)) / denominator;
  const spreadTerm = (rate * (1 - rate) + zSquared / (4 * total)) / total;
  const margin = (z * Math.sqrt(spreadTerm)) / denominator;

  const lower = Math.max(0, toThousandths(center - margin));
  const upper = Math.min(1, toThousandths(center + margin));

  return { lower, upper, center: toThousandths(center) };
}
|
|
104
|
+
|
|
105
|
+
/**
 * Decide whether running an evaluation more times is likely to settle an
 * inconclusive verdict.
 *
 * @param {object} result — from classify(); `verdict`, `runs` and `passed` are read
 * @param {object} [opts]
 * @param {number} [opts.maxRuns=10] — cap on the TOTAL number of runs
 *   (runs already made plus the recommended additional ones)
 * @returns {{ recommend: boolean, additionalRuns: number, reason: string }}
 */
function recommendMoreRuns(result, opts = {}) {
  const { maxRuns = 10 } = opts;

  if (result.verdict !== 'inconclusive') {
    return { recommend: false, additionalRuns: 0, reason: 'verdict is definitive' };
  }

  if (result.runs === 0) {
    // Start with a batch of 3, but stay within the caller's total-run cap
    // just as the wide-interval branch below does.
    const initialRuns = maxRuns < 3 ? Math.max(0, Math.floor(maxRuns)) : 3;
    if (initialRuns === 0) {
      return { recommend: false, additionalRuns: 0, reason: 'no runs yet, but maxRuns leaves no run budget' };
    }
    return { recommend: true, additionalRuns: initialRuns, reason: 'no runs yet' };
  }

  const interval = wilsonInterval(result.passed, result.runs);
  const spread = interval.upper - interval.lower;

  // If spread is wide, more runs would help narrow it
  if (spread > 0.3 && result.runs < maxRuns) {
    // Grow by roughly half the current sample plus two, within the cap.
    const additional = Math.min(maxRuns - result.runs, Math.ceil(result.runs * 0.5) + 2);
    return { recommend: true, additionalRuns: additional, reason: `wide confidence interval (${interval.lower}-${interval.upper})` };
  }

  // Reached when the interval is already narrow, or when the run cap is
  // used up; in both cases no further runs are recommended.
  return { recommend: false, additionalRuns: 0, reason: `borderline result (${interval.lower}-${interval.upper}), more runs unlikely to resolve` };
}
|
|
136
|
+
|
|
137
|
+
/**
 * Add three-valued fields to the result of a deterministic evaluation.
 * The verdict is never 'inconclusive': 'pass' and 'fail' carry over
 * unchanged and any other `result` value becomes 'error'.
 *
 * @param {object} scenarioResult — from runScenario()
 * @returns {object} — a copy of the input with `verdict`, `confidence`
 *   and `deterministic: true` added
 */
function wrapDeterministic(scenarioResult) {
  const { result } = scenarioResult;
  // 'error' means the eval itself broke (not flaky) — distinct from inconclusive.
  const verdict = result === 'pass' || result === 'fail' ? result : 'error';
  const confidence = verdict === 'error' ? 0 : 100;

  return { ...scenarioResult, verdict, confidence, deterministic: true };
}
|
|
157
|
+
|
|
158
|
+
module.exports = { classify, wilsonInterval, recommendMoreRuns, wrapDeterministic };
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@hone-ai/cli",
|
|
3
|
-
"version": "1.
|
|
3
|
+
"version": "1.7.0",
|
|
4
4
|
"description": "Hone AI — Enterprise SDLC Pipeline CLI",
|
|
5
5
|
"main": "hone-cli.js",
|
|
6
6
|
"bin": {
|
|
@@ -14,7 +14,8 @@
|
|
|
14
14
|
],
|
|
15
15
|
"scripts": {
|
|
16
16
|
"test": "echo \"No tests yet\" && exit 0",
|
|
17
|
-
"link": "npm link"
|
|
17
|
+
"link": "npm link",
|
|
18
|
+
"postinstall": "echo '\\n Hone AI CLI installed successfully.\\n Next: run `hone init --token <YOUR_TOKEN>` to configure.\\n Docs: https://github.com/subbareddyvani/hone-server\\n'"
|
|
18
19
|
},
|
|
19
20
|
"dependencies": {
|
|
20
21
|
"ajv": "^8.20.0",
|