@hone-ai/cli 1.6.0 → 1.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/hone-cli.js +4 -3
- package/lib/auto-detect.js +47 -12
- package/lib/eval-ab-testing.js +113 -0
- package/lib/eval-llm-judge.js +213 -0
- package/lib/eval-runner.js +8 -5
- package/lib/eval-three-valued.js +158 -0
- package/package.json +3 -2
- package/schema/metadata.schema.json +134 -0
package/hone-cli.js
CHANGED
|
@@ -1274,7 +1274,8 @@ program
|
|
|
1274
1274
|
|
|
1275
1275
|
try {
|
|
1276
1276
|
const health = await axios.get(`${config.apiUrl}/health`);
|
|
1277
|
-
|
|
1277
|
+
const ver = health.data.version ? ` (v${health.data.version})` : '';
|
|
1278
|
+
console.log(`✓ API health: ${health.data.status}${ver}`);
|
|
1278
1279
|
} catch (e) {
|
|
1279
1280
|
console.error(`✗ API health: ${e.message}`);
|
|
1280
1281
|
}
|
|
@@ -3927,12 +3928,12 @@ program
|
|
|
3927
3928
|
.description('Validate .github/pipeline/<STORY-ID>/metadata.yml against the framework JSON schema. Implements SC-010 §10 (metadata.yml as wire protocol).')
|
|
3928
3929
|
.option('--all', 'validate every metadata.yml in .github/pipeline/')
|
|
3929
3930
|
.option('--repo-root <path>', 'repo root (default: process.cwd())')
|
|
3930
|
-
.option('--schema <path>', 'override schema path (default:
|
|
3931
|
+
.option('--schema <path>', 'override schema path (default: bundled metadata.schema.json)')
|
|
3931
3932
|
.option('--json', 'emit findings as JSON')
|
|
3932
3933
|
.action((storyId, opts) => {
|
|
3933
3934
|
const { validateMetadata, validateAllMetadata } = require('./lib/validate-metadata');
|
|
3934
3935
|
const repoRoot = opts.repoRoot || process.cwd();
|
|
3935
|
-
const schemaPath = opts.schema || require('node:path').join(
|
|
3936
|
+
const schemaPath = opts.schema || require('node:path').join(__dirname, 'schema', 'metadata.schema.json');
|
|
3936
3937
|
|
|
3937
3938
|
if (opts.all) {
|
|
3938
3939
|
const result = validateAllMetadata({ repoRoot, schemaPath });
|
package/lib/auto-detect.js
CHANGED
|
@@ -174,23 +174,58 @@ function detectE2EConvention(signals) {
|
|
|
174
174
|
* we recommend. Returns either status:'ok' or status:'drift' with an
|
|
175
175
|
* actionable suggested fix.
|
|
176
176
|
*/
|
|
177
|
+
/**
 * Representative characters for regex escape classes, used when turning a
 * pattern branch into a concrete sample string.
 */
const PATTERN_ESCAPE_SAMPLES = new Map([
  ['d', '1'], ['w', 'a'], ['s', ' '],
  ['D', 'a'], ['W', '-'], ['S', 'a'],
  ['b', ''], ['B', ''],
]);

/**
 * Expand the alternations of a regex source into one source per branch.
 * Innermost `(a|b)` / `(?:a|b)` groups are expanded first, then any remaining
 * top-level `|` is split.
 *
 * @param {string} source — regex source text
 * @param {number} [limit=64] — give up (return null) beyond this many branches
 * @returns {string[]|null}
 */
function expandPatternAlternatives(source, limit = 64) {
  const innermostAltGroup = /\((?:\?:)?([^()]*\|[^()]*)\)/;
  const pending = [source];
  const expanded = [];

  while (pending.length > 0) {
    if (pending.length + expanded.length > limit) return null;
    const current = pending.pop();
    const group = innermostAltGroup.exec(current);
    if (group === null) {
      expanded.push(...current.split('|'));
      continue;
    }
    const head = current.slice(0, group.index);
    const tail = current.slice(group.index + group[0].length);
    for (const branch of group[1].split('|')) {
      // Re-wrap as a pipe-free group so a trailing quantifier still applies to it.
      pending.push(`${head}(?:${branch})${tail}`);
    }
  }
  return expanded;
}

/**
 * Build one plausible matching string for an alternation-free regex branch.
 * This is a heuristic: the caller must verify the sample against the real
 * regex. Returns null when the branch uses a construct we cannot sample
 * (unterminated or negated character class).
 *
 * @param {string} branch
 * @returns {string|null}
 */
function samplePatternBranch(branch) {
  let sample = '';
  let atom = ''; // most recently emitted atom, repeated for `{n}` quantifiers

  for (let i = 0; i < branch.length; i++) {
    const ch = branch[i];
    if (ch === '\\') {
      i += 1;
      const escaped = branch[i] ?? '';
      atom = PATTERN_ESCAPE_SAMPLES.get(escaped) ?? escaped;
      sample += atom;
    } else if (ch === '[') {
      const close = branch.indexOf(']', i + 1);
      if (close === -1) return null;
      const body = branch.slice(i + 1, close);
      if (body === '' || body.startsWith('^')) return null;
      atom = body[0] === '\\'
        ? (PATTERN_ESCAPE_SAMPLES.get(body[1]) ?? body[1] ?? '')
        : body[0];
      sample += atom;
      i = close;
    } else if (ch === '{') {
      const close = branch.indexOf('}', i);
      const min = close === -1 ? Number.NaN : Number.parseInt(branch.slice(i + 1, close), 10);
      if (Number.isNaN(min)) {
        atom = ch; // not a quantifier — literal brace
        sample += ch;
      } else {
        // The atom was already emitted once; top it up to the minimum count.
        sample += atom.repeat(Math.max(min - 1, 0));
        i = close;
      }
    } else if ('^$()+*?'.includes(ch)) {
      // Anchors, group delimiters and quantifiers add no characters.
      if (ch === '(' && branch.startsWith('?:', i + 1)) i += 2;
    } else if (ch === '.') {
      atom = 'a';
      sample += atom;
    } else {
      atom = ch;
      sample += ch;
    }
  }
  return sample;
}

/**
 * Heuristically check whether `configured` is at least as broad as
 * `recommended`. A broader configured pattern is treated as intentional
 * (not drift); callers flag drift only when this returns false.
 *
 * One sample string is synthesised per alternative of `recommended`. The
 * result is true only when every sample is verified to match `recommended`
 * itself AND also matches `configured`. Anything that cannot be verified
 * (invalid regex, unsupported construct, too many branches) yields false,
 * so the caller falls back to reporting drift.
 *
 * @param {string} configured — regex source from the repo's configuration
 * @param {string} recommended — regex source we would recommend
 * @returns {boolean}
 */
function isPatternBroader(configured, recommended) {
  if (typeof configured !== 'string' || typeof recommended !== 'string') return false;
  try {
    const recRe = new RegExp(recommended);
    const cfgRe = new RegExp(configured);
    const branches = expandPatternAlternatives(recommended);
    if (branches === null || branches.length === 0) return false;

    return branches.every((branch) => {
      const sample = samplePatternBranch(branch);
      // An empty or unverifiable sample is no evidence of breadth.
      return Boolean(sample) && recRe.test(sample) && cfgRe.test(sample);
    });
  } catch {
    // Invalid regex source in either pattern: cannot prove breadth.
    return false;
  }
}
|
|
206
|
+
|
|
177
207
|
function checkPatternDrift({ configured, recommended }) {
|
|
178
208
|
const findings = [];
|
|
179
209
|
if (configured?.story_id_pattern && configured.story_id_pattern !== recommended.story.pattern) {
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
210
|
+
// Only flag if configured is NARROWER — broader is intentional
|
|
211
|
+
if (!isPatternBroader(configured.story_id_pattern, recommended.story.pattern)) {
|
|
212
|
+
findings.push({
|
|
213
|
+
key: 'story_id_pattern',
|
|
214
|
+
configured: configured.story_id_pattern,
|
|
215
|
+
recommended: recommended.story.pattern,
|
|
216
|
+
reason: `repo looks like ${recommended.story.shape} (${recommended.story.confidence} confidence)`,
|
|
217
|
+
});
|
|
218
|
+
}
|
|
186
219
|
}
|
|
187
220
|
if (configured?.e2e_spec_pattern && configured.e2e_spec_pattern !== recommended.e2e.pattern) {
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
221
|
+
if (!isPatternBroader(configured.e2e_spec_pattern, recommended.e2e.pattern)) {
|
|
222
|
+
findings.push({
|
|
223
|
+
key: 'e2e_spec_pattern',
|
|
224
|
+
configured: configured.e2e_spec_pattern,
|
|
225
|
+
recommended: recommended.e2e.pattern,
|
|
226
|
+
reason: `detected ${recommended.e2e.framework} under ${recommended.e2e.dir}/e2e/ (${recommended.e2e.confidence} confidence)`,
|
|
227
|
+
});
|
|
228
|
+
}
|
|
194
229
|
}
|
|
195
230
|
|
|
196
231
|
if (findings.length === 0) {
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
/**
|
|
3
|
+
* eval-ab-testing.js — HC-019k Agent A/B testing.
|
|
4
|
+
*
|
|
5
|
+
* Compare two prompt variants using the same eval scenarios.
|
|
6
|
+
* Reports which variant performs better across all checks.
|
|
7
|
+
*
|
|
8
|
+
* Usage: provide two versions of an agent prompt (A and B),
|
|
9
|
+
* run the same deterministic evals against both, compare results.
|
|
10
|
+
*
|
|
11
|
+
* Pure helper — no I/O, no LLM calls.
|
|
12
|
+
*/
|
|
13
|
+
const { runScenario } = require('./eval-runner');
|
|
14
|
+
const { wrapDeterministic } = require('./eval-three-valued');
|
|
15
|
+
|
|
16
|
+
/**
 * A/B-compare two prompt texts by running every scenario against both.
 *
 * Each scenario is executed with `runScenario` for prompt A and then prompt B,
 * and both results are passed through `wrapDeterministic`. A variant wins a
 * scenario when it is the only one with verdict 'pass'; otherwise the variant
 * with more passed checks wins; otherwise the scenario is a tie.
 *
 * @param {Array<object>} scenarios — eval scenarios (filtered for one agent)
 * @param {string} promptA — current prompt text (control)
 * @param {string} promptB — new prompt text (variant)
 * @param {object} [opts]
 * @param {string} [opts.labelA='A (current)']
 * @param {string} [opts.labelB='B (variant)']
 * @returns {{ agent, labelA, labelB, scenarios: Array, summary }}
 */
function comparePrompts(scenarios, promptA, promptB, opts = {}) {
  const { labelA = 'A (current)', labelB = 'B (variant)' } = opts;

  const decideWinner = (left, right) => {
    const leftPassed = left.verdict === 'pass';
    const rightPassed = right.verdict === 'pass';
    if (leftPassed && !rightPassed) return 'A';
    if (rightPassed && !leftPassed) return 'B';
    if (left.checks_passed > right.checks_passed) return 'A';
    if (right.checks_passed > left.checks_passed) return 'B';
    return 'tie';
  };

  const snapshot = (outcome) => ({
    verdict: outcome.verdict,
    checks_passed: outcome.checks_passed,
    checks: outcome.checks,
  });

  const rows = [];
  const tally = { A: 0, B: 0, tie: 0 };
  let passedA = 0;
  let passedB = 0;
  let possible = 0; // denominator for both scores: variant A's check count

  for (const scenario of scenarios) {
    const outcomeA = wrapDeterministic(runScenario(scenario, promptA));
    const outcomeB = wrapDeterministic(runScenario(scenario, promptB));
    const winner = decideWinner(outcomeA, outcomeB);

    rows.push({
      id: scenario.id,
      name: scenario.name || scenario.id,
      a: snapshot(outcomeA),
      b: snapshot(outcomeB),
      winner,
    });

    tally[winner] += 1;
    passedA += outcomeA.checks_passed;
    passedB += outcomeB.checks_passed;
    possible += outcomeA.checks;
  }

  const percent = (earned) => (possible > 0 ? Math.round((earned / possible) * 100) : 0);

  let recommendation = 'no_difference';
  if (tally.A > tally.B) recommendation = 'keep_a';
  else if (tally.B > tally.A) recommendation = 'use_b';

  return {
    agent: scenarios[0]?.evalAgent || scenarios[0]?.agent || 'unknown',
    labelA,
    labelB,
    scenarios: rows,
    summary: {
      total: rows.length,
      a_wins: tally.A,
      b_wins: tally.B,
      ties: tally.tie,
      a_score: percent(passedA),
      b_score: percent(passedB),
      recommendation,
    },
  };
}
|
|
75
|
+
|
|
76
|
+
/**
 * Render an A/B comparison (as returned by comparePrompts) for display.
 *
 * The header, separator and data rows are all built from the same column
 * widths so the table lines up. Scenario names are coerced to strings, so a
 * numeric `id` does not crash the formatter.
 *
 * @param {object} result — comparison result ({ agent, labelA, labelB, scenarios, summary })
 * @param {string} [format='pretty'] — 'json' returns pretty-printed JSON; anything else returns the text table
 * @returns {string}
 */
function formatComparison(result, format = 'pretty') {
  if (format === 'json') return JSON.stringify(result, null, 2);

  const NAME_WIDTH = 32;
  const COUNT_WIDTH = 6;
  const row = (name, a, b, winner) =>
    ` ${name.padEnd(NAME_WIDTH).slice(0, NAME_WIDTH)} ${a.padStart(COUNT_WIDTH)} ${b.padStart(COUNT_WIDTH)} ${winner}`;

  const lines = ['', 'Hone AI — A/B Prompt Comparison', '================================', ''];
  lines.push(`Agent: ${result.agent}`);
  lines.push(`Variant A: ${result.labelA}`);
  lines.push(`Variant B: ${result.labelB}`);
  lines.push('');

  lines.push(row('Scenario', 'A', 'B', 'Winner'));
  lines.push(row('--------', '-', '-', '------'));
  for (const s of result.scenarios) {
    const name = String(s.name || s.id);
    const a = `${s.a.checks_passed}/${s.a.checks}`;
    const b = `${s.b.checks_passed}/${s.b.checks}`;
    const winner = s.winner === 'tie' ? ' tie' : s.winner === 'A' ? ' <-A' : ' B->';
    lines.push(row(name, a, b, winner));
  }

  lines.push('');
  lines.push('----------------------------------');
  lines.push(`Score: A=${result.summary.a_score}% | B=${result.summary.b_score}%`);
  lines.push(`Wins: A=${result.summary.a_wins} | B=${result.summary.b_wins} | Ties=${result.summary.ties}`);

  const rec = result.summary.recommendation;
  const msg = rec === 'keep_a' ? 'Keep current prompt (A wins)' :
    rec === 'use_b' ? 'Switch to variant B (B wins)' :
      'No significant difference';
  lines.push(`Recommendation: ${msg}`);
  lines.push('');

  return lines.join('\n');
}
|
|
112
|
+
|
|
113
|
+
module.exports = { comparePrompts, formatComparison };
|
|
@@ -0,0 +1,213 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
/**
|
|
3
|
+
* eval-llm-judge.js — HC-019i LLM-as-judge evaluator.
|
|
4
|
+
*
|
|
5
|
+
* Uses an LLM to assess agent prompt quality against criteria that
|
|
6
|
+
* deterministic graders can't check (semantic meaning, completeness,
|
|
7
|
+
* reasoning quality).
|
|
8
|
+
*
|
|
9
|
+
* Pure helper with injected LLM call function.
|
|
10
|
+
* Integrates with HC-019j three-valued outcomes for non-deterministic results.
|
|
11
|
+
*/
|
|
12
|
+
const { classify, wrapDeterministic } = require('./eval-three-valued');
|
|
13
|
+
|
|
14
|
+
/**
|
|
15
|
+
* Judge criteria for LLM evaluation.
|
|
16
|
+
* Each criterion is a question the LLM answers YES/NO about the agent output.
|
|
17
|
+
*
|
|
18
|
+
* Eval scenario format for LLM-judge mode:
|
|
19
|
+
* ```yaml
|
|
20
|
+
* grading:
|
|
21
|
+
* mode: llm-judge
|
|
22
|
+
* criteria:
|
|
23
|
+
* - "Does the prompt clearly define the agent's role and responsibilities?"
|
|
24
|
+
* - "Does the prompt include error handling guidance?"
|
|
25
|
+
* - "Is the output format specification unambiguous?"
|
|
26
|
+
* runs: 3 # optional, default 1 for cost savings
|
|
27
|
+
* ```
|
|
28
|
+
*/
|
|
29
|
+
|
|
30
|
+
const JUDGE_SYSTEM_PROMPT = `You are an eval judge for AI agent prompts. You will be given an agent prompt and a list of criteria. For each criterion, answer YES or NO with a brief explanation.
|
|
31
|
+
|
|
32
|
+
Rules:
|
|
33
|
+
- Answer ONLY YES or NO for each criterion, followed by a one-sentence explanation
|
|
34
|
+
- Be strict — if the criterion is not clearly met, answer NO
|
|
35
|
+
- Format each answer on its own line as: "CRITERION_N: YES|NO — explanation"
|
|
36
|
+
- Do not add commentary outside the criterion answers`;
|
|
37
|
+
|
|
38
|
+
/**
 * Assemble the user prompt handed to the judge LLM.
 *
 * The prompt has three markdown sections: the agent prompt under evaluation
 * (only its first 8000 characters are included), the criteria numbered as
 * CRITERION_1..N, and a closing instruction asking for YES/NO answers.
 *
 * @param {string} agentPrompt — the agent prompt text to evaluate
 * @param {string[]} criteria — list of criteria to judge against
 * @returns {string}
 */
function buildJudgePrompt(agentPrompt, criteria) {
  const numbered = criteria.map((text, index) => {
    const label = `CRITERION_${index + 1}`;
    return `${label}: ${text}`;
  });
  const excerpt = agentPrompt.slice(0, 8000);

  const sections = [
    '## Agent Prompt to Evaluate',
    '',
    `${excerpt}`,
    '',
    '## Criteria to Judge',
    '',
    numbered.join('\n'),
    '',
    '## Your Judgement',
    '',
    'For each criterion, answer YES or NO with a brief explanation:',
  ];
  return sections.join('\n');
}
|
|
61
|
+
|
|
62
|
+
/**
 * Parse the LLM judge response into one structured result per criterion.
 *
 * For each criterion number N the strict form
 * `CRITERION_N: YES|NO <dash-or-colon> explanation` is tried first. If that
 * fails, a looser form (`CRITERION_N`, `#N` or `N.` followed by YES/NO) is
 * accepted without an explanation. A criterion that cannot be found is
 * reported as not passed.
 *
 * Matching is case-insensitive. Criterion numbers and YES/NO are matched as
 * whole tokens, so `11.` is never read as criterion 1 and `YESTERDAY` is never
 * read as YES.
 *
 * @param {string} response — LLM output (null/undefined is treated as empty)
 * @param {number} criteriaCount — expected number of criteria
 * @returns {Array<{ criterion: number, passed: boolean, explanation: string }>}
 */
function parseJudgeResponse(response, criteriaCount) {
  const text = String(response ?? '');
  const results = [];

  for (let i = 1; i <= criteriaCount; i++) {
    // Strict form; accepts hyphen, em dash, en dash or colon before the explanation.
    const pattern = new RegExp(
      `(?<![A-Za-z0-9_])CRITERION_${i}\\s*:\\s*(YES|NO)\\b\\s*[-—–:]\\s*(.+)`,
      'i',
    );
    const match = text.match(pattern);

    if (match) {
      results.push({
        criterion: i,
        passed: match[1].toUpperCase() === 'YES',
        explanation: match[2].trim(),
      });
      continue;
    }

    // Loose form; the lookbehinds stop "N." matching the tail of a longer number or word.
    const loosePattern = new RegExp(
      `(?:(?<![A-Za-z0-9_])CRITERION_${i}|#${i}|(?<![\\w.])${i}\\.)\\s*:?\\s*(YES|NO)\\b`,
      'i',
    );
    const looseMatch = text.match(loosePattern);
    results.push({
      criterion: i,
      passed: looseMatch ? looseMatch[1].toUpperCase() === 'YES' : false,
      explanation: looseMatch ? 'parsed from loose format' : 'could not parse response',
    });
  }

  return results;
}
|
|
95
|
+
|
|
96
|
+
/**
 * Perform one judge pass: build the user prompt, call the injected LLM once
 * with JUDGE_SYSTEM_PROMPT, and parse the reply into per-criterion results.
 *
 * @param {object} opts
 * @param {string} opts.agentPrompt — prompt text to evaluate
 * @param {string[]} opts.criteria — list of criteria
 * @param {(systemPrompt: string, userPrompt: string) => Promise<string>} opts.callLLM — injected LLM call
 * @returns {Promise<{ passed: boolean, criteriaResults: Array, rawResponse: string }>}
 *   `passed` is true only when no parsed criterion is marked as failed.
 */
async function runJudge({ agentPrompt, criteria, callLLM }) {
  const judgeRequest = buildJudgePrompt(agentPrompt, criteria);
  const rawResponse = await callLLM(JUDGE_SYSTEM_PROMPT, judgeRequest);
  const criteriaResults = parseJudgeResponse(rawResponse, criteria.length);

  const anyFailed = criteriaResults.some((entry) => !entry.passed);

  return { passed: !anyFailed, criteriaResults, rawResponse };
}
|
|
116
|
+
|
|
117
|
+
/**
 * Run an LLM-judge scenario, optionally repeating it for a three-valued verdict.
 *
 * - No criteria configured: returns an 'error' result (type 'config').
 * - `grading.runs` <= 1 (or unset): one judge pass, wrapped as a deterministic
 *   pass/fail result; an LLM exception yields an 'error' result (type 'llm_error').
 * - `grading.runs` > 1: runs the judge sequentially that many times and
 *   classifies the outcomes with classify(). A run that throws counts as a
 *   failed outcome. If EVERY run throws, the scenario is reported as 'error',
 *   consistent with the single-run path, because no run produced a judgement.
 *
 * @param {object} opts
 * @param {object} opts.scenario — eval scenario with grading.mode = 'llm-judge'
 * @param {string} opts.agentPrompt — prompt text
 * @param {(systemPrompt: string, userPrompt: string) => Promise<string>} opts.callLLM
 * @param {object} [opts.thresholds] — pass/fail thresholds forwarded to classify()
 * @returns {Promise<object>} — scenario result with verdict + confidence
 */
async function runJudgeScenario({ scenario, agentPrompt, callLLM, thresholds }) {
  const criteria = scenario.grading?.criteria || [];
  const runs = scenario.grading?.runs || 1;

  // Fields shared by every result shape this function returns.
  const identity = {
    id: scenario.id,
    agent: scenario.evalAgent || scenario.agent,
    name: scenario.name || scenario.id,
  };
  const errorResult = (type, detail) => wrapDeterministic({
    ...identity,
    result: 'error',
    checks: 0,
    checks_passed: 0,
    failures: [{ type, passed: false, detail }],
  });

  if (criteria.length === 0) {
    return errorResult('config', 'no criteria defined for llm-judge');
  }

  // Single run — return deterministic result
  if (runs <= 1) {
    try {
      const judgeResult = await runJudge({ agentPrompt, criteria, callLLM });
      return wrapDeterministic({
        ...identity,
        result: judgeResult.passed ? 'pass' : 'fail',
        checks: criteria.length,
        checks_passed: judgeResult.criteriaResults.filter(r => r.passed).length,
        failures: judgeResult.criteriaResults
          .filter(r => !r.passed)
          .map(r => ({ type: `criterion_${r.criterion}`, passed: false, detail: r.explanation })),
        judgeDetails: judgeResult.criteriaResults,
      });
    } catch (e) {
      return errorResult('llm_error', e.message);
    }
  }

  // Multiple runs — use three-valued classification
  const outcomes = [];
  const allDetails = [];
  const errorMessages = [];

  for (let i = 0; i < runs; i++) {
    try {
      const judgeResult = await runJudge({ agentPrompt, criteria, callLLM });
      outcomes.push(judgeResult.passed);
      allDetails.push(judgeResult);
    } catch (e) {
      outcomes.push(false);
      allDetails.push({ passed: false, error: e.message });
      errorMessages.push(e.message);
    }
  }

  // The judge never answered, so this is a broken eval, not evidence of a bad prompt.
  if (outcomes.length > 0 && errorMessages.length === outcomes.length) {
    const lastError = errorMessages[errorMessages.length - 1];
    return {
      ...errorResult('llm_error', `all ${outcomes.length} judge runs errored; last error: ${lastError}`),
      runs_passed: 0,
      runs: outcomes.length,
      runDetails: allDetails,
    };
  }

  const classification = classify(outcomes, thresholds);

  return {
    ...identity,
    result: classification.verdict,
    verdict: classification.verdict,
    confidence: classification.confidence,
    deterministic: false,
    checks: criteria.length,
    // Per-criterion counts are not aggregated across runs: all-or-nothing by verdict.
    checks_passed: classification.verdict === 'pass' ? criteria.length : 0,
    runs_passed: classification.passed,
    failures: classification.verdict === 'fail'
      ? [{ type: 'llm_judge', passed: false, detail: classification.details }]
      : [],
    runs: classification.runs,
    runDetails: allDetails,
  };
}
|
|
206
|
+
|
|
207
|
+
module.exports = {
|
|
208
|
+
JUDGE_SYSTEM_PROMPT,
|
|
209
|
+
buildJudgePrompt,
|
|
210
|
+
parseJudgeResponse,
|
|
211
|
+
runJudge,
|
|
212
|
+
runJudgeScenario,
|
|
213
|
+
};
|
package/lib/eval-runner.js
CHANGED
|
@@ -8,6 +8,7 @@
|
|
|
8
8
|
* Pure helper with injected I/O (readFile, listDir).
|
|
9
9
|
*/
|
|
10
10
|
const { runCheck } = require('./eval-graders');
|
|
11
|
+
const { wrapDeterministic } = require('./eval-three-valued');
|
|
11
12
|
|
|
12
13
|
/**
|
|
13
14
|
* Load eval scenarios from the evals directory.
|
|
@@ -115,7 +116,7 @@ function runAllScenarios(scenarios, agentPrompts, opts = {}) {
|
|
|
115
116
|
const promptText = agentPrompts[agentName];
|
|
116
117
|
|
|
117
118
|
if (!promptText && !scenario.loadError) {
|
|
118
|
-
results.push({
|
|
119
|
+
results.push(wrapDeterministic({
|
|
119
120
|
id: scenario.id,
|
|
120
121
|
agent: agentName,
|
|
121
122
|
name: scenario.name || scenario.id,
|
|
@@ -123,12 +124,12 @@ function runAllScenarios(scenarios, agentPrompts, opts = {}) {
|
|
|
123
124
|
checks: 0,
|
|
124
125
|
checks_passed: 0,
|
|
125
126
|
failures: [{ type: 'missing_prompt', passed: false, detail: `agent "${agentName}" not found in AGENT_PROMPTS` }],
|
|
126
|
-
});
|
|
127
|
+
}));
|
|
127
128
|
continue;
|
|
128
129
|
}
|
|
129
130
|
|
|
130
131
|
const result = runScenario(scenario, promptText || '');
|
|
131
|
-
results.push(result);
|
|
132
|
+
results.push(wrapDeterministic(result));
|
|
132
133
|
|
|
133
134
|
if (opts.failFast && result.result !== 'pass') break;
|
|
134
135
|
}
|
|
@@ -163,8 +164,10 @@ function formatResults(results, format = 'pretty') {
|
|
|
163
164
|
for (const [agent, scenarios] of Object.entries(byAgent)) {
|
|
164
165
|
lines.push(`${agent} (${scenarios.length} scenarios)`);
|
|
165
166
|
for (const s of scenarios) {
|
|
166
|
-
const
|
|
167
|
-
|
|
167
|
+
const verdict = s.verdict || s.result;
|
|
168
|
+
const icon = verdict === 'pass' ? 'PASS' : verdict === 'fail' ? 'FAIL' : verdict === 'inconclusive' ? '????' : 'ERR ';
|
|
169
|
+
const conf = s.confidence != null && !s.deterministic ? ` ${s.confidence}%` : '';
|
|
170
|
+
lines.push(` [${icon}] ${s.id}: ${s.name} (${s.checks_passed}/${s.checks} checks${conf})`);
|
|
168
171
|
for (const f of s.failures) {
|
|
169
172
|
lines.push(` x ${f.type}: ${f.detail}`);
|
|
170
173
|
}
|
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
/**
|
|
3
|
+
* eval-three-valued.js — HC-019j three-valued test outcomes.
|
|
4
|
+
*
|
|
5
|
+
* Replaces binary pass/fail with Pass/Fail/Inconclusive for
|
|
6
|
+
* non-deterministic evaluations (LLM-as-judge, HC-019i).
|
|
7
|
+
*
|
|
8
|
+
* For deterministic checks (current graders), results are always
|
|
9
|
+
* definitive — Pass or Fail, never Inconclusive.
|
|
10
|
+
*
|
|
11
|
+
* For non-deterministic checks (future LLM-judge), the same eval
|
|
12
|
+
* is run N times and outcomes are classified statistically:
|
|
13
|
+
* - Pass: >= passThreshold of runs passed (default 80%)
|
|
14
|
+
* - Fail: >= failThreshold of runs failed (default 80%)
|
|
15
|
+
* - Inconclusive: neither threshold met (needs more runs or investigation)
|
|
16
|
+
*
|
|
17
|
+
* Based on AgentAssay (ICLR 2026) three-valued probabilistic outcomes.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
/**
 * Classify a set of run results into Pass/Fail/Inconclusive.
 *
 * Only outcomes strictly equal to `true` count as passes; everything else
 * counts as a failure. `confidence` is the integer percentage of runs that
 * agree with the verdict. For 'inconclusive' it is the larger of the pass and
 * fail percentages, except when there are too few runs, where it is the pass
 * percentage.
 *
 * @param {boolean[]} outcomes — array of pass/fail booleans from multiple runs
 * @param {object|null} [opts] — null is treated like an omitted options object
 * @param {number} [opts.passThreshold=0.8] — fraction of passes needed for Pass
 * @param {number} [opts.failThreshold=0.8] — fraction of fails needed for Fail
 * @param {number} [opts.minRuns=1] — minimum runs before classifying
 * @returns {{ verdict: 'pass'|'fail'|'inconclusive', confidence: number, runs, passed, failed, details }}
 */
function classify(outcomes, opts = {}) {
  // `opts ?? {}`: a default parameter covers undefined only, and callers may pass null.
  const { passThreshold = 0.8, failThreshold = 0.8, minRuns = 1 } = opts ?? {};

  if (!outcomes || outcomes.length === 0) {
    return { verdict: 'inconclusive', confidence: 0, runs: 0, passed: 0, failed: 0, details: 'no runs' };
  }

  const toPercent = (fraction) => Math.round(fraction * 100);

  const runs = outcomes.length;
  const passed = outcomes.filter(o => o === true).length;
  const failed = runs - passed;
  const passRate = passed / runs;
  const failRate = failed / runs;
  const counts = { runs, passed, failed };

  if (runs < minRuns) {
    return {
      verdict: 'inconclusive',
      confidence: toPercent(passRate),
      ...counts,
      details: `insufficient runs (${runs}/${minRuns})`,
    };
  }

  // Pass-priority: both thresholds can be satisfied at once only when they are
  // configured low enough (passThreshold + failThreshold <= 1). In that case
  // pass wins. This is optimistic: we assume the agent is correct unless
  // proven otherwise with enough evidence.
  if (passRate >= passThreshold) {
    return {
      verdict: 'pass',
      confidence: toPercent(passRate),
      ...counts,
      details: `${passed}/${runs} passed (${toPercent(passRate)}% >= ${toPercent(passThreshold)}% threshold)`,
    };
  }

  if (failRate >= failThreshold) {
    return {
      verdict: 'fail',
      confidence: toPercent(failRate),
      ...counts,
      details: `${failed}/${runs} failed (${toPercent(failRate)}% >= ${toPercent(failThreshold)}% threshold)`,
    };
  }

  return {
    verdict: 'inconclusive',
    confidence: toPercent(Math.max(passRate, failRate)),
    ...counts,
    details: `neither threshold met: ${toPercent(passRate)}% pass, ${toPercent(failRate)}% fail`,
  };
}
|
|
80
|
+
|
|
81
|
+
/**
 * Wilson score interval for an observed pass rate.
 *
 * All three returned values are rounded to three decimals; `lower` is clamped
 * to >= 0 and `upper` to <= 1. With zero runs the widest possible interval
 * (0..1, centred on 0.5) is returned.
 *
 * @param {number} passed — number of passes
 * @param {number} total — total runs
 * @param {number} [z=1.96] — z-score for confidence level (1.96 = 95%)
 * @returns {{ lower: number, upper: number, center: number }}
 */
function wilsonInterval(passed, total, z = 1.96) {
  if (total === 0) {
    return { lower: 0, upper: 1, center: 0.5 };
  }

  const roundTo3 = (value) => Math.round(value * 1000) / 1000;

  const zSquared = z * z;
  const rate = passed / total;
  const scale = 1 + zSquared / total;
  const midpoint = (rate + zSquared / (2 * total)) / scale;
  const spreadTerm = (rate * (1 - rate) + zSquared / (4 * total)) / total;
  const halfWidth = (z * Math.sqrt(spreadTerm)) / scale;

  const lower = Math.max(0, roundTo3(midpoint - halfWidth));
  const upper = Math.min(1, roundTo3(midpoint + halfWidth));
  return { lower, upper, center: roundTo3(midpoint) };
}
|
|
104
|
+
|
|
105
|
+
/**
 * Decide whether running an inconclusive eval more times is worthwhile.
 *
 * Definitive verdicts never need more runs. With zero runs, three are
 * suggested. Otherwise the Wilson interval of the pass rate is examined: if it
 * is wider than 0.3 and the run budget is not exhausted, more runs are
 * suggested (half the current count plus two, capped by the remaining budget).
 * A narrower interval is reported as genuinely borderline.
 *
 * @param {object} result — from classify()
 * @param {object} [opts]
 * @param {number} [opts.maxRuns=10] — run budget compared against result.runs
 * @returns {{ recommend: boolean, additionalRuns: number, reason: string }}
 */
function recommendMoreRuns(result, opts = {}) {
  const { maxRuns = 10 } = opts;
  const decline = (reason) => ({ recommend: false, additionalRuns: 0, reason });
  const advise = (additionalRuns, reason) => ({ recommend: true, additionalRuns, reason });

  if (result.verdict !== 'inconclusive') return decline('verdict is definitive');
  if (result.runs === 0) return advise(3, 'no runs yet');

  const { lower, upper } = wilsonInterval(result.passed, result.runs);
  const width = upper - lower;
  const isWide = width > 0.3;
  const hasBudget = result.runs < maxRuns;

  if (isWide && hasBudget) {
    // A wide interval means extra samples can still move the verdict.
    const remaining = maxRuns - result.runs;
    const suggested = Math.ceil(result.runs * 0.5) + 2;
    return advise(Math.min(remaining, suggested), `wide confidence interval (${lower}-${upper})`);
  }

  // Narrow but still inconclusive: the pass rate really sits between the thresholds.
  return decline(`borderline result (${lower}-${upper}), more runs unlikely to resolve`);
}
|
|
136
|
+
|
|
137
|
+
/**
 * Decorate a deterministic scenario result with three-valued outcome fields.
 *
 * `result` values 'pass' and 'fail' map to the same verdict with confidence
 * 100. Any other value maps to verdict 'error' with confidence 0: the eval
 * itself broke, which is distinct from an inconclusive (flaky) outcome.
 * The input object is not modified.
 *
 * @param {object} scenarioResult — from runScenario()
 * @returns {object} — copy of the input with verdict, confidence and deterministic: true added
 */
function wrapDeterministic(scenarioResult) {
  const { result } = scenarioResult;
  const isDefinitive = result === 'pass' || result === 'fail';
  const verdict = isDefinitive ? result : 'error';

  return Object.assign({}, scenarioResult, {
    verdict,
    confidence: isDefinitive ? 100 : 0,
    deterministic: true,
  });
}
|
|
157
|
+
|
|
158
|
+
module.exports = { classify, wilsonInterval, recommendMoreRuns, wrapDeterministic };
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@hone-ai/cli",
|
|
3
|
-
"version": "1.
|
|
3
|
+
"version": "1.7.1",
|
|
4
4
|
"description": "Hone AI — Enterprise SDLC Pipeline CLI",
|
|
5
5
|
"main": "hone-cli.js",
|
|
6
6
|
"bin": {
|
|
@@ -10,7 +10,8 @@
|
|
|
10
10
|
"bin/",
|
|
11
11
|
"hone-cli.js",
|
|
12
12
|
"lib/",
|
|
13
|
-
"!lib/*.test.js"
|
|
13
|
+
"!lib/*.test.js",
|
|
14
|
+
"schema/"
|
|
14
15
|
],
|
|
15
16
|
"scripts": {
|
|
16
17
|
"test": "echo \"No tests yet\" && exit 0",
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
3
|
+
"$id": "https://hone.ai/schema/metadata.schema.json",
|
|
4
|
+
"title": "Hone SDLC Pipeline Story Metadata",
|
|
5
|
+
"description": "Schema for .github/pipeline/<STORY-ID>/metadata.yml. Implements SC-010 §10 (metadata.yml as wire protocol) — fields read by 3+ agents must validate against this schema before agents run, otherwise typos silently degrade to no-op behavior.",
|
|
6
|
+
"type": "object",
|
|
7
|
+
"additionalProperties": true,
|
|
8
|
+
"required": ["story_id", "title", "branch", "base", "steps"],
|
|
9
|
+
"properties": {
|
|
10
|
+
"story_id": {
|
|
11
|
+
"type": "string",
|
|
12
|
+
"minLength": 1,
|
|
13
|
+
"description": "Story identifier. Hone-server convention: SC-NNN, H-NNN, RP-NNN, AU-NNN, SR-NNN, HC-NNN; OptionsFlow convention: E-NNN-X. Adopters may use any pattern."
|
|
14
|
+
},
|
|
15
|
+
"title": {
|
|
16
|
+
"type": "string",
|
|
17
|
+
"minLength": 1,
|
|
18
|
+
"description": "Human-readable story title."
|
|
19
|
+
},
|
|
20
|
+
"issue": {
|
|
21
|
+
"type": ["integer", "string", "null"],
|
|
22
|
+
"description": "GitHub issue reference. Modern convention: integer issue number or null. Legacy convention: full GitHub issue URL string. Both accepted."
|
|
23
|
+
},
|
|
24
|
+
"branch": {
|
|
25
|
+
"type": "string",
|
|
26
|
+
"minLength": 1,
|
|
27
|
+
"description": "Git branch name. Convention: feat/<STORY-ID>-<slug> | fix/<STORY-ID>-<slug> | chore/..."
|
|
28
|
+
},
|
|
29
|
+
"base": {
|
|
30
|
+
"type": "string",
|
|
31
|
+
"minLength": 1,
|
|
32
|
+
"description": "Base branch the story merges into. Convention: develop. Stacked story chains (story B branched off feature/A) may use feat/...-style base names. Any non-empty string is accepted here; adopter override schemas may restrict it to an enum."
|
|
33
|
+
},
|
|
34
|
+
"captured_at": {
|
|
35
|
+
"type": ["string", "null"],
|
|
36
|
+
"description": "ISO date when the story was captured (YYYY-MM-DD). May be null."
|
|
37
|
+
},
|
|
38
|
+
"type": {
|
|
39
|
+
"type": "string",
|
|
40
|
+
"enum": ["feature", "enhancement", "bug", "bug_fix", "chore", "refactor", "docs", "meta-epic", "fix"],
|
|
41
|
+
"description": "Story type per SC-001 classifier vocabulary. 'bug_fix' is a legacy alias for 'bug'/'fix'; both accepted."
|
|
42
|
+
},
|
|
43
|
+
"priority": {
|
|
44
|
+
"type": "string",
|
|
45
|
+
"description": "Adopter priority. Optional. No canonical enum — different stories use P0/P1/P2/P3, high/medium/low, M, low-medium, etc. Adopters with strict policies override via --schema."
|
|
46
|
+
},
|
|
47
|
+
"phase": {
|
|
48
|
+
"type": ["string", "number"],
|
|
49
|
+
"description": "Story lifecycle phase. Modern convention: string ('backlog', 'in-progress', 'blocked', 'done', 'completed'). Legacy convention: numeric phase identifier (e.g. 6.2, 2.1). Both accepted; adopter override schema may tighten."
|
|
50
|
+
},
|
|
51
|
+
"story_type": {
|
|
52
|
+
"type": "string",
|
|
53
|
+
"description": "Free-form story type label (legacy field; prefer 'type' for new stories)."
|
|
54
|
+
},
|
|
55
|
+
"fast_track": {
|
|
56
|
+
"type": "boolean",
|
|
57
|
+
"description": "True if story is on the fast-track pipeline (skip steps 0-3 gates). Per SC-001 classifier output."
|
|
58
|
+
},
|
|
59
|
+
"hot_fix": {
|
|
60
|
+
"type": "boolean",
|
|
61
|
+
"description": "True if story is on the hot-fix pipeline. Per SC-001 classifier output."
|
|
62
|
+
},
|
|
63
|
+
"fix_for": {
|
|
64
|
+
"type": ["string", "null"],
|
|
65
|
+
"description": "Story ID this fixes (for regression-test policy per H-030 + SC-009 §Guardrail-before-fix). Null when this story is not a fix."
|
|
66
|
+
},
|
|
67
|
+
"parent_story": {
|
|
68
|
+
"type": ["string", "null"],
|
|
69
|
+
"description": "Parent story ID for sub-stories or follow-ups. Null at top level."
|
|
70
|
+
},
|
|
71
|
+
"sibling_pipelines": {
|
|
72
|
+
"type": ["array", "null"],
|
|
73
|
+
"items": { "type": "string" },
|
|
74
|
+
"description": "Story IDs sharing this pipeline. Used for multi-story epics."
|
|
75
|
+
},
|
|
76
|
+
"author": {
|
|
77
|
+
"type": ["string", "null"]
|
|
78
|
+
},
|
|
79
|
+
"created": {
|
|
80
|
+
"type": ["string", "null"]
|
|
81
|
+
},
|
|
82
|
+
"base_sha": {
|
|
83
|
+
"type": ["string", "number", "null"],
|
|
84
|
+
"description": "Base commit SHA. Permissive type — unquoted short SHAs that look numeric (e.g. 92192e7, which reads as exponent notation) are parsed as numbers by js-yaml. Adopters who want strict SHA strings can override the schema."
|
|
85
|
+
},
|
|
86
|
+
"steps": {
|
|
87
|
+
"type": "object",
|
|
88
|
+
"additionalProperties": false,
|
|
89
|
+
"description": "Pipeline step status table. Wire-protocol object — no extra step IDs allowed.",
|
|
90
|
+
"properties": {
|
|
91
|
+
"step_0": { "$ref": "#/$defs/step" },
|
|
92
|
+
"step_1": { "$ref": "#/$defs/step" },
|
|
93
|
+
"step_2": { "$ref": "#/$defs/step" },
|
|
94
|
+
"step_3a": { "$ref": "#/$defs/step" },
|
|
95
|
+
"step_3b": { "$ref": "#/$defs/step" },
|
|
96
|
+
"step_4": { "$ref": "#/$defs/step" },
|
|
97
|
+
"step_5": { "$ref": "#/$defs/step" },
|
|
98
|
+
"step_5b": { "$ref": "#/$defs/step" },
|
|
99
|
+
"step_5c": { "$ref": "#/$defs/step" }
|
|
100
|
+
}
|
|
101
|
+
},
|
|
102
|
+
"cross_validation": {
|
|
103
|
+
"type": "object",
|
|
104
|
+
"additionalProperties": true,
|
|
105
|
+
"description": "Cross-validation findings per H-035 + SC-001..SC-005."
|
|
106
|
+
},
|
|
107
|
+
"self_applied_classifier": {
|
|
108
|
+
"type": "object",
|
|
109
|
+
"additionalProperties": true,
|
|
110
|
+
"description": "SC-001 classifier output recorded for the story. Tracks the classification decision so future readers can audit the routing."
|
|
111
|
+
}
|
|
112
|
+
},
|
|
113
|
+
"$defs": {
|
|
114
|
+
"step": {
|
|
115
|
+
"type": "object",
|
|
116
|
+
"additionalProperties": true,
|
|
117
|
+
"description": "Per-step status. The wire-protocol fields (status, agent) are validated; adopter and historical extension fields (acceptance_criteria_count, acs_met, automation_rationale, bug_caught_in_step, etc.) are passed through. Set additionalProperties: false in your adopter override schema for stricter enforcement.",
|
|
118
|
+
"properties": {
|
|
119
|
+
"status": {
|
|
120
|
+
"type": "string",
|
|
121
|
+
"description": "Step status. Hone-server uses 'completed'/'in_progress'/'skipped'; older files may use 'complete'/'in-progress'. Any string is accepted here, so both spellings validate. Adopter override schemas may restrict this to an enum."
|
|
122
|
+
},
|
|
123
|
+
"agent": {
|
|
124
|
+
"type": "string",
|
|
125
|
+
"description": "Agent ID (story-groomer, implementation-planner, unit-test-writer, e2e-qa-planner, e2e-test-spec-writer, code-builder, code-reviewer, delivery-architect, etc.)."
|
|
126
|
+
},
|
|
127
|
+
"artifact": {
|
|
128
|
+
"type": ["string", "null"],
|
|
129
|
+
"description": "Path or filename of the step's artifact (e.g., step-0-grooming.md). May be null if the step is pending."
|
|
130
|
+
}
|
|
131
|
+
}
|
|
132
|
+
}
|
|
133
|
+
}
|
|
134
|
+
}
|