@hone-ai/cli 1.6.0 → 1.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,213 @@
1
+ 'use strict';
2
+ /**
3
+ * eval-llm-judge.js — HC-019i LLM-as-judge evaluator.
4
+ *
5
+ * Uses an LLM to assess agent prompt quality against criteria that
6
+ * deterministic graders can't check (semantic meaning, completeness,
7
+ * reasoning quality).
8
+ *
9
+ * Pure helper with injected LLM call function.
10
+ * Integrates with HC-019j three-valued outcomes for non-deterministic results.
11
+ */
12
+ const { classify, wrapDeterministic } = require('./eval-three-valued');
13
+
14
+ /**
15
+ * Judge criteria for LLM evaluation.
16
+ * Each criterion is a question the LLM answers YES/NO about the agent output.
17
+ *
18
+ * Eval scenario format for LLM-judge mode:
19
+ * ```yaml
20
+ * grading:
21
+ * mode: llm-judge
22
+ * criteria:
23
+ * - "Does the prompt clearly define the agent's role and responsibilities?"
24
+ * - "Does the prompt include error handling guidance?"
25
+ * - "Is the output format specification unambiguous?"
26
+ * runs: 3 # optional, default 1 for cost savings
27
+ * ```
28
+ */
29
+
30
// System prompt for the judge model. runJudge() passes it as the first
// argument of callLLM. It instructs the model to answer each criterion on
// its own line in the form "CRITERION_N: YES|NO — explanation".
+ const JUDGE_SYSTEM_PROMPT = `You are an eval judge for AI agent prompts. You will be given an agent prompt and a list of criteria. For each criterion, answer YES or NO with a brief explanation.
31
+
32
+ Rules:
33
+ - Answer ONLY YES or NO for each criterion, followed by a one-sentence explanation
34
+ - Be strict — if the criterion is not clearly met, answer NO
35
+ - Format each answer on its own line as: "CRITERION_N: YES|NO — explanation"
36
+ - Do not add commentary outside the criterion answers`;
37
+
38
/**
 * Build the user prompt handed to the judge LLM.
 *
 * The prompt has three sections: the agent prompt under evaluation, the
 * numbered criteria (`CRITERION_1`, `CRITERION_2`, ...), and a closing
 * instruction asking for a YES/NO answer per criterion.
 *
 * The agent prompt is capped at `maxPromptChars` UTF-16 code units. No
 * truncation marker is inserted, so the judge only sees the leading part of
 * longer prompts.
 *
 * @param {string} agentPrompt — the agent prompt text to evaluate
 * @param {string[]} criteria — list of criteria to judge against
 * @param {number} [maxPromptChars=8000] — maximum number of UTF-16 code units
 *   of `agentPrompt` to include
 * @returns {string}
 */
function buildJudgePrompt(agentPrompt, criteria, maxPromptChars = 8000) {
  const criteriaList = criteria
    .map((c, i) => `CRITERION_${i + 1}: ${c}`)
    .join('\n');

  let excerpt = agentPrompt.slice(0, maxPromptChars);

  // slice() counts UTF-16 code units, so the cut can land between the two
  // halves of a surrogate pair (e.g. an emoji). Drop a dangling high
  // surrogate so the prompt never contains an unpaired code unit.
  if (excerpt.length < agentPrompt.length) {
    const lastUnit = excerpt.charCodeAt(excerpt.length - 1);
    if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
      excerpt = excerpt.slice(0, -1);
    }
  }

  return `## Agent Prompt to Evaluate

${excerpt}

## Criteria to Judge

${criteriaList}

## Your Judgement

For each criterion, answer YES or NO with a brief explanation:`;
}
61
+
62
/**
 * Parse the LLM judge response into structured per-criterion results.
 *
 * For each criterion number 1..criteriaCount two patterns are tried:
 *   1. Strict: `CRITERION_N: YES|NO <dash> explanation`, where the dash may
 *      be a hyphen, en dash or em dash. The rest of the line becomes the
 *      explanation.
 *   2. Loose: `CRITERION_N`, `#N` or `N.` followed by an optional colon and
 *      YES/NO. The explanation is then the fixed text
 *      'parsed from loose format'.
 * A criterion that matches neither pattern is reported as not passed with
 * the explanation 'could not parse response'.
 *
 * Matching is case-insensitive. Word boundaries keep `1.` from matching the
 * tail of `11.` and keep `NO` from matching the start of words like `NOT`.
 *
 * @param {string} response — LLM output
 * @param {number} criteriaCount — expected number of criteria
 * @returns {Array<{ criterion: number, passed: boolean, explanation: string }>}
 */
function parseJudgeResponse(response, criteriaCount) {
  const results = [];

  for (let i = 1; i <= criteriaCount; i++) {
    const strictPattern = new RegExp(`\\bCRITERION_${i}:\\s*(YES|NO)\\s*[-–—]\\s*(.+)`, 'i');
    const strictMatch = response.match(strictPattern);

    if (strictMatch) {
      results.push({
        criterion: i,
        passed: strictMatch[1].toUpperCase() === 'YES',
        explanation: strictMatch[2].trim(),
      });
      continue;
    }

    // Looser fallback. `\b` before the bare number prevents "11." from being
    // read as criterion 1; `\b` after the verdict rejects "NOT", "NONE", etc.
    const loosePattern = new RegExp(
      `(?:\\bCRITERION_${i}|#${i}|\\b${i}\\.)\\s*:?\\s*(YES|NO)\\b`,
      'i',
    );
    const looseMatch = response.match(loosePattern);
    results.push({
      criterion: i,
      passed: looseMatch ? looseMatch[1].toUpperCase() === 'YES' : false,
      explanation: looseMatch ? 'parsed from loose format' : 'could not parse response',
    });
  }

  return results;
}
95
+
96
/**
 * Perform one judge pass: build the user prompt, send it to the injected
 * LLM together with JUDGE_SYSTEM_PROMPT, and parse the reply.
 *
 * `passed` is true only when no parsed criterion result is marked as failed.
 *
 * @param {object} opts
 * @param {string} opts.agentPrompt — prompt text to evaluate
 * @param {string[]} opts.criteria — list of criteria
 * @param {(systemPrompt: string, userPrompt: string) => Promise<string>} opts.callLLM — injected LLM call
 * @returns {Promise<{ passed: boolean, criteriaResults: Array, rawResponse: string }>}
 */
async function runJudge({ agentPrompt, criteria, callLLM }) {
  const rawResponse = await callLLM(
    JUDGE_SYSTEM_PROMPT,
    buildJudgePrompt(agentPrompt, criteria),
  );
  const criteriaResults = parseJudgeResponse(rawResponse, criteria.length);
  const anyFailed = criteriaResults.some((entry) => !entry.passed);

  return { passed: !anyFailed, criteriaResults, rawResponse };
}
116
+
117
/**
 * Run an LLM-judge scenario, optionally repeating the judge call so the
 * outcome can be classified as pass / fail / inconclusive.
 *
 * Behaviour by case:
 * - No criteria: returns an `error` result with a `config` failure.
 * - One run (the default): returns a deterministic-style result via
 *   wrapDeterministic(); a throwing `callLLM` yields an `error` result with
 *   an `llm_error` failure.
 * - Several runs: each run's pass/fail is fed to classify(). A run whose
 *   judge call throws is counted as a failed run. If *every* run throws the
 *   eval itself is broken, so an `error` result is returned instead of a
 *   misleading `fail` — matching what the single-run path reports.
 *
 * `grading.runs` is coerced to a number; values that are not finite or not
 * greater than 1 mean a single run, and fractional values are rounded up.
 *
 * In the multi-run result `checks_passed` is coarse: it is `criteria.length`
 * when the verdict is `pass` and 0 otherwise.
 *
 * @param {object} opts
 * @param {object} opts.scenario — eval scenario with grading.mode = 'llm-judge'
 * @param {string} opts.agentPrompt — prompt text
 * @param {(systemPrompt: string, userPrompt: string) => Promise<string>} opts.callLLM
 * @param {object} [opts.thresholds] — pass/fail thresholds for classify()
 * @returns {Promise<object>} — scenario result with verdict + confidence
 */
async function runJudgeScenario({ scenario, agentPrompt, callLLM, thresholds }) {
  const criteria = scenario.grading?.criteria || [];
  const requestedRuns = Number(scenario.grading?.runs);
  const runs = Number.isFinite(requestedRuns) && requestedRuns > 1
    ? Math.ceil(requestedRuns)
    : 1;

  // Fields shared by every result shape this function returns.
  const identity = {
    id: scenario.id,
    agent: scenario.evalAgent || scenario.agent,
    name: scenario.name || scenario.id,
  };

  // Thrown values are not guaranteed to be Error instances.
  const describeError = (e) => (e instanceof Error ? e.message : String(e));

  const errorResult = (type, detail) => wrapDeterministic({
    ...identity,
    result: 'error',
    checks: 0,
    checks_passed: 0,
    failures: [{ type, passed: false, detail }],
  });

  if (criteria.length === 0) {
    return errorResult('config', 'no criteria defined for llm-judge');
  }

  // Single run — return deterministic result
  if (runs === 1) {
    try {
      const judgeResult = await runJudge({ agentPrompt, criteria, callLLM });
      return wrapDeterministic({
        ...identity,
        result: judgeResult.passed ? 'pass' : 'fail',
        checks: criteria.length,
        checks_passed: judgeResult.criteriaResults.filter(r => r.passed).length,
        failures: judgeResult.criteriaResults
          .filter(r => !r.passed)
          .map(r => ({ type: `criterion_${r.criterion}`, passed: false, detail: r.explanation })),
        judgeDetails: judgeResult.criteriaResults,
      });
    } catch (e) {
      return errorResult('llm_error', describeError(e));
    }
  }

  // Multiple runs — use three-valued classification.
  // Runs are sequential on purpose: each is an independent LLM call and the
  // original implementation did not issue them concurrently.
  const outcomes = [];
  const allDetails = [];
  let erroredRuns = 0;
  let lastError = '';

  for (let i = 0; i < runs; i++) {
    try {
      const judgeResult = await runJudge({ agentPrompt, criteria, callLLM });
      outcomes.push(judgeResult.passed);
      allDetails.push(judgeResult);
    } catch (e) {
      erroredRuns += 1;
      lastError = describeError(e);
      outcomes.push(false);
      allDetails.push({ passed: false, error: lastError });
    }
  }

  // Nothing was actually judged: report a broken eval, not a failed agent.
  if (erroredRuns === runs) {
    return {
      ...errorResult('llm_error', `all ${runs} judge runs errored: ${lastError}`),
      runs,
      runDetails: allDetails,
    };
  }

  const classification = classify(outcomes, thresholds);

  return {
    ...identity,
    result: classification.verdict,
    verdict: classification.verdict,
    confidence: classification.confidence,
    deterministic: false,
    checks: criteria.length,
    checks_passed: classification.verdict === 'pass' ? criteria.length : 0,
    runs_passed: classification.passed,
    failures: classification.verdict === 'fail'
      ? [{ type: 'llm_judge', passed: false, detail: classification.details }]
      : [],
    runs: classification.runs,
    runDetails: allDetails,
  };
}
206
+
207
+ module.exports = {
208
+ JUDGE_SYSTEM_PROMPT,
209
+ buildJudgePrompt,
210
+ parseJudgeResponse,
211
+ runJudge,
212
+ runJudgeScenario,
213
+ };
@@ -8,6 +8,7 @@
8
8
  * Pure helper with injected I/O (readFile, listDir).
9
9
  */
10
10
  const { runCheck } = require('./eval-graders');
11
+ const { wrapDeterministic } = require('./eval-three-valued');
11
12
 
12
13
  /**
13
14
  * Load eval scenarios from the evals directory.
@@ -115,7 +116,7 @@ function runAllScenarios(scenarios, agentPrompts, opts = {}) {
115
116
  const promptText = agentPrompts[agentName];
116
117
 
117
118
  if (!promptText && !scenario.loadError) {
118
- results.push({
119
+ results.push(wrapDeterministic({
119
120
  id: scenario.id,
120
121
  agent: agentName,
121
122
  name: scenario.name || scenario.id,
@@ -123,12 +124,12 @@ function runAllScenarios(scenarios, agentPrompts, opts = {}) {
123
124
  checks: 0,
124
125
  checks_passed: 0,
125
126
  failures: [{ type: 'missing_prompt', passed: false, detail: `agent "${agentName}" not found in AGENT_PROMPTS` }],
126
- });
127
+ }));
127
128
  continue;
128
129
  }
129
130
 
130
131
  const result = runScenario(scenario, promptText || '');
131
- results.push(result);
132
+ results.push(wrapDeterministic(result));
132
133
 
133
134
  if (opts.failFast && result.result !== 'pass') break;
134
135
  }
@@ -163,8 +164,10 @@ function formatResults(results, format = 'pretty') {
163
164
  for (const [agent, scenarios] of Object.entries(byAgent)) {
164
165
  lines.push(`${agent} (${scenarios.length} scenarios)`);
165
166
  for (const s of scenarios) {
166
- const icon = s.result === 'pass' ? 'PASS' : s.result === 'fail' ? 'FAIL' : 'ERR ';
167
- lines.push(` [${icon}] ${s.id}: ${s.name} (${s.checks_passed}/${s.checks} checks)`);
167
+ const verdict = s.verdict || s.result;
168
+ const icon = verdict === 'pass' ? 'PASS' : verdict === 'fail' ? 'FAIL' : verdict === 'inconclusive' ? '????' : 'ERR ';
169
+ const conf = s.confidence != null && !s.deterministic ? ` ${s.confidence}%` : '';
170
+ lines.push(` [${icon}] ${s.id}: ${s.name} (${s.checks_passed}/${s.checks} checks${conf})`);
168
171
  for (const f of s.failures) {
169
172
  lines.push(` x ${f.type}: ${f.detail}`);
170
173
  }
@@ -0,0 +1,158 @@
1
+ 'use strict';
2
+ /**
3
+ * eval-three-valued.js — HC-019j three-valued test outcomes.
4
+ *
5
+ * Replaces binary pass/fail with Pass/Fail/Inconclusive for
6
+ * non-deterministic evaluations (LLM-as-judge, HC-019i).
7
+ *
8
+ * For deterministic checks (current graders), results are always
9
+ * definitive — Pass or Fail, never Inconclusive.
10
+ *
11
+ * For non-deterministic checks (LLM-judge, HC-019i), the same eval
12
+ * is run N times and outcomes are classified statistically:
13
+ * - Pass: >= passThreshold of runs passed (default 80%)
14
+ * - Fail: >= failThreshold of runs failed (default 80%)
15
+ * - Inconclusive: neither threshold met (needs more runs or investigation)
16
+ *
17
+ * Based on AgentAssay (ICLR 2026) three-valued probabilistic outcomes.
18
+ */
19
+
20
/**
 * Turn the pass/fail outcomes of repeated runs into a single verdict.
 *
 * Only values strictly equal to `true` count as passes; everything else
 * counts as a failure. Checks are applied in this order:
 *   1. no outcomes            -> inconclusive ('no runs')
 *   2. fewer than minRuns     -> inconclusive, confidence = pass percentage
 *   3. pass rate >= threshold -> pass
 *   4. fail rate >= threshold -> fail
 *   5. otherwise              -> inconclusive, confidence = larger percentage
 * Because the pass check comes first, pass wins whenever the thresholds are
 * low enough for both to be satisfied.
 *
 * @param {boolean[]} outcomes — array of pass/fail booleans from multiple runs
 * @param {object} [opts]
 * @param {number} [opts.passThreshold=0.8] — fraction of passes needed for Pass
 * @param {number} [opts.failThreshold=0.8] — fraction of fails needed for Fail
 * @param {number} [opts.minRuns=1] — minimum runs before classifying
 * @returns {{ verdict: 'pass'|'fail'|'inconclusive', confidence: number, runs, passed, failed, details }}
 */
function classify(outcomes, opts = {}) {
  const { passThreshold = 0.8, failThreshold = 0.8, minRuns = 1 } = opts;

  if (!outcomes || outcomes.length === 0) {
    return { verdict: 'inconclusive', confidence: 0, runs: 0, passed: 0, failed: 0, details: 'no runs' };
  }

  const runs = outcomes.length;
  let passed = 0;
  for (const outcome of outcomes) {
    if (outcome === true) passed += 1;
  }
  const failed = runs - passed;

  const passRate = passed / runs;
  const failRate = failed / runs;
  const toPercent = (fraction) => Math.round(fraction * 100);
  const passPct = toPercent(passRate);
  const failPct = toPercent(failRate);

  // Every non-empty outcome shares the same counters; only these three vary.
  const outcomeOf = (verdict, confidence, details) => ({
    verdict,
    confidence,
    runs,
    passed,
    failed,
    details,
  });

  if (runs < minRuns) {
    return outcomeOf('inconclusive', passPct, `insufficient runs (${runs}/${minRuns})`);
  }

  // Pass is tested before fail, so it takes priority when both could apply.
  if (passRate >= passThreshold) {
    return outcomeOf(
      'pass',
      passPct,
      `${passed}/${runs} passed (${passPct}% >= ${toPercent(passThreshold)}% threshold)`,
    );
  }

  if (failRate >= failThreshold) {
    return outcomeOf(
      'fail',
      failPct,
      `${failed}/${runs} failed (${failPct}% >= ${toPercent(failThreshold)}% threshold)`,
    );
  }

  return outcomeOf(
    'inconclusive',
    toPercent(Math.max(passRate, failRate)),
    `neither threshold met: ${passPct}% pass, ${failPct}% fail`,
  );
}
80
+
81
/**
 * Wilson score confidence interval for an observed pass rate.
 *
 * With no runs the widest possible interval is returned
 * ({ lower: 0, upper: 1, center: 0.5 }). Otherwise each bound is rounded to
 * three decimals, and lower/upper are clamped to the [0, 1] range.
 *
 * @param {number} passed — number of passes
 * @param {number} total — total runs
 * @param {number} [z=1.96] — z-score for confidence level (1.96 = 95%)
 * @returns {{ lower: number, upper: number, center: number }}
 */
function wilsonInterval(passed, total, z = 1.96) {
  if (total === 0) {
    return { lower: 0, upper: 1, center: 0.5 };
  }

  const roundTo3 = (value) => Math.round(value * 1000) / 1000;

  const p = passed / total;
  const zSquared = z * z;
  const denominator = 1 + zSquared / total;
  const center = (p + zSquared / (2 * total)) / denominator;
  const radicand = (p * (1 - p) + zSquared / (4 * total)) / total;
  const margin = (z * Math.sqrt(radicand)) / denominator;

  const lower = Math.max(0, roundTo3(center - margin));
  const upper = Math.min(1, roundTo3(center + margin));

  return { lower, upper, center: roundTo3(center) };
}
104
+
105
/**
 * Decide whether repeating an inconclusive eval is worthwhile.
 *
 * - A definitive verdict never needs more runs.
 * - With zero runs so far, three runs are recommended.
 * - Otherwise the Wilson interval of the pass rate is inspected: when it is
 *   wider than 0.3 and fewer than `maxRuns` runs have been made, more runs
 *   are recommended (half the current count rounded up, plus two, but never
 *   taking the total past `maxRuns`).
 * - In every remaining case (narrow interval, or the run cap already
 *   reached) no further runs are recommended.
 *
 * @param {object} result — from classify()
 * @param {object} [opts]
 * @param {number} [opts.maxRuns=10] — cap on the total number of runs
 * @returns {{ recommend: boolean, additionalRuns: number, reason: string }}
 */
function recommendMoreRuns(result, opts = {}) {
  const { maxRuns = 10 } = opts;
  const decline = (reason) => ({ recommend: false, additionalRuns: 0, reason });

  if (result.verdict !== 'inconclusive') {
    return decline('verdict is definitive');
  }

  if (result.runs === 0) {
    return { recommend: true, additionalRuns: 3, reason: 'no runs yet' };
  }

  const { lower, upper } = wilsonInterval(result.passed, result.runs);
  const isWide = upper - lower > 0.3;
  const belowCap = result.runs < maxRuns;

  if (isWide && belowCap) {
    // Grow by roughly half the current sample, without exceeding the cap.
    const headroom = maxRuns - result.runs;
    const step = Math.ceil(result.runs * 0.5) + 2;
    return {
      recommend: true,
      additionalRuns: Math.min(headroom, step),
      reason: `wide confidence interval (${lower}-${upper})`,
    };
  }

  return decline(`borderline result (${lower}-${upper}), more runs unlikely to resolve`);
}
136
+
137
/**
 * Add three-valued outcome fields to a deterministic eval result.
 *
 * The returned object is a shallow copy of the input with `verdict`,
 * `confidence` and `deterministic` set (overriding any fields of the same
 * name). A `result` of 'pass' or 'fail' is carried over as the verdict with
 * confidence 100; any other value becomes the verdict 'error' with
 * confidence 0. The verdict is never 'inconclusive'.
 *
 * @param {object} scenarioResult — from runScenario()
 * @returns {object} — same shape with verdict + confidence added
 */
function wrapDeterministic(scenarioResult) {
  const { result } = scenarioResult;

  // 'error' means the eval itself broke, which is distinct from a flaky
  // (inconclusive) outcome.
  const isDefinitive = result === 'pass' || result === 'fail';
  const verdict = isDefinitive ? result : 'error';
  const confidence = isDefinitive ? 100 : 0;

  return { ...scenarioResult, verdict, confidence, deterministic: true };
}
157
+
158
+ module.exports = { classify, wilsonInterval, recommendMoreRuns, wrapDeterministic };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@hone-ai/cli",
3
- "version": "1.6.0",
3
+ "version": "1.7.0",
4
4
  "description": "Hone AI — Enterprise SDLC Pipeline CLI",
5
5
  "main": "hone-cli.js",
6
6
  "bin": {