npm - @hone-ai/cli - Versions diffs - 1.6.0 → 1.7.0 - Mend

@hone-ai/cli 1.6.0 → 1.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

package/lib/eval-llm-judge.js +213 -0
package/lib/eval-runner.js +8 -5
package/lib/eval-three-valued.js +158 -0
package/package.json +1 -1

package/lib/eval-llm-judge.js ADDED Viewed

@@ -0,0 +1,213 @@
+'use strict';
+/**
+ * eval-llm-judge.js — HC-019i LLM-as-judge evaluator.
+ *
+ * Uses an LLM to assess agent prompt quality against criteria that
+ * deterministic graders can't check (semantic meaning, completeness,
+ * reasoning quality).
+ *
+ * Pure helper with injected LLM call function.
+ * Integrates with HC-019j three-valued outcomes for non-deterministic results.
+ */
+const { classify, wrapDeterministic } = require('./eval-three-valued');
+/**
+ * System prompt sent to the judge LLM, plus the scenario format that feeds it.
+ *
+ * Judge criteria for LLM evaluation:
+ * each criterion is a question the LLM answers YES/NO about the agent prompt.
+ * The answer format requested below ("CRITERION_N: YES|NO — explanation") is
+ * the one parseJudgeResponse's strict pattern looks for.
+ *
+ * Eval scenario format for LLM-judge mode:
+ * ```yaml
+ * grading:
+ *   mode: llm-judge
+ *   criteria:
+ *     - "Does the prompt clearly define the agent's role and responsibilities?"
+ *     - "Does the prompt include error handling guidance?"
+ *     - "Is the output format specification unambiguous?"
+ *   runs: 3  # optional, default 1 for cost savings
+ * ```
+ */
+const JUDGE_SYSTEM_PROMPT = `You are an eval judge for AI agent prompts. You will be given an agent prompt and a list of criteria. For each criterion, answer YES or NO with a brief explanation.
+Rules:
+- Answer ONLY YES or NO for each criterion, followed by a one-sentence explanation
+- Be strict — if the criterion is not clearly met, answer NO
+- Format each answer on its own line as: "CRITERION_N: YES|NO — explanation"
+- Do not add commentary outside the criterion answers`;
+/**
+ * Assemble the user prompt handed to the judge model.
+ *
+ * The agent prompt is cut to its first 8000 characters, and every criterion
+ * is labelled `CRITERION_<n>` (1-based) so the reply can be matched back to it.
+ *
+ * @param {string} agentPrompt — the agent prompt text to evaluate
+ * @param {string[]} criteria — list of criteria to judge against
+ * @returns {string} — the three prompt sections joined by newlines
+ */
+function buildJudgePrompt(agentPrompt, criteria) {
+  const numbered = criteria.map((text, idx) => `CRITERION_${idx + 1}: ${text}`);
+  const excerpt = agentPrompt.slice(0, 8000);
+  const sections = [
+    '## Agent Prompt to Evaluate',
+    excerpt,
+    '## Criteria to Judge',
+    numbered.join('\n'),
+    '## Your Judgement',
+    'For each criterion, answer YES or NO with a brief explanation:',
+  ];
+  return sections.join('\n');
+}
+/**
+ * Parse the LLM judge response into structured per-criterion results.
+ *
+ * Each criterion is looked up in two passes:
+ *   1. strict — "CRITERION_N: YES|NO <dash> explanation" (hyphen, en dash or
+ *      em dash), which also captures the explanation;
+ *   2. loose — "CRITERION_N", "#N" or "N." followed by YES/NO, used when the
+ *      model drifted from the requested format.
+ * A criterion that matches neither pass is reported as not passed.
+ *
+ * @param {string} response — LLM output
+ * @param {number} criteriaCount — expected number of criteria
+ * @returns {Array<{ criterion: number, passed: boolean, explanation: string }>}
+ * @throws {TypeError} if `response` is not a string
+ */
+function parseJudgeResponse(response, criteriaCount) {
+  if (typeof response !== 'string') {
+    // Fail with a clear message instead of "cannot read properties of null".
+    throw new TypeError('LLM judge response must be a string');
+  }
+  const results = [];
+  for (let i = 1; i <= criteriaCount; i++) {
+    const pattern = new RegExp(`CRITERION_${i}:\\s*(YES|NO)\\s*[-–—]\\s*(.+)`, 'i');
+    const match = response.match(pattern);
+    if (match) {
+      results.push({
+        criterion: i,
+        passed: match[1].toUpperCase() === 'YES',
+        explanation: match[2].trim(),
+      });
+      continue;
+    }
+    // Looser fallback. The lookbehind stops "1." from matching inside "11."
+    // or "21.", and the trailing \b stops "NO" from matching the start of "NOT".
+    const loosePattern = new RegExp(
+      `(?<![\\w.])(?:CRITERION_${i}|#${i}|${i}\\.)\\s*:?\\s*(YES|NO)\\b`,
+      'i'
+    );
+    const looseMatch = response.match(loosePattern);
+    results.push({
+      criterion: i,
+      passed: looseMatch ? looseMatch[1].toUpperCase() === 'YES' : false,
+      explanation: looseMatch ? 'parsed from loose format' : 'could not parse response',
+    });
+  }
+  return results;
+}
+/**
+ * Perform one judge pass: build the prompt, call the injected LLM, and parse
+ * its reply. The pass succeeds only when every criterion was answered YES.
+ *
+ * @param {object} opts
+ * @param {string} opts.agentPrompt — prompt text to evaluate
+ * @param {string[]} opts.criteria — list of criteria
+ * @param {(systemPrompt: string, userPrompt: string) => Promise<string>} opts.callLLM — injected LLM call
+ * @returns {Promise<{ passed: boolean, criteriaResults: Array, rawResponse: string }>}
+ */
+async function runJudge({ agentPrompt, criteria, callLLM }) {
+  const rawResponse = await callLLM(
+    JUDGE_SYSTEM_PROMPT,
+    buildJudgePrompt(agentPrompt, criteria)
+  );
+  const criteriaResults = parseJudgeResponse(rawResponse, criteria.length);
+  // Equivalent to "every criterion passed": no entry may be a non-pass.
+  const passed = !criteriaResults.some((entry) => !entry.passed);
+  return { passed, criteriaResults, rawResponse };
+}
+/**
+ * Run LLM-judge evaluation with optional multiple runs for three-valued outcomes.
+ *
+ * Result shapes:
+ *   - no criteria configured   → deterministic `error` result (type `config`)
+ *   - `runs` <= 1 (default 1)  → one judge pass wrapped by wrapDeterministic();
+ *                                 a thrown LLM error becomes `error` (type `llm_error`)
+ *   - `runs` > 1               → outcomes classified by classify() into
+ *                                 pass / fail / inconclusive with a confidence
+ *
+ * In multi-run mode a run that throws counts as a failed run, unless every
+ * run threw. In that case the scenario is reported as `error`, matching the
+ * single-run behaviour, because nothing was actually judged.
+ *
+ * @param {object} opts
+ * @param {object} opts.scenario — eval scenario with grading.mode = 'llm-judge'
+ * @param {string} opts.agentPrompt — prompt text
+ * @param {(systemPrompt: string, userPrompt: string) => Promise<string>} opts.callLLM
+ * @param {object} [opts.thresholds] — pass/fail thresholds for classify()
+ * @returns {Promise<object>} — scenario result with verdict + confidence
+ */
+async function runJudgeScenario({ scenario, agentPrompt, callLLM, thresholds }) {
+  const criteria = scenario.grading?.criteria || [];
+  const runs = scenario.grading?.runs || 1;
+  // Identity fields shared by every result shape returned below.
+  const base = {
+    id: scenario.id,
+    agent: scenario.evalAgent || scenario.agent,
+    name: scenario.name || scenario.id,
+  };
+  // Result for "the eval itself broke" (bad config, LLM unreachable).
+  const errorResult = (type, detail) => wrapDeterministic({
+    ...base,
+    result: 'error',
+    checks: 0,
+    checks_passed: 0,
+    failures: [{ type, passed: false, detail }],
+  });
+  if (criteria.length === 0) {
+    return errorResult('config', 'no criteria defined for llm-judge');
+  }
+  // Single run — return deterministic result
+  if (runs <= 1) {
+    try {
+      const judgeResult = await runJudge({ agentPrompt, criteria, callLLM });
+      return wrapDeterministic({
+        ...base,
+        result: judgeResult.passed ? 'pass' : 'fail',
+        checks: criteria.length,
+        checks_passed: judgeResult.criteriaResults.filter(r => r.passed).length,
+        failures: judgeResult.criteriaResults
+          .filter(r => !r.passed)
+          .map(r => ({ type: `criterion_${r.criterion}`, passed: false, detail: r.explanation })),
+        judgeDetails: judgeResult.criteriaResults,
+      });
+    } catch (e) {
+      return errorResult('llm_error', e.message);
+    }
+  }
+  // Multiple runs — use three-valued classification
+  const outcomes = [];
+  const allDetails = [];
+  const errorMessages = [];
+  for (let i = 0; i < runs; i++) {
+    try {
+      const judgeResult = await runJudge({ agentPrompt, criteria, callLLM });
+      outcomes.push(judgeResult.passed);
+      allDetails.push(judgeResult);
+    } catch (e) {
+      outcomes.push(false);
+      allDetails.push({ passed: false, error: e.message });
+      errorMessages.push(e.message);
+    }
+  }
+  // Every run threw: nothing was judged, so this is an eval error rather than
+  // a 100%-confidence "fail" verdict on the agent prompt.
+  if (errorMessages.length === outcomes.length) {
+    return {
+      ...errorResult(
+        'llm_error',
+        `all ${outcomes.length} judge runs failed: ${errorMessages[errorMessages.length - 1]}`
+      ),
+      runs: outcomes.length,
+      runDetails: allDetails,
+    };
+  }
+  const classification = classify(outcomes, thresholds);
+  return {
+    ...base,
+    result: classification.verdict,
+    verdict: classification.verdict,
+    confidence: classification.confidence,
+    deterministic: false,
+    checks: criteria.length,
+    // Per-criterion counts are not aggregated across runs: all-or-nothing.
+    checks_passed: classification.verdict === 'pass' ? criteria.length : 0,
+    runs_passed: classification.passed,
+    failures: classification.verdict === 'fail'
+      ? [{ type: 'llm_judge', passed: false, detail: classification.details }]
+      : [],
+    runs: classification.runs,
+    runDetails: allDetails,
+  };
+}
+module.exports = {
+  JUDGE_SYSTEM_PROMPT,
+  buildJudgePrompt,
+  parseJudgeResponse,
+  runJudge,
+  runJudgeScenario,
+};

package/lib/eval-runner.js CHANGED Viewed

@@ -8,6 +8,7 @@
  * Pure helper with injected I/O (readFile, listDir).
  */
 const { runCheck } = require('./eval-graders');
+const { wrapDeterministic } = require('./eval-three-valued');
 /**
  * Load eval scenarios from the evals directory.
@@ -115,7 +116,7 @@ function runAllScenarios(scenarios, agentPrompts, opts = {}) {
     const promptText = agentPrompts[agentName];
     if (!promptText && !scenario.loadError) {
-      results.push({
+      results.push(wrapDeterministic({
         id: scenario.id,
         agent: agentName,
         name: scenario.name || scenario.id,
@@ -123,12 +124,12 @@ function runAllScenarios(scenarios, agentPrompts, opts = {}) {
         checks: 0,
         checks_passed: 0,
         failures: [{ type: 'missing_prompt', passed: false, detail: `agent "${agentName}" not found in AGENT_PROMPTS` }],
-      });
+      }));
       continue;
     }
     const result = runScenario(scenario, promptText || '');
-    results.push(result);
+    results.push(wrapDeterministic(result));
     if (opts.failFast && result.result !== 'pass') break;
   }
@@ -163,8 +164,10 @@ function formatResults(results, format = 'pretty') {
   for (const [agent, scenarios] of Object.entries(byAgent)) {
     lines.push(`${agent} (${scenarios.length} scenarios)`);
     for (const s of scenarios) {
-      const icon = s.result === 'pass' ? 'PASS' : s.result === 'fail' ? 'FAIL' : 'ERR ';
-      lines.push(`  [${icon}] ${s.id}: ${s.name} (${s.checks_passed}/${s.checks} checks)`);
+      const verdict = s.verdict || s.result;
+      const icon = verdict === 'pass' ? 'PASS' : verdict === 'fail' ? 'FAIL' : verdict === 'inconclusive' ? '????' : 'ERR ';
+      const conf = s.confidence != null && !s.deterministic ? ` ${s.confidence}%` : '';
+      lines.push(`  [${icon}] ${s.id}: ${s.name} (${s.checks_passed}/${s.checks} checks${conf})`);
       for (const f of s.failures) {
         lines.push(`         x ${f.type}: ${f.detail}`);
       }

package/lib/eval-three-valued.js ADDED Viewed

@@ -0,0 +1,158 @@
+'use strict';
+/**
+ * eval-three-valued.js — HC-019j three-valued test outcomes.
+ *
+ * Replaces binary pass/fail with Pass/Fail/Inconclusive for
+ * non-deterministic evaluations (LLM-as-judge, HC-019i).
+ *
+ * For deterministic checks (current graders), results are always
+ * definitive — Pass or Fail, never Inconclusive.
+ *
+ * For non-deterministic checks (LLM-judge, HC-019i), the same eval
+ * is run N times and outcomes are classified statistically:
+ *   - Pass: >= passThreshold of runs passed (default 80%)
+ *   - Fail: >= failThreshold of runs failed (default 80%)
+ *   - Inconclusive: neither threshold met (needs more runs or investigation)
+ *
+ * Based on AgentAssay (ICLR 2026) three-valued probabilistic outcomes.
+ */
+/**
+ * Classify a set of run results into Pass/Fail/Inconclusive.
+ *
+ * Only outcomes strictly equal to `true` count as passes; everything else
+ * counts as a failure. `confidence` is a whole-number percentage: the pass
+ * rate for `pass` and insufficient-run results, the fail rate for `fail`, and
+ * the larger of the two for an inconclusive split.
+ *
+ * @param {boolean[]} outcomes — array of pass/fail booleans from multiple runs
+ * @param {object} [opts] — `null`/`undefined` means "use all defaults"
+ * @param {number} [opts.passThreshold=0.8] — fraction of passes needed for Pass
+ * @param {number} [opts.failThreshold=0.8] — fraction of fails needed for Fail
+ * @param {number} [opts.minRuns=1] — minimum runs before classifying
+ * @returns {{ verdict: 'pass'|'fail'|'inconclusive', confidence: number, runs, passed, failed, details }}
+ */
+function classify(outcomes, opts = {}) {
+  // `?? {}` also covers an explicit null (e.g. an empty thresholds entry
+  // forwarded by a caller), which the default parameter alone does not.
+  const { passThreshold = 0.8, failThreshold = 0.8, minRuns = 1 } = opts ?? {};
+  if (!outcomes || outcomes.length === 0) {
+    return { verdict: 'inconclusive', confidence: 0, runs: 0, passed: 0, failed: 0, details: 'no runs' };
+  }
+  const pct = (fraction) => Math.round(fraction * 100);
+  const runs = outcomes.length;
+  const passed = outcomes.filter(o => o === true).length;
+  const failed = runs - passed;
+  const passRate = passed / runs;
+  const failRate = failed / runs;
+  if (runs < minRuns) {
+    return {
+      verdict: 'inconclusive',
+      confidence: pct(passRate),
+      runs, passed, failed,
+      details: `insufficient runs (${runs}/${minRuns})`,
+    };
+  }
+  // Pass-priority: both thresholds can only be met at once when
+  // passThreshold + failThreshold <= 1 (e.g. both 0.5 on a 50/50 split).
+  // In that case pass wins. This is optimistic — we assume the agent is
+  // correct unless proven otherwise with enough evidence.
+  if (passRate >= passThreshold) {
+    return {
+      verdict: 'pass',
+      confidence: pct(passRate),
+      runs, passed, failed,
+      details: `${passed}/${runs} passed (${pct(passRate)}% >= ${pct(passThreshold)}% threshold)`,
+    };
+  }
+  if (failRate >= failThreshold) {
+    return {
+      verdict: 'fail',
+      confidence: pct(failRate),
+      runs, passed, failed,
+      details: `${failed}/${runs} failed (${pct(failRate)}% >= ${pct(failThreshold)}% threshold)`,
+    };
+  }
+  return {
+    verdict: 'inconclusive',
+    confidence: pct(Math.max(passRate, failRate)),
+    runs, passed, failed,
+    details: `neither threshold met: ${pct(passRate)}% pass, ${pct(failRate)}% fail`,
+  };
+}
+/**
+ * Wilson score confidence interval for an observed pass rate.
+ * Helps judge whether additional runs could still move the verdict.
+ * Bounds are rounded to three decimals and clamped to [0, 1].
+ *
+ * @param {number} passed — number of passes
+ * @param {number} total — total runs
+ * @param {number} [z=1.96] — z-score for confidence level (1.96 = 95%)
+ * @returns {{ lower: number, upper: number, center: number }}
+ */
+function wilsonInterval(passed, total, z = 1.96) {
+  if (total === 0) {
+    // No data at all: the interval spans everything.
+    return { lower: 0, upper: 1, center: 0.5 };
+  }
+  const round3 = (value) => Math.round(value * 1000) / 1000;
+  const zSquared = z * z;
+  const rate = passed / total;
+  const scale = 1 + zSquared / total;
+  const midpoint = (rate + zSquared / (2 * total)) / scale;
+  const spreadTerm = (rate * (1 - rate) + zSquared / (4 * total)) / total;
+  const halfWidth = (z * Math.sqrt(spreadTerm)) / scale;
+  return {
+    lower: Math.max(0, round3(midpoint - halfWidth)),
+    upper: Math.min(1, round3(midpoint + halfWidth)),
+    center: round3(midpoint),
+  };
+}
+/**
+ * Recommend whether more runs would help resolve an inconclusive result.
+ *
+ * @param {object} result — from classify()
+ * @param {object} [opts] — `null`/`undefined` means "use all defaults"
+ * @param {number} [opts.maxRuns=10] — cap on the TOTAL number of runs
+ *   (runs already made plus the recommended additional ones)
+ * @returns {{ recommend: boolean, additionalRuns: number, reason: string }}
+ */
+function recommendMoreRuns(result, opts = {}) {
+  const { maxRuns = 10 } = opts ?? {};
+  if (result.verdict !== 'inconclusive') {
+    return { recommend: false, additionalRuns: 0, reason: 'verdict is definitive' };
+  }
+  if (result.runs === 0) {
+    // Start with 3 runs, but never exceed the caller's total budget.
+    const starter = Math.max(0, Math.min(3, maxRuns));
+    if (starter === 0) {
+      return { recommend: false, additionalRuns: 0, reason: 'no runs yet, but maxRuns allows none' };
+    }
+    return { recommend: true, additionalRuns: starter, reason: 'no runs yet' };
+  }
+  const interval = wilsonInterval(result.passed, result.runs);
+  const spread = interval.upper - interval.lower;
+  // If spread is wide, more runs would help narrow it
+  if (spread > 0.3 && result.runs < maxRuns) {
+    // Grow by roughly half the current sample plus 2, within the remaining budget.
+    const additional = Math.min(maxRuns - result.runs, Math.ceil(result.runs * 0.5) + 2);
+    return { recommend: true, additionalRuns: additional, reason: `wide confidence interval (${interval.lower}-${interval.upper})` };
+  }
+  // If spread is narrow but still inconclusive, the result is genuinely borderline
+  return { recommend: false, additionalRuns: 0, reason: `borderline result (${interval.lower}-${interval.upper}), more runs unlikely to resolve` };
+}
+/**
+ * Present a deterministic eval result in the three-valued result shape.
+ * A deterministic result is always definitive, so it is never inconclusive:
+ * `pass` and `fail` carry over with confidence 100, and any other `result`
+ * value becomes verdict `error` with confidence 0.
+ *
+ * @param {object} scenarioResult — from runScenario()
+ * @returns {object} — copy of the input with verdict, confidence and
+ *   `deterministic: true` added (overriding any such fields already present)
+ */
+function wrapDeterministic(scenarioResult) {
+  const { result } = scenarioResult;
+  // "error" means the eval itself broke, not that it was flaky — which keeps
+  // it distinct from an inconclusive verdict.
+  const isDefinitive = result === 'pass' || result === 'fail';
+  const verdict = isDefinitive ? result : 'error';
+  const confidence = isDefinitive ? 100 : 0;
+  return { ...scenarioResult, verdict, confidence, deterministic: true };
+}
+module.exports = { classify, wilsonInterval, recommendMoreRuns, wrapDeterministic };

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@hone-ai/cli",
-  "version": "1.6.0",
+  "version": "1.7.0",
   "description": "Hone AI — Enterprise SDLC Pipeline CLI",
   "main": "hone-cli.js",
   "bin": {