npm - @adia-ai/a2ui-mcp - Versions diffs - 0.1.0 → 0.1.1 - Mend

@adia-ai/a2ui-mcp 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

package/CHANGELOG.md +67 -0
package/package.json +1 -1
package/scripts/eval-diff.mjs +62 -6
package/scripts/semantic-stats.mjs +113 -0

package/CHANGELOG.md CHANGED Viewed

@@ -11,6 +11,73 @@ zettel strategies.
 ---
+## [0.1.1] - 2026-05-01
+Phase 2 of [`docs/specs/semantic-validator.md`](../../../docs/specs/semantic-validator.md)
+— opt-in combined-gating in `eval-diff.mjs` + new
+`semantic-stats.mjs` companion script. **No breaking changes.**
+Default `eval-diff` behavior unchanged — Phase 1 shadow-mode is
+still the default; combined gating is opt-in via flags.
+### Added (`scripts/eval-diff.mjs` — Phase 2 gating flags)
+- **`--gate-mode {structural|combined}`** — `structural` (default)
+  preserves Phase 1 shadow behavior: `row.pass` gates on
+  `validationScore` alone; semantic verdicts are annotation-only.
+  `combined` flips `row.pass` to gate on the combined score
+  (`round(0.6 × validationScore + 0.4 × semanticScore)`); preserves
+  the pre-flip pass as `row.passStructural`; recomputes
+  `runObj.passRate` + carries `runObj.passRateStructural` (baseline)
+  alongside; records `runObj.gateMode` + `runObj.gateThreshold`;
+  `diff.md` gains structural-baseline + avgSemantic + avgCombined
+  rows.
+- **`--gate-threshold N`** — combined-mode threshold; default 70 to
+  match the existing structural threshold. Override per-run for
+  sweep-style tuning.
+- **Validation gate** — combined mode requires `--semantic`; the
+  script exits with code 2 at startup when `--gate-mode combined` is
+  passed without `--semantic`, so the operator never silently ships
+  the gating change without the scores it needs.
+### Added (`scripts/semantic-stats.mjs` — companion stats script)
+- **New** — read-only; takes two run JSON paths
+  (`evals/mcp/runs/<stamp>/{mcp,zettel}.json`); emits markdown to
+  stdout with **verdict-distribution deltas + per-intent pass-flip
+  diagnostics** (which intents flipped pass→fail or fail→pass
+  between baseline and candidate). This is the tooling used to
+  check the "no unexplained regressions" exit criterion of Phase 2
+  § Rollout before combined gating is promoted to default.
+### Procedure for promotion (deferred)
+Promotion to default is deferred until two full eval-diff runs
+(structural-only baseline + combined-gating candidate) have been
+compared via `semantic-stats.mjs` and the regression count
+justifies it. Procedure:
+```bash
+# 1. Capture structural-only baseline (default behavior)
+node packages/a2ui/mcp/scripts/eval-diff.mjs --engine zettel --semantic
+# 2. Run the candidate (combined gating)
+node packages/a2ui/mcp/scripts/eval-diff.mjs --engine zettel --semantic --gate-mode combined
+# 3. Compare
+node packages/a2ui/mcp/scripts/semantic-stats.mjs \
+  evals/mcp/runs/<baseline-stamp>/zettel.json \
+  evals/mcp/runs/<candidate-stamp>/zettel.json > /tmp/semantic-stats.md
+```
+### Implementation references
+- [`scripts/eval-diff.mjs`](scripts/eval-diff.mjs)
+- [`scripts/semantic-stats.mjs`](scripts/semantic-stats.mjs)
+### Commits
+- `8415ff9e` — `feat(validator): semantic Phase 2 — opt-in combined-gating + drift cleanup`
 ## [0.1.0] - 2026-04-28
 **Multi-turn gen-UI tool surface (Phase A code-complete).** Adds three new

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@adia-ai/a2ui-mcp",
-  "version": "0.1.0",
+  "version": "0.1.1",
   "description": "AdiaUI A2UI MCP server. Exposes the compose engine over MCP with an engine selector for monolithic + zettel strategies.",
   "type": "module",
   "bin": {

package/scripts/eval-diff.mjs CHANGED Viewed

@@ -18,6 +18,9 @@
  *   node packages/a2ui/mcp/scripts/eval-diff.mjs --engine zettel  # fragment-graph only
  *   node packages/a2ui/mcp/scripts/eval-diff.mjs --limit 20
  *   node packages/a2ui/mcp/scripts/eval-diff.mjs --domain forms
+ *   node packages/a2ui/mcp/scripts/eval-diff.mjs --semantic       # Phase 1: shadow-mode semantic annotations
+ *   node packages/a2ui/mcp/scripts/eval-diff.mjs --semantic --gate-mode combined  # Phase 2: gate row.pass on combined score
+ *   node packages/a2ui/mcp/scripts/eval-diff.mjs --semantic --gate-mode combined --gate-threshold 75
  */
 import '../../../../scripts/load-env.mjs';
@@ -42,8 +45,24 @@ const opt = (k) => {
 const engine = opt('engine') || 'all';
 const limit = opt('limit') ? Number(opt('limit')) : undefined;
 const domain = opt('domain');
-// Shadow-mode semantic validator (Phase 1). Opt-in; zero effect on gating.
+// Shadow-mode semantic validator (Phase 1). Opt-in; zero effect on gating
+// when --gate-mode=structural (default).
 const semanticEnabled = args.includes('--semantic');
+// Phase 2 (gating mode):
+//   structural (default) — `row.pass` gated on validationScore alone (Phase 1 behavior)
+//   combined             — `row.pass` gated on round(0.6 * validationScore + 0.4 * semanticScore)
+// Combined mode requires --semantic. Threshold defaults to 70 to match the
+// existing structural threshold; override with --gate-threshold N.
+const gateMode = opt('gate-mode') || 'structural';
+const gateThresholdRaw = opt('gate-threshold');
+const gateThreshold = gateThresholdRaw ? Number(gateThresholdRaw) : 70;
+if (!['structural', 'combined'].includes(gateMode)) {
+  console.error(`[eval-diff] --gate-mode must be one of: structural | combined  (got: ${gateMode})`);
+  process.exit(2);
+}
+// Reject a non-numeric threshold at startup: Number('abc') is NaN, and
+// `combinedScore >= NaN` is always false, so every row would silently fail
+// the combined gate instead of surfacing the typo.
+if (!Number.isFinite(gateThreshold)) {
+  console.error(`[eval-diff] --gate-threshold must be a finite number  (got: ${gateThresholdRaw})`);
+  process.exit(2);
+}
+if (gateMode === 'combined' && !semanticEnabled) {
+  console.error(`[eval-diff] --gate-mode=combined requires --semantic (semantic scores must be computed before gating on them)`);
+  process.exit(2);
+}
 if (!['mcp', 'zettel', 'all'].includes(engine)) {
   console.error(`[eval-diff] --engine must be one of: mcp | zettel | all  (got: ${engine})`);
@@ -144,7 +163,12 @@ async function annotateSemantic(runObj, label) {
       row.rubricVersion = v.rubricVersion;
       const structural = row.validationScore ?? 0;
       row.combinedScore = Math.round(0.6 * structural + 0.4 * v.score);
-      // NOTE: row.pass intentionally NOT updated — shadow mode only.
+      // Phase 2: when gateMode === 'combined', flip row.pass to gate on the
+      // combined score. Preserves the structural pass for diagnostic purposes.
+      if (gateMode === 'combined') {
+        row.passStructural = row.pass;
+        row.pass = row.combinedScore >= gateThreshold;
+      }
       if (!v.error) {
         semSum += v.score;
         semN += 1;
@@ -164,7 +188,8 @@ async function annotateSemantic(runObj, label) {
   }
   runObj.semantic = {
     enabled: true,
-    mode: 'shadow',
+    mode: gateMode === 'combined' ? 'gating' : 'shadow',
+    gateThreshold: gateMode === 'combined' ? gateThreshold : null,
     judged: semN,
     errors,
     cached,
@@ -174,14 +199,31 @@ async function annotateSemantic(runObj, label) {
     tokens: { input: tokensIn, output: tokensOut },
     rubricVersion: 'v1',
   };
-  console.error(`[semantic:${label}] judged=${semN} avgSem=${runObj.semantic.avgSemanticScore} avgCombined=${runObj.semantic.avgCombinedScore} cached=${cached} errors=${errors} tokens=${tokensIn}+${tokensOut}`);
+  // Phase 2: when gateMode === 'combined', recompute pass aggregates so
+  // runObj.passRate / runObj.pass reflect the new gate. Capture the
+  // structural-only pass count alongside for diagnostic comparison.
+  if (gateMode === 'combined') {
+    const structuralPassCount = runObj.results.filter((r) => r.passStructural).length;
+    const combinedPassCount = runObj.results.filter((r) => r.pass).length;
+    runObj.passStructural = structuralPassCount;
+    runObj.passRateStructural = Math.round((structuralPassCount / (runObj.results.length || 1)) * 100);
+    runObj.pass = combinedPassCount;
+    runObj.passRate = Math.round((combinedPassCount / (runObj.results.length || 1)) * 100);
+    runObj.gateMode = 'combined';
+    runObj.gateThreshold = gateThreshold;
+  }
+  const modeLabel = gateMode === 'combined' ? `gating(>=${gateThreshold})` : 'shadow';
+  console.error(`[semantic:${label}] mode=${modeLabel} judged=${semN} avgSem=${runObj.semantic.avgSemanticScore} avgCombined=${runObj.semantic.avgCombinedScore} cached=${cached} errors=${errors} tokens=${tokensIn}+${tokensOut}`);
 }
 if (semanticEnabled) {
   if (!process.env.ANTHROPIC_API_KEY) {
     console.error('[eval-diff] --semantic requested but ANTHROPIC_API_KEY missing; skipping.');
   } else {
-    console.error(`[eval-diff] running semantic validator (shadow mode)…`);
+    const modeNote = gateMode === 'combined'
+      ? `gating mode (combined threshold=${gateThreshold})`
+      : 'shadow mode';
+    console.error(`[eval-diff] running semantic validator (${modeNote})…`);
     if (mcp) await annotateSemantic(mcp, 'mcp');
     if (zettel) await annotateSemantic(zettel, 'zettel');
   }
@@ -209,7 +251,11 @@ md += `# Engine Eval ${mcp && zettel ? 'Diff' : 'Report'}\n\n`;
 md += `- Run: \`${stamp}\`\n`;
 md += `- Engine(s): ${engine}\n`;
 md += `- Intents: ${(mcp || zettel).total}${domain ? ` (domain: ${domain})` : ''}${limit ? ` (limit: ${limit})` : ''}\n`;
-md += `- Mode: instant\n\n`;
+md += `- Mode: instant\n`;
+if (semanticEnabled) {
+  md += `- Semantic: ${gateMode === 'combined' ? `gating (threshold=${gateThreshold})` : 'shadow'}\n`;
+}
+md += `\n`;
 md += `## Aggregates\n\n`;
 if (mcp && zettel) {
@@ -219,6 +265,11 @@ if (mcp && zettel) {
   md += `| avgScore (emitted only) | ${mcp.avgScoreWhenEmitted} | ${zettel.avgScoreWhenEmitted} |\n`;
   md += `| avgF1 (emitted only) | ${mcp.avgF1WhenEmitted} | ${zettel.avgF1WhenEmitted} |\n`;
   md += `| pass rate % | ${mcp.passRate} | ${zettel.passRate} |\n`;
+  if (mcp.gateMode === 'combined' || zettel.gateMode === 'combined') {
+    md += `| pass rate % (structural-only baseline) | ${fmt(mcp.passRateStructural)} | ${fmt(zettel.passRateStructural)} |\n`;
+    md += `| avgSemanticScore | ${fmt(mcp.semantic?.avgSemanticScore)} | ${fmt(zettel.semantic?.avgSemanticScore)} |\n`;
+    md += `| avgCombinedScore | ${fmt(mcp.semantic?.avgCombinedScore)} | ${fmt(zettel.semantic?.avgCombinedScore)} |\n`;
+  }
   md += `| retrieval MRR | ${fmt(mcp.retrievalMRR)} | ${fmt(zettel.retrievalMRR)} |\n\n`;
 } else {
   const e = mcp || zettel;
@@ -229,6 +280,11 @@ if (mcp && zettel) {
   md += `| avgScore (emitted only) | ${e.avgScoreWhenEmitted} |\n`;
   md += `| avgF1 (emitted only) | ${e.avgF1WhenEmitted} |\n`;
   md += `| pass rate % | ${e.passRate} |\n`;
+  if (e.gateMode === 'combined') {
+    md += `| pass rate % (structural-only baseline) | ${fmt(e.passRateStructural)} |\n`;
+    md += `| avgSemanticScore | ${fmt(e.semantic?.avgSemanticScore)} |\n`;
+    md += `| avgCombinedScore | ${fmt(e.semantic?.avgCombinedScore)} |\n`;
+  }
   md += `| retrieval MRR | ${fmt(e.retrievalMRR)} |\n\n`;
 }

package/scripts/semantic-stats.mjs ADDED Viewed

@@ -0,0 +1,113 @@
+#!/usr/bin/env node
+/**
+ * Compare two eval-diff run artifacts (JSON) and surface verdict-distribution
+ * + pass-rate deltas. Used to verify "no unexplained regressions" between
+ * baseline and a candidate gating-mode run, per Phase 2 exit criteria of
+ * `docs/specs/semantic-validator.md`.
+ *
+ * Usage:
+ *   node packages/a2ui/mcp/scripts/semantic-stats.mjs <baseline.json> <candidate.json>
+ *
+ * Examples:
+ *   # Compare structural-only baseline to combined-gating candidate
+ *   node packages/a2ui/mcp/scripts/semantic-stats.mjs \
+ *     evals/mcp/runs/<baseline-stamp>/zettel.json \
+ *     evals/mcp/runs/<candidate-stamp>/zettel.json
+ *
+ * The script is read-only; it writes nothing to disk. Output is markdown
+ * to stdout — pipe into a file or a journal entry as needed.
+ */
+import { readFile } from 'node:fs/promises';
+// Exactly two positional arguments are required: the baseline run JSON
+// path followed by the candidate run JSON path. Anything else prints the
+// usage line and exits with status 2.
+const args = process.argv.slice(2);
+if (args.length !== 2) {
+  console.error('Usage: semantic-stats.mjs <baseline.json> <candidate.json>');
+  process.exit(2);
+}
+const [basePath, candPath] = args;
+// Both files are read and parsed before any output is produced. There is no
+// try/catch here, so an unreadable file or malformed JSON rejects the
+// top-level await and aborts the script.
+const base = JSON.parse(await readFile(basePath, 'utf8'));
+const cand = JSON.parse(await readFile(candPath, 'utf8'));
+// Render a table cell: null/undefined become an em dash, anything else is
+// converted with String().
+const fmt = (v) => v == null ? '—' : String(v);
+// Signed difference `b - a` (candidate minus baseline), rounded to one
+// decimal place. Positive results get an explicit leading `+`; zero and
+// negative results are stringified as-is. Returns an em dash when either
+// operand is null/undefined.
+const delta = (a, b) => {
+  if (a == null || b == null) return '—';
+  const d = Math.round((b - a) * 10) / 10;
+  return d > 0 ? `+${d}` : String(d);
+};
+// Report header: which two run files are being compared.
+console.log(`# Semantic stats — baseline vs candidate\n`);
+console.log(`- Baseline:  \`${basePath}\` (label=${fmt(base.label)})`);
+console.log(`- Candidate: \`${candPath}\` (label=${fmt(cand.label)})\n`);
+console.log(`## Aggregate deltas\n`);
+console.log(`| metric | baseline | candidate | delta |`);
+console.log(`|---|---:|---:|---:|`);
+// Emit one aggregate row. Every cell goes through `fmt`, so a metric that is
+// missing from a run file renders as an em dash instead of the literal text
+// "undefined". `shownA` / `shownB` override the displayed cell when it differs
+// from the value used for the delta (the emitted/total pair).
+const metricRow = (label, a, b, shownA = fmt(a), shownB = fmt(b)) => {
+  console.log(`| ${label} | ${shownA} | ${shownB} | ${delta(a, b)} |`);
+};
+metricRow('total', base.total, cand.total);
+metricRow('coverage %', base.coverage, cand.coverage);
+metricRow(
+  'emitted',
+  base.emitted,
+  cand.emitted,
+  `${fmt(base.emitted)}/${fmt(base.total)}`,
+  `${fmt(cand.emitted)}/${fmt(cand.total)}`,
+);
+metricRow('avgScoreWhenEmitted', base.avgScoreWhenEmitted, cand.avgScoreWhenEmitted);
+metricRow('avgF1WhenEmitted', base.avgF1WhenEmitted, cand.avgF1WhenEmitted);
+metricRow('pass rate %', base.passRate, cand.passRate);
+// Only present when the candidate run recorded a structural-only pass rate;
+// compares it against the baseline's pass rate.
+if (cand.passRateStructural != null) {
+  metricRow('pass rate % (structural-only on candidate)', base.passRate, cand.passRateStructural);
+}
+// Semantic averages are shown when either run carries a `semantic` section.
+if (base.semantic || cand.semantic) {
+  metricRow('avgSemanticScore', base.semantic?.avgSemanticScore, cand.semantic?.avgSemanticScore);
+  metricRow('avgCombinedScore', base.semantic?.avgCombinedScore, cand.semantic?.avgCombinedScore);
+}
+console.log();
+// Verdict distribution (when --semantic was used in either run)
+// Returns the run's `semantic.verdictBreakdown` object, or null when the run
+// has no semantic section or no breakdown.
+function verdicts(run) {
+  return run.semantic?.verdictBreakdown || null;
+}
+const bv = verdicts(base);
+const cv = verdicts(cand);
+// The section is skipped entirely when neither run has a breakdown.
+if (bv || cv) {
+  console.log(`## Verdict distribution\n`);
+  // Union of verdict keys from both runs, so a verdict seen on only one side
+  // still gets a row.
+  const allKeys = new Set([...(bv ? Object.keys(bv) : []), ...(cv ? Object.keys(cv) : [])]);
+  console.log(`| verdict | baseline | candidate | delta |`);
+  console.log(`|---|---:|---:|---:|`);
+  for (const k of [...allKeys].sort()) {
+    // A verdict missing on one side is counted as 0 rather than shown as a dash.
+    console.log(`| ${k} | ${fmt(bv?.[k] ?? 0)} | ${fmt(cv?.[k] ?? 0)} | ${delta(bv?.[k] ?? 0, cv?.[k] ?? 0)} |`);
+  }
+  console.log();
+}
+// Per-row pass-flip surface — which intents flipped pass/fail between the two runs?
+// Builds a Map from each result row's `id` to the row; a run without
+// `results` yields an empty Map.
+function indexById(run) {
+  return new Map((run.results || []).map((r) => [r.id, r]));
+}
+const baseById = indexById(base);
+const candById = indexById(cand);
+const flips = { pass_to_fail: [], fail_to_pass: [] };
+// Iterates baseline rows only: ids missing from the candidate are skipped,
+// and ids that exist only in the candidate are never visited.
+for (const [id, b] of baseById) {
+  const c = candById.get(id);
+  if (!c) continue;
+  // Each flip record pairs the baseline validationScore with the candidate's
+  // combined/semantic scores and verdict; `intent` is taken from the baseline row.
+  if (b.pass && !c.pass) flips.pass_to_fail.push({ id, intent: b.intent, baseScore: b.validationScore, candCombined: c.combinedScore, candSemantic: c.semanticScore, candVerdict: c.semanticVerdict });
+  else if (!b.pass && c.pass) flips.fail_to_pass.push({ id, intent: b.intent, baseScore: b.validationScore, candCombined: c.combinedScore, candSemantic: c.semanticScore, candVerdict: c.semanticVerdict });
+}
+// Summary counts are always printed, even when both lists are empty.
+console.log(`## Pass-flip diagnostics\n`);
+console.log(`- pass → fail (regressions to investigate): **${flips.pass_to_fail.length}**`);
+console.log(`- fail → pass (improvements): **${flips.fail_to_pass.length}**\n`);
+/**
+ * Print one markdown table of pass-flip rows under a `###` heading.
+ * Prints nothing when `rows` is empty. Only the first 20 rows are listed;
+ * when more exist, a trailing note reports how many were omitted.
+ *
+ * @param {Array<object>} rows - flip records with `intent`, `baseScore`,
+ *   `candCombined`, `candSemantic` and `candVerdict` fields.
+ * @param {string} header - heading text for the section.
+ */
+function flipTable(rows, header) {
+  if (!rows.length) return;
+  // Intent and verdict values may be free-form text. A literal `|` or a line
+  // break would split the markdown row, so escape pipes and collapse newlines.
+  // Routing every cell through `fmt` also renders a missing intent as an em
+  // dash instead of "undefined".
+  const cell = (v) => fmt(v).replace(/\|/g, '\\|').replace(/\s*[\r\n]+\s*/g, ' ');
+  console.log(`### ${header}\n`);
+  console.log(`| intent | base validationScore | cand combinedScore | cand semanticScore | cand verdict |`);
+  console.log(`|---|---:|---:|---:|---|`);
+  for (const r of rows.slice(0, 20)) {
+    console.log(`| ${cell(r.intent)} | ${cell(r.baseScore)} | ${cell(r.candCombined)} | ${cell(r.candSemantic)} | ${cell(r.candVerdict)} |`);
+  }
+  if (rows.length > 20) console.log(`\n_(${rows.length - 20} more rows omitted)_`);
+  console.log();
+}
+// Regressions are listed before improvements; the report ends with a
+// horizontal rule and a generator footer.
+flipTable(flips.pass_to_fail, 'Regressions (pass → fail)');
+flipTable(flips.fail_to_pass, 'Improvements (fail → pass)');
+console.log(`---\n`);
+console.log(`_Generated by \`packages/a2ui/mcp/scripts/semantic-stats.mjs\`_`);