@adia-ai/a2ui-mcp 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -11,6 +11,73 @@ zettel strategies.
11
11
 
12
12
  ---
13
13
 
14
+ ## [0.1.1] - 2026-05-01
15
+
16
+ Phase 2 of [`docs/specs/semantic-validator.md`](../../../docs/specs/semantic-validator.md)
17
+ — opt-in combined-gating in `eval-diff.mjs` + new
18
+ `semantic-stats.mjs` companion script. **No breaking changes.**
19
+ Default `eval-diff` behavior unchanged — Phase 1 shadow-mode is
20
+ still the default; combined gating is opt-in via flags.
21
+
22
+ ### Added (`scripts/eval-diff.mjs` — Phase 2 gating flags)
23
+
24
+ - **`--gate-mode {structural|combined}`** — `structural` (default)
25
+ preserves Phase 1 shadow behavior: `row.pass` gates on
26
+ `validationScore` alone; semantic verdicts are annotation-only.
27
+ `combined` flips `row.pass` to gate on the combined score
28
+ (`round(0.6 × validationScore + 0.4 × semanticScore)`); preserves
29
+ the pre-flip pass as `row.passStructural`; recomputes
30
+ `runObj.passRate` + carries `runObj.passRateStructural` (baseline)
31
+ alongside; records `runObj.gateMode` + `runObj.gateThreshold`;
32
+ `diff.md` gains structural-baseline + avgSemantic + avgCombined
33
+ rows.
34
+ - **`--gate-threshold N`** — combined-mode threshold; default 70 to
35
+ match the existing structural threshold. Override per-run for
36
+ sweep-style tuning.
37
+ - **Validation gate** — combined-mode requires `--semantic`; the
38
+ script rejects `--gate-mode combined` without `--semantic` at startup so the operator
39
+ never silently ships the gating change without the scores it
40
+ needs.
41
+
42
+ ### Added (`scripts/semantic-stats.mjs` — companion stats script)
43
+
44
+ - **New** — read-only; takes two run JSON paths
45
+ (`evals/mcp/runs/<stamp>/{mcp,zettel}.json`); emits markdown to
46
+ stdout with **verdict-distribution deltas + per-intent pass-flip
47
+ diagnostics** (which intents flipped pass→fail or fail→pass
48
+ between baseline and candidate). This is the tooling that satisfies the
49
+ "no unexplained regressions" exit criterion of Phase 2 §
50
+ Rollout before promoting combined gating to default.
51
+
52
+ ### Procedure for promotion (deferred)
53
+
54
+ Promotion to default is deferred until two full eval-diff runs
55
+ (structural-only baseline + combined-gating candidate) have been
56
+ compared via `semantic-stats.mjs` and the regression count
57
+ justifies it. Procedure:
58
+
59
+ ```bash
60
+ # 1. Capture structural-only baseline (default gating; --semantic only adds shadow annotations)
61
+ node packages/a2ui/mcp/scripts/eval-diff.mjs --engine zettel --semantic
62
+
63
+ # 2. Run the candidate (combined gating)
64
+ node packages/a2ui/mcp/scripts/eval-diff.mjs --engine zettel --semantic --gate-mode combined
65
+
66
+ # 3. Compare
67
+ node packages/a2ui/mcp/scripts/semantic-stats.mjs \
68
+ evals/mcp/runs/<baseline-stamp>/zettel.json \
69
+ evals/mcp/runs/<candidate-stamp>/zettel.json > /tmp/semantic-stats.md
70
+ ```
71
+
72
+ ### Implementation references
73
+
74
+ - [`scripts/eval-diff.mjs`](scripts/eval-diff.mjs)
75
+ - [`scripts/semantic-stats.mjs`](scripts/semantic-stats.mjs)
76
+
77
+ ### Commits
78
+
79
+ - `8415ff9e` — `feat(validator): semantic Phase 2 — opt-in combined-gating + drift cleanup`
80
+
14
81
  ## [0.1.0] - 2026-04-28
15
82
 
16
83
  **Multi-turn gen-UI tool surface (Phase A code-complete).** Adds three new
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@adia-ai/a2ui-mcp",
3
- "version": "0.1.0",
3
+ "version": "0.1.1",
4
4
  "description": "AdiaUI A2UI MCP server. Exposes the compose engine over MCP with an engine selector for monolithic + zettel strategies.",
5
5
  "type": "module",
6
6
  "bin": {
@@ -18,6 +18,9 @@
18
18
  * node packages/a2ui/mcp/scripts/eval-diff.mjs --engine zettel # fragment-graph only
19
19
  * node packages/a2ui/mcp/scripts/eval-diff.mjs --limit 20
20
20
  * node packages/a2ui/mcp/scripts/eval-diff.mjs --domain forms
21
+ * node packages/a2ui/mcp/scripts/eval-diff.mjs --semantic # Phase 1: shadow-mode semantic annotations
22
+ * node packages/a2ui/mcp/scripts/eval-diff.mjs --semantic --gate-mode combined # Phase 2: gate row.pass on combined score
23
+ * node packages/a2ui/mcp/scripts/eval-diff.mjs --semantic --gate-mode combined --gate-threshold 75
21
24
  */
22
25
  import '../../../../scripts/load-env.mjs';
23
26
 
@@ -42,8 +45,24 @@ const opt = (k) => {
42
45
  const engine = opt('engine') || 'all';
43
46
  const limit = opt('limit') ? Number(opt('limit')) : undefined;
44
47
  const domain = opt('domain');
45
- // Shadow-mode semantic validator (Phase 1). Opt-in; zero effect on gating.
48
+ // Shadow-mode semantic validator (Phase 1). Opt-in; zero effect on gating
49
+ // when --gate-mode=structural (default).
46
50
  const semanticEnabled = args.includes('--semantic');
51
+ // Phase 2 (gating mode):
52
+ // structural (default) — `row.pass` gated on validationScore alone (Phase 1 behavior)
53
+ // combined — `row.pass` gated on round(0.6 * validationScore + 0.4 * semanticScore) >= --gate-threshold
54
+ // Combined mode requires --semantic. Threshold defaults to 70 to match the
55
+ // existing structural threshold; override with --gate-threshold N.
56
+ const gateMode = opt('gate-mode') || 'structural';
57
+ const gateThreshold = opt('gate-threshold') ? Number(opt('gate-threshold')) : 70;
58
+ if (!['structural', 'combined'].includes(gateMode)) {
59
+ console.error(`[eval-diff] --gate-mode must be one of: structural | combined (got: ${gateMode})`);
60
+ process.exit(2);
61
+ }
62
+ if (gateMode === 'combined' && !semanticEnabled) {
63
+ console.error(`[eval-diff] --gate-mode=combined requires --semantic (semantic scores must be computed before gating on them)`);
64
+ process.exit(2);
65
+ }
47
66
 
48
67
  if (!['mcp', 'zettel', 'all'].includes(engine)) {
49
68
  console.error(`[eval-diff] --engine must be one of: mcp | zettel | all (got: ${engine})`);
@@ -144,7 +163,12 @@ async function annotateSemantic(runObj, label) {
144
163
  row.rubricVersion = v.rubricVersion;
145
164
  const structural = row.validationScore ?? 0;
146
165
  row.combinedScore = Math.round(0.6 * structural + 0.4 * v.score);
147
- // NOTE: row.pass intentionally NOT updated — shadow mode only.
166
+ // Phase 2: when gateMode === 'combined', flip row.pass to gate on the
167
+ // combined score. Preserves the structural pass for diagnostic purposes.
168
+ if (gateMode === 'combined') {
169
+ row.passStructural = row.pass;
170
+ row.pass = row.combinedScore >= gateThreshold;
171
+ }
148
172
  if (!v.error) {
149
173
  semSum += v.score;
150
174
  semN += 1;
@@ -164,7 +188,8 @@ async function annotateSemantic(runObj, label) {
164
188
  }
165
189
  runObj.semantic = {
166
190
  enabled: true,
167
- mode: 'shadow',
191
+ mode: gateMode === 'combined' ? 'gating' : 'shadow',
192
+ gateThreshold: gateMode === 'combined' ? gateThreshold : null,
168
193
  judged: semN,
169
194
  errors,
170
195
  cached,
@@ -174,14 +199,31 @@ async function annotateSemantic(runObj, label) {
174
199
  tokens: { input: tokensIn, output: tokensOut },
175
200
  rubricVersion: 'v1',
176
201
  };
177
- console.error(`[semantic:${label}] judged=${semN} avgSem=${runObj.semantic.avgSemanticScore} avgCombined=${runObj.semantic.avgCombinedScore} cached=${cached} errors=${errors} tokens=${tokensIn}+${tokensOut}`);
202
+ // Phase 2: when gateMode === 'combined', recompute pass aggregates so
203
+ // runObj.passRate / runObj.pass reflect the new gate. Capture the
204
+ // structural-only pass count alongside for diagnostic comparison.
205
+ if (gateMode === 'combined') {
206
+ const structuralPassCount = runObj.results.filter((r) => r.passStructural).length;
207
+ const combinedPassCount = runObj.results.filter((r) => r.pass).length;
208
+ runObj.passStructural = structuralPassCount;
209
+ runObj.passRateStructural = Math.round((structuralPassCount / (runObj.results.length || 1)) * 100);
210
+ runObj.pass = combinedPassCount;
211
+ runObj.passRate = Math.round((combinedPassCount / (runObj.results.length || 1)) * 100);
212
+ runObj.gateMode = 'combined';
213
+ runObj.gateThreshold = gateThreshold;
214
+ }
215
+ const modeLabel = gateMode === 'combined' ? `gating(>=${gateThreshold})` : 'shadow';
216
+ console.error(`[semantic:${label}] mode=${modeLabel} judged=${semN} avgSem=${runObj.semantic.avgSemanticScore} avgCombined=${runObj.semantic.avgCombinedScore} cached=${cached} errors=${errors} tokens=${tokensIn}+${tokensOut}`);
178
217
  }
179
218
 
180
219
  if (semanticEnabled) {
181
220
  if (!process.env.ANTHROPIC_API_KEY) {
182
221
  console.error('[eval-diff] --semantic requested but ANTHROPIC_API_KEY missing; skipping.');
183
222
  } else {
184
- console.error(`[eval-diff] running semantic validator (shadow mode)…`);
223
+ const modeNote = gateMode === 'combined'
224
+ ? `gating mode (combined threshold=${gateThreshold})`
225
+ : 'shadow mode';
226
+ console.error(`[eval-diff] running semantic validator (${modeNote})…`);
185
227
  if (mcp) await annotateSemantic(mcp, 'mcp');
186
228
  if (zettel) await annotateSemantic(zettel, 'zettel');
187
229
  }
@@ -209,7 +251,11 @@ md += `# Engine Eval ${mcp && zettel ? 'Diff' : 'Report'}\n\n`;
209
251
  md += `- Run: \`${stamp}\`\n`;
210
252
  md += `- Engine(s): ${engine}\n`;
211
253
  md += `- Intents: ${(mcp || zettel).total}${domain ? ` (domain: ${domain})` : ''}${limit ? ` (limit: ${limit})` : ''}\n`;
212
- md += `- Mode: instant\n\n`;
254
+ md += `- Mode: instant\n`;
255
+ if (semanticEnabled) {
256
+ md += `- Semantic: ${gateMode === 'combined' ? `gating (threshold=${gateThreshold})` : 'shadow'}\n`;
257
+ }
258
+ md += `\n`;
213
259
 
214
260
  md += `## Aggregates\n\n`;
215
261
  if (mcp && zettel) {
@@ -219,6 +265,11 @@ if (mcp && zettel) {
219
265
  md += `| avgScore (emitted only) | ${mcp.avgScoreWhenEmitted} | ${zettel.avgScoreWhenEmitted} |\n`;
220
266
  md += `| avgF1 (emitted only) | ${mcp.avgF1WhenEmitted} | ${zettel.avgF1WhenEmitted} |\n`;
221
267
  md += `| pass rate % | ${mcp.passRate} | ${zettel.passRate} |\n`;
268
+ if (mcp.gateMode === 'combined' || zettel.gateMode === 'combined') {
269
+ md += `| pass rate % (structural-only baseline) | ${fmt(mcp.passRateStructural)} | ${fmt(zettel.passRateStructural)} |\n`;
270
+ md += `| avgSemanticScore | ${fmt(mcp.semantic?.avgSemanticScore)} | ${fmt(zettel.semantic?.avgSemanticScore)} |\n`;
271
+ md += `| avgCombinedScore | ${fmt(mcp.semantic?.avgCombinedScore)} | ${fmt(zettel.semantic?.avgCombinedScore)} |\n`;
272
+ }
222
273
  md += `| retrieval MRR | ${fmt(mcp.retrievalMRR)} | ${fmt(zettel.retrievalMRR)} |\n\n`;
223
274
  } else {
224
275
  const e = mcp || zettel;
@@ -229,6 +280,11 @@ if (mcp && zettel) {
229
280
  md += `| avgScore (emitted only) | ${e.avgScoreWhenEmitted} |\n`;
230
281
  md += `| avgF1 (emitted only) | ${e.avgF1WhenEmitted} |\n`;
231
282
  md += `| pass rate % | ${e.passRate} |\n`;
283
+ if (e.gateMode === 'combined') {
284
+ md += `| pass rate % (structural-only baseline) | ${fmt(e.passRateStructural)} |\n`;
285
+ md += `| avgSemanticScore | ${fmt(e.semantic?.avgSemanticScore)} |\n`;
286
+ md += `| avgCombinedScore | ${fmt(e.semantic?.avgCombinedScore)} |\n`;
287
+ }
232
288
  md += `| retrieval MRR | ${fmt(e.retrievalMRR)} |\n\n`;
233
289
  }
234
290
 
@@ -0,0 +1,113 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * Compare two eval-diff run artifacts (JSON) and surface verdict-distribution
4
+ * + pass-rate deltas. Used to verify "no unexplained regressions" between
5
+ * baseline and a candidate gating-mode run, per Phase 2 exit criteria of
6
+ * `docs/specs/semantic-validator.md`.
7
+ *
8
+ * Usage:
9
+ * node packages/a2ui/mcp/scripts/semantic-stats.mjs <baseline.json> <candidate.json>
10
+ *
11
+ * Examples:
12
+ * # Compare structural-only baseline to combined-gating candidate
13
+ * node packages/a2ui/mcp/scripts/semantic-stats.mjs \
14
+ * evals/mcp/runs/<baseline-stamp>/zettel.json \
15
+ * evals/mcp/runs/<candidate-stamp>/zettel.json
16
+ *
17
+ * The script is read-only; it writes nothing to disk. Output is markdown
18
+ * to stdout — pipe into a file or a journal entry as needed.
19
+ */
20
+ import { readFile } from 'node:fs/promises';
21
+
22
+ const args = process.argv.slice(2);
23
+ if (args.length !== 2) {
24
+ console.error('Usage: semantic-stats.mjs <baseline.json> <candidate.json>');
25
+ process.exit(2);
26
+ }
27
+
28
+ const [basePath, candPath] = args;
29
+ const base = JSON.parse(await readFile(basePath, 'utf8'));
30
+ const cand = JSON.parse(await readFile(candPath, 'utf8'));
31
+
32
+ const fmt = (v) => v == null ? '—' : String(v);
33
+ const delta = (a, b) => {
34
+ if (a == null || b == null) return '—';
35
+ const d = Math.round((b - a) * 10) / 10;
36
+ return d > 0 ? `+${d}` : String(d);
37
+ };
38
+
39
+ console.log(`# Semantic stats — baseline vs candidate\n`);
40
+ console.log(`- Baseline: \`${basePath}\` (label=${fmt(base.label)})`);
41
+ console.log(`- Candidate: \`${candPath}\` (label=${fmt(cand.label)})\n`);
42
+
43
+ console.log(`## Aggregate deltas\n`);
44
+ console.log(`| metric | baseline | candidate | delta |`);
45
+ console.log(`|---|---:|---:|---:|`);
46
+ console.log(`| total | ${base.total} | ${cand.total} | ${delta(base.total, cand.total)} |`);
47
+ console.log(`| coverage % | ${base.coverage} | ${cand.coverage} | ${delta(base.coverage, cand.coverage)} |`);
48
+ console.log(`| emitted | ${base.emitted}/${base.total} | ${cand.emitted}/${cand.total} | ${delta(base.emitted, cand.emitted)} |`);
49
+ console.log(`| avgScoreWhenEmitted | ${base.avgScoreWhenEmitted} | ${cand.avgScoreWhenEmitted} | ${delta(base.avgScoreWhenEmitted, cand.avgScoreWhenEmitted)} |`);
50
+ console.log(`| avgF1WhenEmitted | ${base.avgF1WhenEmitted} | ${cand.avgF1WhenEmitted} | ${delta(base.avgF1WhenEmitted, cand.avgF1WhenEmitted)} |`);
51
+ console.log(`| pass rate % | ${base.passRate} | ${cand.passRate} | ${delta(base.passRate, cand.passRate)} |`);
52
+
53
+ if (cand.passRateStructural != null) {
54
+ console.log(`| pass rate % (structural-only on candidate) | ${base.passRate} | ${cand.passRateStructural} | ${delta(base.passRate, cand.passRateStructural)} |`);
55
+ }
56
+
57
+ if (base.semantic || cand.semantic) {
58
+ console.log(`| avgSemanticScore | ${fmt(base.semantic?.avgSemanticScore)} | ${fmt(cand.semantic?.avgSemanticScore)} | ${delta(base.semantic?.avgSemanticScore, cand.semantic?.avgSemanticScore)} |`);
59
+ console.log(`| avgCombinedScore | ${fmt(base.semantic?.avgCombinedScore)} | ${fmt(cand.semantic?.avgCombinedScore)} | ${delta(base.semantic?.avgCombinedScore, cand.semantic?.avgCombinedScore)} |`);
60
+ }
61
+ console.log();
62
+
63
+ // Verdict distribution (when --semantic was used in either run)
64
+ function verdicts(run) {
65
+ return run.semantic?.verdictBreakdown || null;
66
+ }
67
+ const bv = verdicts(base);
68
+ const cv = verdicts(cand);
69
+ if (bv || cv) {
70
+ console.log(`## Verdict distribution\n`);
71
+ const allKeys = new Set([...(bv ? Object.keys(bv) : []), ...(cv ? Object.keys(cv) : [])]);
72
+ console.log(`| verdict | baseline | candidate | delta |`);
73
+ console.log(`|---|---:|---:|---:|`);
74
+ for (const k of [...allKeys].sort()) {
75
+ console.log(`| ${k} | ${fmt(bv?.[k] ?? 0)} | ${fmt(cv?.[k] ?? 0)} | ${delta(bv?.[k] ?? 0, cv?.[k] ?? 0)} |`);
76
+ }
77
+ console.log();
78
+ }
79
+
80
+ // Per-row pass-flip surface — which intents flipped pass/fail between the two runs?
81
+ function indexById(run) {
82
+ return new Map((run.results || []).map((r) => [r.id, r]));
83
+ }
84
+ const baseById = indexById(base);
85
+ const candById = indexById(cand);
86
+ const flips = { pass_to_fail: [], fail_to_pass: [] };
87
+ for (const [id, b] of baseById) {
88
+ const c = candById.get(id);
89
+ if (!c) continue;
90
+ if (b.pass && !c.pass) flips.pass_to_fail.push({ id, intent: b.intent, baseScore: b.validationScore, candCombined: c.combinedScore, candSemantic: c.semanticScore, candVerdict: c.semanticVerdict });
91
+ else if (!b.pass && c.pass) flips.fail_to_pass.push({ id, intent: b.intent, baseScore: b.validationScore, candCombined: c.combinedScore, candSemantic: c.semanticScore, candVerdict: c.semanticVerdict });
92
+ }
93
+
94
+ console.log(`## Pass-flip diagnostics\n`);
95
+ console.log(`- pass → fail (regressions to investigate): **${flips.pass_to_fail.length}**`);
96
+ console.log(`- fail → pass (improvements): **${flips.fail_to_pass.length}**\n`);
97
+
98
+ function flipTable(rows, header) {
99
+ if (!rows.length) return;
100
+ console.log(`### ${header}\n`);
101
+ console.log(`| intent | base validationScore | cand combinedScore | cand semanticScore | cand verdict |`);
102
+ console.log(`|---|---:|---:|---:|---|`);
103
+ for (const r of rows.slice(0, 20)) {
104
+ console.log(`| ${r.intent} | ${fmt(r.baseScore)} | ${fmt(r.candCombined)} | ${fmt(r.candSemantic)} | ${fmt(r.candVerdict)} |`);
105
+ }
106
+ if (rows.length > 20) console.log(`\n_(${rows.length - 20} more rows omitted)_`);
107
+ console.log();
108
+ }
109
+ flipTable(flips.pass_to_fail, 'Regressions (pass → fail)');
110
+ flipTable(flips.fail_to_pass, 'Improvements (fail → pass)');
111
+
112
+ console.log(`---\n`);
113
+ console.log(`_Generated by \`packages/a2ui/mcp/scripts/semantic-stats.mjs\`_`);