npm - @adia-ai/a2ui-mcp - Versions diffs - 0.5.0 → 0.5.2 - Mend

@adia-ai/a2ui-mcp 0.5.0 → 0.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3) hide show

package/CHANGELOG.md CHANGED Viewed

@@ -11,6 +11,37 @@ zettel strategies.
 _No pending changes._
+## [0.5.2] - 2026-05-13
+### Added — `eval:diff --report-substitutions` flag (§107a infra, v0.5.2)
+`packages/a2ui/mcp/scripts/eval-diff.mjs` learns a new `--report-substitutions` flag. When set, captures per-intent substitution data from free-form plans (available substitutable nodes across resolved ingredients vs substitutions the LLM actually emitted) + emits a `## Substitution coverage (§107a)` section in `diff.md` with overall ratio + 3-bucket histogram (<30% / 30-50% / ≥50%) + top-20 under-substitution table.
+Drives the F1-plateau question for v0.5.2: ratio <30% → §125 catalog-text sweep + §126 prompt iteration are high-leverage. Ratio >50% → F1 plateau is structural (catalog content or scorer artifact), revise the F1-lift plan.
+Pre-baseline measurement (run `2026-05-13T22-29-12-085Z`, `--limit 30`): **17.2% overall substitution ratio**, 17/30 intents in the <30% bucket. Decision-rule outcome: §125 + §126 are high-leverage. Companion to `@adia-ai/a2ui-compose`'s `plan` first-class graduation.
+Eval tooling only; no runtime behavior change.
+### Changed — drop dead `result._debug?.plan` fallback in `eval-diff.mjs` (§131, v0.5.2)
+`eval-diff.mjs` line 147 previously read `const plan = result.plan || result._debug?.plan || null;` — a defensive fallback to the pre-§107a soft-API path. Since `@adia-ai/a2ui-compose@0.5.2` no longer populates `_debug.plan` (§107a graduated it to first-class; §131 documents the volatility contract), the fallback is dead code. Removed.
+Eval tooling only; no runtime behavior change.
+### Added — `eval:diff --model <id>` flag for Haiku-vs-Opus A/B harness (§127 infra, v0.5.2)
+`packages/a2ui/mcp/scripts/eval-diff.mjs` adds a `--model <model-id>` flag. When set, exports `FREE_FORM_MODEL_OVERRIDE` env var before any dynamic strategy imports — the override propagates to `@adia-ai/a2ui-compose@strategies/registry.js generateFreeFormAdapter` which reads the env var at call-time (post-§127-companion change). Lets the §127 A/B harness run Opus + Haiku full-100 evals without env-var setup or process restart.
+Usage: `npm run eval:diff -- --engine free-form --model claude-opus-4-7 --report-substitutions`.
+Default unchanged: Haiku 4.5 pin (`claude-haiku-4-5-20251001`) holds when `--model` is unset. Eval tooling only; no runtime behavior change.
+## [0.5.1] - 2026-05-13
+_Lockstep ride-along (no source change)._
 ## [0.5.0] - 2026-05-13
 _Lockstep ride-along (no source change)._

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@adia-ai/a2ui-mcp",
-  "version": "0.5.0",
+  "version": "0.5.2",
   "description": "AdiaUI A2UI MCP server. Exposes the compose engine over MCP with an engine selector for monolithic + zettel strategies.",
   "type": "module",
   "bin": {

package/scripts/eval-diff.mjs CHANGED Viewed

@@ -46,9 +46,24 @@ const opt = (k) => {
 const engine = opt('engine') || 'all';
 const limit = opt('limit') ? Number(opt('limit')) : undefined;
 const domain = opt('domain');
+// §127 (v0.5.2): --model <id> override for free-form Haiku-vs-Opus A/B.
+// Sets FREE_FORM_MODEL_OVERRIDE env var BEFORE generateUI is imported; the
+// registry's generateFreeFormAdapter reads it at call time. Without --model,
+// the v0.5.1 §108 Haiku pin holds (claude-haiku-4-5-20251001).
+const modelOverride = opt('model');
+if (modelOverride) {
+  process.env.FREE_FORM_MODEL_OVERRIDE = modelOverride;
+}
 // Shadow-mode semantic validator (Phase 1). Opt-in; zero effect on gating
 // when --gate-mode=structural (default).
 const semanticEnabled = args.includes('--semantic');
+// §107a (v0.5.2): substitution-coverage report. When set, capture per-intent
+// substitution data from free-form plans → emit a "Substitution coverage"
+// section in diff.md. Targets the F1-plateau question: are we under-using
+// the substitution surface (lift via §107b/c worth the cost), or is F1
+// structural (catalog text quality or scorer artifact)?
+const reportSubstitutions = args.includes('--report-substitutions');
+const substitutionStats = []; // per-intent: { intent, ingredientCount, available, applied, ratio }
 // Phase 2 (gating mode). Default depends on --semantic:
 //   without --semantic → 'structural' (no semantic work; Phase 1 baseline).
 //   with --semantic    → 'combined' since v0.1.2 (Phase 2 promotion); was
@@ -129,6 +144,39 @@ async function generateFreeFormCapture({ intent }) {
   if (semanticEnabled && Array.isArray(result.messages) && result.messages.length > 0) {
     capturedMessages.set(`free-form:${intent}`, result.messages);
   }
+  // §107a: capture substitution-coverage data when requested. Counts
+  // substitutable nodes (Text/Button/Badge/Tag/Kbd/Icon/Image/Link) in the
+  // emitted component tree vs substitutions the LLM emitted in the plan.
+  if (reportSubstitutions && result.strategy === 'free-form-composed') {
+    const SUBSTITUTABLE = new Set(['Text', 'Button', 'Badge', 'Tag', 'Kbd', 'Icon', 'Image', 'Link']);
+    let available = 0;
+    let applied = 0;
+    const plan = result.plan || null;
+    if (Array.isArray(result.messages) && result.messages.length > 0) {
+      const components = result.messages[0]?.components || [];
+      // Count substitutable nodes in the emitted tree (excludes the root).
+      for (const c of components) {
+        if (c.id === 'free-form-root') continue;
+        if (SUBSTITUTABLE.has(c.component)) available += 1;
+      }
+    }
+    if (plan && Array.isArray(plan.ingredients)) {
+      for (const ing of plan.ingredients) {
+        if (ing?.substitutions && typeof ing.substitutions === 'object') {
+          applied += Object.keys(ing.substitutions).length;
+        }
+      }
+    }
+    substitutionStats.push({
+      intent,
+      ingredientCount: result.usedIngredients?.length || 0,
+      available,
+      applied,
+      ratio: available > 0 ? applied / available : 0,
+    });
+  }
   return result;
 }
@@ -389,6 +437,47 @@ if (mcp && zettel) {
     const intent = (r.intent || '').slice(0, 48).replace(/\|/g, '\\|');
     md += `| ${r.id} | ${fmt(r.domain)} | ${intent} | ${fmt(r.validationScore)} | ${fmt(r.componentF1)} | ${fmt(r.strategy)} |\n`;
   }
+  // §107a (v0.5.2): Substitution coverage section. Surfaces the
+  // ratio of LLM-applied substitutions to available substitutable
+  // nodes. Drives the F1-plateau question: <30% = §107b/c high-leverage;
+  // >50% = F1 plateau is structural.
+  if (reportSubstitutions && substitutionStats.length > 0) {
+    const total = substitutionStats.length;
+    let totalAvailable = 0;
+    let totalApplied = 0;
+    let bucketLow = 0;  // ratio < 0.3
+    let bucketMid = 0;  // 0.3 ≤ ratio < 0.5
+    let bucketHigh = 0; // ≥ 0.5
+    for (const s of substitutionStats) {
+      totalAvailable += s.available;
+      totalApplied += s.applied;
+      if (s.available === 0) continue;
+      if (s.ratio < 0.3) bucketLow += 1;
+      else if (s.ratio < 0.5) bucketMid += 1;
+      else bucketHigh += 1;
+    }
+    const overallRatio = totalAvailable > 0 ? (totalApplied / totalAvailable * 100).toFixed(1) : 'n/a';
+    md += `\n## Substitution coverage (§107a)\n\n`;
+    md += `| metric | value |\n|---|---:|\n`;
+    md += `| intents measured | ${total} |\n`;
+    md += `| total substitutable nodes (across ingredients) | ${totalAvailable} |\n`;
+    md += `| substitutions applied by LLM | ${totalApplied} |\n`;
+    md += `| **overall ratio** | **${overallRatio}%** |\n`;
+    md += `| intents with ratio < 30% | ${bucketLow} |\n`;
+    md += `| intents with 30% ≤ ratio < 50% | ${bucketMid} |\n`;
+    md += `| intents with ratio ≥ 50% | ${bucketHigh} |\n\n`;
+    md += `### Per-intent substitution detail (top 20 by under-substitution)\n\n`;
+    md += `| intent | ingredients | available | applied | ratio |\n|---|---:|---:|---:|---:|\n`;
+    const sorted = substitutionStats.slice().sort((a, b) => a.ratio - b.ratio).slice(0, 20);
+    for (const s of sorted) {
+      const intent = s.intent.slice(0, 48).replace(/\|/g, '\\|');
+      md += `| ${intent} | ${s.ingredientCount} | ${s.available} | ${s.applied} | ${(s.ratio * 100).toFixed(0)}% |\n`;
+    }
+    md += `\n**Decision rule**: ratio < 30% → §107b/c high-leverage (catalog-text quality + system-prompt push). ratio > 50% → F1 plateau is structural; revise the F1 lift plan.\n`;
+  }
   await writeFile(join(outDir, 'diff.md'), md);
   console.error(`\n[eval-diff] wrote ${outDir}`);
 }