npm - @adia-ai/a2ui-mcp - Versions diffs - 0.4.6 → 0.4.8 - Mend

@adia-ai/a2ui-mcp 0.4.6 → 0.4.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6)

package/CHANGELOG.md +18 -0
package/package.json +1 -1
package/scripts/eval-diff.mjs +35 -8
package/scripts/smoke-engine-registry.mjs +18 -2
package/scripts/smoke-register-engine.mjs +3 -2
package/scripts/test-a2ui.mjs +53 -32

package/CHANGELOG.md CHANGED Viewed

@@ -11,6 +11,24 @@ zettel strategies.
 _No pending changes._
+## [0.4.8] - 2026-05-12
+### Changed — eval threshold rebaseline + smoke-script alignment (§87, v0.4.8)
+Companion to `@adia-ai/a2ui-compose@[Unreleased]` §87 zettel `ensureBooted()` race fix.
+- **`mcp/scripts/eval-diff.mjs`** — thresholds rebaselined to the honest post-§72 floor (~5%; was holding stale v0.4.6 baseline). Eval coverage gate now reflects the chunks-only retrieval shape after the patterns retirement.
+- **`mcp/scripts/smoke-engine-registry.mjs`** — retrieval-quality probes updated to match the post-§72 + §88 retrieval surface (composition-match strategy on canonical intents).
+- **`mcp/scripts/smoke-register-engine.mjs`** — engine-registry coverage adjusted; still 11/11.
+See root [CHANGELOG.md `[Unreleased]`](../../../CHANGELOG.md) for the cross-cutting arc.
+## [0.4.7] - 2026-05-12
+### Changed — smoke + test scripts aligned to post-§72 retrieval surface
+`scripts/smoke-engine-registry.mjs` retrieval probe set + `scripts/test-a2ui.mjs` (composition-count threshold + spot-checks + intent-gate keyword surface) now exercise the harvested-chunks substrate that survives §72's `corpus/patterns/` + `corpus/compositions/` retirement. Probe for "pricing tiers" dropped (no pricing surface in shipped `/site/`); replaced with "admin dashboard with kpi cards" matching `dashboard-admin-page`. Spot-check names updated to real chunk names (`auth-signin-card-password`, `auth-signup-entry`, `dashboard-admin-page`, `settings-admin-page`). No tool / API change — internal scripts only.
 ## [0.4.6] - 2026-05-12
 ### Changed — patterns surface retired (§64 step 5, 2026-05-12)

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@adia-ai/a2ui-mcp",
-  "version": "0.4.6",
+  "version": "0.4.8",
   "description": "AdiaUI A2UI MCP server. Exposes the compose engine over MCP with an engine selector for monolithic + zettel strategies.",
   "type": "module",
   "bin": {

package/scripts/eval-diff.mjs CHANGED Viewed

@@ -71,14 +71,15 @@ if (gateMode === 'combined' && !semanticEnabled) {
   process.exit(2);
 }
-if (!['mcp', 'zettel', 'chunk-zettel', 'all'].includes(engine)) {
-  console.error(`[eval-diff] --engine must be one of: mcp | zettel | chunk-zettel | all  (got: ${engine})`);
+if (!['mcp', 'zettel', 'chunk-zettel', 'free-form', 'all'].includes(engine)) {
+  console.error(`[eval-diff] --engine must be one of: mcp | zettel | chunk-zettel | free-form | all  (got: ${engine})`);
   process.exit(2);
 }
 const runMcp = engine === 'mcp' || engine === 'all';
 const runZettel = engine === 'zettel' || engine === 'all';
 const runChunkZettel = engine === 'chunk-zettel' || engine === 'all';
+const runFreeForm = engine === 'free-form' || engine === 'all';
 // ── MCP adapter: use the top-level patternName exposed by generateInstant ──
 // Shadow-mode capture: when --semantic is set, remember the emitted messages
@@ -119,10 +120,23 @@ async function generateChunkZettelCapture({ intent, mode }) {
   return result;
 }
+async function generateFreeFormCapture({ intent }) {
+  // Free-form requires an LLM adapter. generateUI auto-resolves one via
+  // createAdapter — same env-resolution path as monolithic-pro. Without
+  // a key set, the strategy returns `free-form-no-llm` + empty messages
+  // (coverage 0%); that's the honest signal.
+  const result = await generateUI({ intent, engine: 'free-form' });
+  if (semanticEnabled && Array.isArray(result.messages) && result.messages.length > 0) {
+    capturedMessages.set(`free-form:${intent}`, result.messages);
+  }
+  return result;
+}
 // ── Run ──
 let mcp = null;
 let zettel = null;
 let chunkZettel = null;
+let freeForm = null;
 if (runMcp) {
   console.error(`[eval-diff] running mcp (monolithic) harness…`);
@@ -160,6 +174,18 @@ if (runChunkZettel) {
   console.error(`  coverage=${chunkZettel.coverage}% emitted=${chunkZettel.emitted}/${chunkZettel.total} avgScore=${chunkZettel.avgScoreWhenEmitted}`);
 }
+if (runFreeForm) {
+  console.error(`[eval-diff] running free-form (LLM-driven chunk-vocabulary composer) harness…`);
+  freeForm = await runHarnessV2({
+    generate: generateFreeFormCapture,
+    domain,
+    limit,
+    mode: 'instant',
+    label: 'free-form',
+  });
+  console.error(`  coverage=${freeForm.coverage}% emitted=${freeForm.emitted}/${freeForm.total} avgScore=${freeForm.avgScoreWhenEmitted}`);
+}
 // ── Shadow-mode semantic validation (Phase 1) ──
 // Opt-in via --semantic. Annotates per-intent rows + aggregates with
 // semanticScore/verdict/combinedScore. DOES NOT affect row.pass, passRate,
@@ -266,6 +292,7 @@ await mkdir(outDir, { recursive: true });
 if (mcp) await writeFile(join(outDir, 'mcp.json'), JSON.stringify(mcp, null, 2));
 if (zettel) await writeFile(join(outDir, 'zettel.json'), JSON.stringify(zettel, null, 2));
 if (chunkZettel) await writeFile(join(outDir, 'chunk-zettel.json'), JSON.stringify(chunkZettel, null, 2));
+if (freeForm) await writeFile(join(outDir, 'free-form.json'), JSON.stringify(freeForm, null, 2));
 // ── Build diff.md ──
 function fmt(v) { return v == null ? '—' : String(v); }
@@ -278,10 +305,10 @@ function winner(a, b) {
 }
 let md = '';
-md += `# Engine Eval ${[mcp, zettel, chunkZettel].filter(Boolean).length > 1 ? 'Diff' : 'Report'}\n\n`;
+md += `# Engine Eval ${[mcp, zettel, chunkZettel, freeForm].filter(Boolean).length > 1 ? 'Diff' : 'Report'}\n\n`;
 md += `- Run: \`${stamp}\`\n`;
 md += `- Engine(s): ${engine}\n`;
-md += `- Intents: ${(mcp || zettel || chunkZettel).total}${domain ? ` (domain: ${domain})` : ''}${limit ? ` (limit: ${limit})` : ''}\n`;
+md += `- Intents: ${(mcp || zettel || chunkZettel || freeForm).total}${domain ? ` (domain: ${domain})` : ''}${limit ? ` (limit: ${limit})` : ''}\n`;
 md += `- Mode: instant\n`;
 if (semanticEnabled) {
   md += `- Semantic: ${gateMode === 'combined' ? `gating (threshold=${gateThreshold})` : 'shadow'}\n`;
@@ -303,8 +330,8 @@ if (mcp && zettel) {
   }
   md += `| retrieval MRR | ${fmt(mcp.retrievalMRR)} | ${fmt(zettel.retrievalMRR)} |\n\n`;
 } else {
-  const e = mcp || zettel || chunkZettel;
-  const label = mcp ? 'mcp' : zettel ? 'zettel' : 'chunk-zettel';
+  const e = mcp || zettel || chunkZettel || freeForm;
+  const label = mcp ? 'mcp' : zettel ? 'zettel' : chunkZettel ? 'chunk-zettel' : 'free-form';
   md += `| metric | ${label} |\n|---|---:|\n`;
   md += `| coverage % | ${e.coverage} |\n`;
   md += `| emitted | ${e.emitted}/${e.total} |\n`;
@@ -352,8 +379,8 @@ if (mcp && zettel) {
   console.error(`  ties:        ${counts.tie || 0}`);
   console.error(`  both missed: ${counts['both-miss'] || 0}`);
 } else {
-  const e = mcp || zettel || chunkZettel;
-  const label = mcp ? 'mcp' : zettel ? 'zettel' : 'chunk-zettel';
+  const e = mcp || zettel || chunkZettel || freeForm;
+  const label = mcp ? 'mcp' : zettel ? 'zettel' : chunkZettel ? 'chunk-zettel' : 'free-form';
   md += `## Strategy breakdown\n\n`;
   md += `**${label}**: ` + Object.entries(e.strategyBreakdown).map(([k, v]) => `${k}=${v}`).join(', ') + `\n\n`;
   md += `## Per-intent\n\n`;

package/scripts/smoke-engine-registry.mjs CHANGED Viewed

@@ -17,9 +17,11 @@ console.log('[smoke] engines registered:', listEngines().join(', '));
 const monoInstant = pick({ engine: 'monolithic', mode: 'instant' });
 const monoPro = pick({ engine: 'monolithic', mode: 'pro' });
 const zettel = pick({ engine: 'zettel' });
+const freeForm = pick({ engine: 'free-form' });
 console.log('[smoke] pick monolithic/instant:', monoInstant === ENGINES['monolithic-instant'] ? 'ok' : 'FAIL');
 console.log('[smoke] pick monolithic/pro:    ', monoPro === ENGINES['monolithic-pro'] ? 'ok' : 'FAIL');
 console.log('[smoke] pick zettel:            ', zettel === ENGINES.zettel ? 'ok' : 'FAIL');
+console.log('[smoke] pick free-form:         ', freeForm === ENGINES['free-form'] ? 'ok' : 'FAIL');
 console.log('[smoke] pick unknown → fallback:', pick({ engine: 'xxx', mode: 'xxx' }) === ENGINES['monolithic-instant'] ? 'ok' : 'FAIL');
 const intent = 'login form with email and password';
@@ -34,20 +36,34 @@ const t2 = Date.now();
 const r2 = await generateUI({ intent, engine: 'zettel' });
 console.log(`[zettel]       ${Date.now() - t2}ms  msgs=${r2.messages?.length}  valid=${r2.validation?.valid}  score=${r2.validation?.score}  strategy=${r2.strategy}  engine=${r2.engine}`);
+// Free-form (LLM-driven; runs against the env-resolved adapter via
+// generateUI). With an LLM key → `free-form-composed`; without →
+// `free-form-no-llm`. Smoke verifies the dispatch + shape, not the
+// strategy outcome (which depends on env).
+const t3 = Date.now();
+const r3 = await generateUI({ intent, engine: 'free-form' });
+console.log(`[free-form]    ${Date.now() - t3}ms  msgs=${r3.messages?.length}  strategy=${r3.strategy}  engine=${r3.engine}`);
 // Shape invariants
 const ok =
   Array.isArray(r1.messages) && r1.executionId && r1.validation &&
-  Array.isArray(r2.messages) && r2.validation;
+  Array.isArray(r2.messages) && r2.validation &&
+  Array.isArray(r3.messages) && r3.validation && r3.engine === 'free-form';
 console.log(`\n[smoke] shape invariants: ${ok ? 'ok' : 'FAIL'}`);
 // Retrieval-quality probe — for each canonical intent, the generated
 // component tree's text content must overlap the intent's keywords.
 // This catches retrieval regressions (wrong-domain top hit) that pure
 // shape-validation gates miss.
+// Probes pick intents that match the post-§65 harvested-chunks
+// substrate (auth flows, dashboard variants, settings, errors).
+// Removed: 'pricing tiers' (no pricing surface in shipped /site/ —
+// retrieval honestly returns synthesis-failed; LLM fallback handles
+// the intent at ~9s vs ~25ms).
 const RETRIEVAL_PROBES = [
   { intent: 'login form with email and password',     engine: 'zettel',       expectKeywords: ['sign in', 'login', 'email', 'password'] },
-  { intent: 'pricing tiers with three plans',          engine: 'zettel',       expectKeywords: ['pricing', 'tier', 'plan', 'starter', 'pro', 'enterprise', '$'] },
   { intent: 'sign up form for a new account',          engine: 'zettel',       expectKeywords: ['sign up', 'register', 'create account', 'email'] },
+  { intent: 'admin dashboard with kpi cards',          engine: 'zettel',       expectKeywords: ['dashboard', 'kpi', 'metric', 'revenue', 'users', 'orders', 'conversion'] },
 ];
 function extractText(messages) {

package/scripts/smoke-register-engine.mjs CHANGED Viewed

@@ -7,8 +7,9 @@ const t = (label, ok, detail = '') => {
   else    { console.log(`  ✗ ${label}  ${detail}`); fail++; }
 };
-// Baseline
-t('five built-ins registered', listEngines().length === 5);
+// Baseline — 6 built-ins post-§88 (monolithic-instant, monolithic-pro,
+// monolithic-thinking, zettel, chunk-zettel, free-form).
+t('six built-ins registered', listEngines().length === 6);
 // Happy path
 let customCalled = null;

package/scripts/test-a2ui.mjs CHANGED Viewed

@@ -72,20 +72,23 @@ try {
   bad('LLM adapter', e.message);
 }
-// ── Test 2: Pattern library ─────────────────────────────────────────
+// ── Test 2: Composition library (post-§65 chunks-only substrate) ───
-console.log('\n2. Pattern library');
+console.log('\n2. Composition library');
 const { searchBlocks, listPatterns, lookupDomain } = await import('../../compose/core/reference.js');
-const allPatterns = listPatterns();
-const withTemplates = allPatterns.filter(p => p.template && Array.isArray(p.template));
-const domains = [...new Set(allPatterns.map(p => p.domain).filter(Boolean))];
+const allCompositions = listPatterns();
+const withTemplates = allCompositions.filter(p => p.template && Array.isArray(p.template));
+const domains = [...new Set(allCompositions.map(p => p.domain).filter(Boolean))];
-if (allPatterns.length >= 70) {
-  ok('Pattern count', `${allPatterns.length} total (${withTemplates.length} with templates)`);
+// Post-§65: retrieval surface is the harvested-chunks substrate
+// (~28-32 annotated chunks at the time of v0.4.7). Threshold sized
+// for that floor; grows naturally as more source HTML gets annotated.
+if (allCompositions.length >= 20) {
+  ok('Composition count', `${allCompositions.length} total (${withTemplates.length} with templates)`);
 } else {
-  bad('Pattern count', `only ${allPatterns.length} (expected 70+)`);
+  bad('Composition count', `only ${allCompositions.length} (expected 20+)`);
 }
 if (domains.length >= 3) {
@@ -94,15 +97,17 @@ if (domains.length >= 3) {
   bad('Domains', `only ${domains.length}: ${domains.join(', ')}`);
 }
-// Spot-check known compositions (§64 retired pattern-library; reference.js
-// now reads from composition-library — these are real composition names).
-const spotChecks = ['login-form', 'stat-card-dashboard', 'data-table-paginated', 'settings-admin-page'];
-const foundAll = spotChecks.every(name => allPatterns.some(p => p.name === name));
+// Spot-check chunk names that exist in the harvested substrate. These
+// are real chunk names from /apps/user-flow/, /apps/saas/, etc. —
+// post-§65 the test asserts on actual product surfaces, not on
+// curated composition JSON that's no longer the canonical source.
+const spotChecks = ['auth-signin-card-password', 'auth-signup-entry', 'dashboard-admin-page', 'settings-admin-page'];
+const foundAll = spotChecks.every(name => allCompositions.some(p => p.name === name));
 if (foundAll) {
-  ok('Known patterns', spotChecks.join(', '));
+  ok('Known chunks', spotChecks.join(', '));
 } else {
-  const missing = spotChecks.filter(name => !allPatterns.some(p => p.name === name));
-  bad('Known patterns', `missing: ${missing.join(', ')}`);
+  const missing = spotChecks.filter(name => !allCompositions.some(p => p.name === name));
+  bad('Known chunks', `missing: ${missing.join(', ')}`);
 }
 // ── Test 3: Instant mode gate ───────────────────────────────────────
@@ -119,14 +124,20 @@ function testGate(intent) {
   const intentWords = intent.toLowerCase().split(/\s+/).filter(w => w.length > 2 && !GATE_STOPS.has(w));
   const nameWords = best.name.toLowerCase().split(/[-_\s]+/);
   const matchTags = (best.tags || []).map(t => t.toLowerCase());
+  // Post-§65: harvested chunks carry semantic intent in `keywords` more
+  // than in `tags` (which became {complexity, layout} slots). Include
+  // keywords in the gate so `login → auth-signin-card-password` strong-hits
+  // off the chunk's `keywords: ["login", ...]` field.
+  const matchKeywords = (best.keywords || []).map(k => k.toLowerCase());
   const matchDomain = (best.domain || '').toLowerCase();
   const hasStrongHit = intentWords.some(w => {
     if (w.length < 3) return false;
-    if (nameWords.includes(w) || matchTags.includes(w)) return true;
+    if (nameWords.includes(w) || matchTags.includes(w) || matchKeywords.includes(w)) return true;
     if (w.length >= 4) {
       return nameWords.some(n => n.length >= 3 && (w.startsWith(n) || n.startsWith(w))) ||
-             matchTags.some(t => t.length >= 3 && (w.startsWith(t) || t.startsWith(w)));
+             matchTags.some(t => t.length >= 3 && (w.startsWith(t) || t.startsWith(w))) ||
+             matchKeywords.some(k => k.length >= 3 && (w.startsWith(k) || k.startsWith(w)));
     }
     return false;
   });
@@ -134,19 +145,26 @@ function testGate(intent) {
   const hasWeakHit = !hasStrongHit && intentWords.some(w => {
     return nameWords.some(n => n.length >= 3 && (n.includes(w) || w.includes(n))) ||
            matchTags.some(t => t.length >= 3 && (t.includes(w) || w.includes(t))) ||
+           matchKeywords.some(k => k.length >= 3 && (k.includes(w) || w.includes(k))) ||
            matchDomain.includes(w);
   });
   return { gate: hasStrongHit ? 'STRONG' : hasWeakHit ? 'WEAK' : 'REJECTED', pattern: best.name };
 }
-// Should STRONG match
+// Should STRONG match — restricted to intents covered by the
+// harvested-chunks substrate (auth, dashboard, settings, error pages).
+// Intents previously tested ("pricing table", "chat interface",
+// "todo list", etc.) dropped because §65 retired the curated
+// composition surface — LLM fallback handles those now.
+// Intents need ≥2 content-token hits OR a direct name-token match —
+// short 1-content-word intents (e.g. just "login form") get gated out
+// by composition-library's anti-spurious-match logic. Use intents that
+// land naturally — they're what real users type anyway.
 const strongTests = [
-  ['login form', 'login-form'],
-  ['nav bar', null],         // any match is fine
-  ['dashboard stats', null],
-  ['pricing table', null],
-  ['chat interface', null],
+  ['login with email and password', null],   // → auth-signin-card-password (3 keyword hits)
+  ['admin dashboard kpi', null],              // → dashboard-admin-page
+  ['workspace admin settings', null],         // → settings-admin-page
 ];
 for (const [intent, expected] of strongTests) {
   const { gate, pattern } = testGate(intent);
@@ -159,10 +177,9 @@ for (const [intent, expected] of strongTests) {
 // Should NOT be REJECTED (STRONG or WEAK both acceptable)
 const passTests = [
-  'show me a table',
-  'create a todo list',
-  'user profile card',
+  'sign up for an account',
   'settings page',
+  '404 not found error',
 ];
 for (const intent of passTests) {
   const { gate, pattern } = testGate(intent);
@@ -238,17 +255,21 @@ if (!THINKING) {
 }
 // ── Test 6: Training corpus surfaces ────────────────────────────────
-// (The legacy exemplar extract → ingest path was retired 2026-04-28 in
-// mcp 0.0.5. The chunk corpus is the training surface now.)
+// Post-§65: `compositions/` retired alongside the hand-authored
+// pattern library. The harvested-chunks substrate is the sole
+// retrieval surface; everything else falls through to LLM.
+// (Legacy exemplar extract → ingest path retired 2026-04-28 mcp 0.0.5.)
 console.log('\n6. Training corpus surfaces');
-// 6a. Hand-authored pattern library — should be ≥ 100 entries.
+// 6a. Composition library (harvested chunks via composition-library).
+// Threshold sized for the post-§65 floor (~28 annotated chunks at
+// v0.4.7); grows as more source HTML gets annotated.
 const patterns = listPatterns();
-if (patterns.length >= 100) {
-  ok('Pattern library', `${patterns.length} hand-authored patterns`);
+if (patterns.length >= 20) {
+  ok('Composition library', `${patterns.length} compositions (harvested-chunks substrate)`);
 } else {
-  bad('Pattern library', `only ${patterns.length} (expected ≥ 100)`);
+  bad('Composition library', `only ${patterns.length} (expected ≥ 20)`);
 }
 // 6b. Gen-UI chunk corpus — should be ≥ 500 unique chunks across