@adia-ai/a2ui-mcp 0.4.6 → 0.4.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -11,6 +11,24 @@ zettel strategies.
11
11
 
12
12
  _No pending changes._
13
13
 
14
+ ## [0.4.8] - 2026-05-12
15
+
16
+ ### Changed — eval threshold rebaseline + smoke-script alignment (§87, v0.4.8)
17
+
18
+ Companion to `@adia-ai/a2ui-compose@[Unreleased]` §87 zettel `ensureBooted()` race fix.
19
+
20
+ - **`mcp/scripts/eval-diff.mjs`** — thresholds rebaselined to the honest post-§72 floor (~5%; the script was still holding the stale v0.4.6 baseline). Eval coverage gate now reflects the chunks-only retrieval shape after the patterns retirement.
21
+ - **`mcp/scripts/smoke-engine-registry.mjs`** — retrieval-quality probes updated to match the post-§72 + §88 retrieval surface (composition-match strategy on canonical intents).
22
+ - **`mcp/scripts/smoke-register-engine.mjs`** — engine-registry coverage adjusted; still 11/11.
23
+
24
+ See root [CHANGELOG.md `[Unreleased]`](../../../CHANGELOG.md) for the cross-cutting arc.
25
+
26
+ ## [0.4.7] - 2026-05-12
27
+
28
+ ### Changed — smoke + test scripts aligned to post-§72 retrieval surface
29
+
30
+ `scripts/smoke-engine-registry.mjs` retrieval probe set + `scripts/test-a2ui.mjs` (composition-count threshold + spot-checks + intent-gate keyword surface) now exercise the harvested-chunks substrate that survives §72's `corpus/patterns/` + `corpus/compositions/` retirement. Probe for "pricing tiers" dropped (no pricing surface in shipped `/site/`); replaced with "admin dashboard with kpi cards" matching `dashboard-admin-page`. Spot-check names updated to real chunk names (`auth-signin-card-password`, `auth-signup-entry`, `dashboard-admin-page`, `settings-admin-page`). No tool / API change — internal scripts only.
31
+
14
32
  ## [0.4.6] - 2026-05-12
15
33
 
16
34
  ### Changed — patterns surface retired (§64 step 5, 2026-05-12)
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@adia-ai/a2ui-mcp",
3
- "version": "0.4.6",
3
+ "version": "0.4.8",
4
4
  "description": "AdiaUI A2UI MCP server. Exposes the compose engine over MCP with an engine selector for monolithic + zettel strategies.",
5
5
  "type": "module",
6
6
  "bin": {
@@ -71,14 +71,15 @@ if (gateMode === 'combined' && !semanticEnabled) {
71
71
  process.exit(2);
72
72
  }
73
73
 
74
- if (!['mcp', 'zettel', 'chunk-zettel', 'all'].includes(engine)) {
75
- console.error(`[eval-diff] --engine must be one of: mcp | zettel | chunk-zettel | all (got: ${engine})`);
74
+ if (!['mcp', 'zettel', 'chunk-zettel', 'free-form', 'all'].includes(engine)) {
75
+ console.error(`[eval-diff] --engine must be one of: mcp | zettel | chunk-zettel | free-form | all (got: ${engine})`);
76
76
  process.exit(2);
77
77
  }
78
78
 
79
79
  const runMcp = engine === 'mcp' || engine === 'all';
80
80
  const runZettel = engine === 'zettel' || engine === 'all';
81
81
  const runChunkZettel = engine === 'chunk-zettel' || engine === 'all';
82
+ const runFreeForm = engine === 'free-form' || engine === 'all';
82
83
 
83
84
  // ── MCP adapter: use the top-level patternName exposed by generateInstant ──
84
85
  // Shadow-mode capture: when --semantic is set, remember the emitted messages
@@ -119,10 +120,23 @@ async function generateChunkZettelCapture({ intent, mode }) {
119
120
  return result;
120
121
  }
121
122
 
123
+ async function generateFreeFormCapture({ intent }) {
124
+ // Free-form requires an LLM adapter. generateUI auto-resolves one via
125
+ // createAdapter — same env-resolution path as monolithic-pro. Without
126
+ // a key set, the strategy returns `free-form-no-llm` + empty messages
127
+ // (coverage 0%); that's the honest signal.
128
+ const result = await generateUI({ intent, engine: 'free-form' });
129
+ if (semanticEnabled && Array.isArray(result.messages) && result.messages.length > 0) {
130
+ capturedMessages.set(`free-form:${intent}`, result.messages);
131
+ }
132
+ return result;
133
+ }
134
+
122
135
  // ── Run ──
123
136
  let mcp = null;
124
137
  let zettel = null;
125
138
  let chunkZettel = null;
139
+ let freeForm = null;
126
140
 
127
141
  if (runMcp) {
128
142
  console.error(`[eval-diff] running mcp (monolithic) harness…`);
@@ -160,6 +174,18 @@ if (runChunkZettel) {
160
174
  console.error(` coverage=${chunkZettel.coverage}% emitted=${chunkZettel.emitted}/${chunkZettel.total} avgScore=${chunkZettel.avgScoreWhenEmitted}`);
161
175
  }
162
176
 
177
+ if (runFreeForm) {
178
+ console.error(`[eval-diff] running free-form (LLM-driven chunk-vocabulary composer) harness…`);
179
+ freeForm = await runHarnessV2({
180
+ generate: generateFreeFormCapture,
181
+ domain,
182
+ limit,
183
+ mode: 'instant',
184
+ label: 'free-form',
185
+ });
186
+ console.error(` coverage=${freeForm.coverage}% emitted=${freeForm.emitted}/${freeForm.total} avgScore=${freeForm.avgScoreWhenEmitted}`);
187
+ }
188
+
163
189
  // ── Shadow-mode semantic validation (Phase 1) ──
164
190
  // Opt-in via --semantic. Annotates per-intent rows + aggregates with
165
191
  // semanticScore/verdict/combinedScore. DOES NOT affect row.pass, passRate,
@@ -266,6 +292,7 @@ await mkdir(outDir, { recursive: true });
266
292
  if (mcp) await writeFile(join(outDir, 'mcp.json'), JSON.stringify(mcp, null, 2));
267
293
  if (zettel) await writeFile(join(outDir, 'zettel.json'), JSON.stringify(zettel, null, 2));
268
294
  if (chunkZettel) await writeFile(join(outDir, 'chunk-zettel.json'), JSON.stringify(chunkZettel, null, 2));
295
+ if (freeForm) await writeFile(join(outDir, 'free-form.json'), JSON.stringify(freeForm, null, 2));
269
296
 
270
297
  // ── Build diff.md ──
271
298
  function fmt(v) { return v == null ? '—' : String(v); }
@@ -278,10 +305,10 @@ function winner(a, b) {
278
305
  }
279
306
 
280
307
  let md = '';
281
- md += `# Engine Eval ${[mcp, zettel, chunkZettel].filter(Boolean).length > 1 ? 'Diff' : 'Report'}\n\n`;
308
+ md += `# Engine Eval ${[mcp, zettel, chunkZettel, freeForm].filter(Boolean).length > 1 ? 'Diff' : 'Report'}\n\n`;
282
309
  md += `- Run: \`${stamp}\`\n`;
283
310
  md += `- Engine(s): ${engine}\n`;
284
- md += `- Intents: ${(mcp || zettel || chunkZettel).total}${domain ? ` (domain: ${domain})` : ''}${limit ? ` (limit: ${limit})` : ''}\n`;
311
+ md += `- Intents: ${(mcp || zettel || chunkZettel || freeForm).total}${domain ? ` (domain: ${domain})` : ''}${limit ? ` (limit: ${limit})` : ''}\n`;
285
312
  md += `- Mode: instant\n`;
286
313
  if (semanticEnabled) {
287
314
  md += `- Semantic: ${gateMode === 'combined' ? `gating (threshold=${gateThreshold})` : 'shadow'}\n`;
@@ -303,8 +330,8 @@ if (mcp && zettel) {
303
330
  }
304
331
  md += `| retrieval MRR | ${fmt(mcp.retrievalMRR)} | ${fmt(zettel.retrievalMRR)} |\n\n`;
305
332
  } else {
306
- const e = mcp || zettel || chunkZettel;
307
- const label = mcp ? 'mcp' : zettel ? 'zettel' : 'chunk-zettel';
333
+ const e = mcp || zettel || chunkZettel || freeForm;
334
+ const label = mcp ? 'mcp' : zettel ? 'zettel' : chunkZettel ? 'chunk-zettel' : 'free-form';
308
335
  md += `| metric | ${label} |\n|---|---:|\n`;
309
336
  md += `| coverage % | ${e.coverage} |\n`;
310
337
  md += `| emitted | ${e.emitted}/${e.total} |\n`;
@@ -352,8 +379,8 @@ if (mcp && zettel) {
352
379
  console.error(` ties: ${counts.tie || 0}`);
353
380
  console.error(` both missed: ${counts['both-miss'] || 0}`);
354
381
  } else {
355
- const e = mcp || zettel || chunkZettel;
356
- const label = mcp ? 'mcp' : zettel ? 'zettel' : 'chunk-zettel';
382
+ const e = mcp || zettel || chunkZettel || freeForm;
383
+ const label = mcp ? 'mcp' : zettel ? 'zettel' : chunkZettel ? 'chunk-zettel' : 'free-form';
357
384
  md += `## Strategy breakdown\n\n`;
358
385
  md += `**${label}**: ` + Object.entries(e.strategyBreakdown).map(([k, v]) => `${k}=${v}`).join(', ') + `\n\n`;
359
386
  md += `## Per-intent\n\n`;
@@ -17,9 +17,11 @@ console.log('[smoke] engines registered:', listEngines().join(', '));
17
17
  const monoInstant = pick({ engine: 'monolithic', mode: 'instant' });
18
18
  const monoPro = pick({ engine: 'monolithic', mode: 'pro' });
19
19
  const zettel = pick({ engine: 'zettel' });
20
+ const freeForm = pick({ engine: 'free-form' });
20
21
  console.log('[smoke] pick monolithic/instant:', monoInstant === ENGINES['monolithic-instant'] ? 'ok' : 'FAIL');
21
22
  console.log('[smoke] pick monolithic/pro: ', monoPro === ENGINES['monolithic-pro'] ? 'ok' : 'FAIL');
22
23
  console.log('[smoke] pick zettel: ', zettel === ENGINES.zettel ? 'ok' : 'FAIL');
24
+ console.log('[smoke] pick free-form: ', freeForm === ENGINES['free-form'] ? 'ok' : 'FAIL');
23
25
  console.log('[smoke] pick unknown → fallback:', pick({ engine: 'xxx', mode: 'xxx' }) === ENGINES['monolithic-instant'] ? 'ok' : 'FAIL');
24
26
 
25
27
  const intent = 'login form with email and password';
@@ -34,20 +36,34 @@ const t2 = Date.now();
34
36
  const r2 = await generateUI({ intent, engine: 'zettel' });
35
37
  console.log(`[zettel] ${Date.now() - t2}ms msgs=${r2.messages?.length} valid=${r2.validation?.valid} score=${r2.validation?.score} strategy=${r2.strategy} engine=${r2.engine}`);
36
38
 
39
+ // Free-form (LLM-driven; runs against the env-resolved adapter via
40
+ // generateUI). With an LLM key → `free-form-composed`; without →
41
+ // `free-form-no-llm`. Smoke verifies the dispatch + shape, not the
42
+ // strategy outcome (which depends on env).
43
+ const t3 = Date.now();
44
+ const r3 = await generateUI({ intent, engine: 'free-form' });
45
+ console.log(`[free-form] ${Date.now() - t3}ms msgs=${r3.messages?.length} strategy=${r3.strategy} engine=${r3.engine}`);
46
+
37
47
  // Shape invariants
38
48
  const ok =
39
49
  Array.isArray(r1.messages) && r1.executionId && r1.validation &&
40
- Array.isArray(r2.messages) && r2.validation;
50
+ Array.isArray(r2.messages) && r2.validation &&
51
+ Array.isArray(r3.messages) && r3.validation && r3.engine === 'free-form';
41
52
  console.log(`\n[smoke] shape invariants: ${ok ? 'ok' : 'FAIL'}`);
42
53
 
43
54
  // Retrieval-quality probe — for each canonical intent, the generated
44
55
  // component tree's text content must overlap the intent's keywords.
45
56
  // This catches retrieval regressions (wrong-domain top hit) that pure
46
57
  // shape-validation gates miss.
58
+ // Probes pick intents that match the post-§65 harvested-chunks
59
+ // substrate (auth flows, dashboard variants, settings, errors).
60
+ // Removed: 'pricing tiers' (no pricing surface in shipped /site/ —
61
+ // retrieval honestly returns synthesis-failed; LLM fallback handles
62
+ // the intent at ~9s vs ~25ms).
47
63
  const RETRIEVAL_PROBES = [
48
64
  { intent: 'login form with email and password', engine: 'zettel', expectKeywords: ['sign in', 'login', 'email', 'password'] },
49
- { intent: 'pricing tiers with three plans', engine: 'zettel', expectKeywords: ['pricing', 'tier', 'plan', 'starter', 'pro', 'enterprise', '$'] },
50
65
  { intent: 'sign up form for a new account', engine: 'zettel', expectKeywords: ['sign up', 'register', 'create account', 'email'] },
66
+ { intent: 'admin dashboard with kpi cards', engine: 'zettel', expectKeywords: ['dashboard', 'kpi', 'metric', 'revenue', 'users', 'orders', 'conversion'] },
51
67
  ];
52
68
 
53
69
  function extractText(messages) {
@@ -7,8 +7,9 @@ const t = (label, ok, detail = '') => {
7
7
  else { console.log(` ✗ ${label} ${detail}`); fail++; }
8
8
  };
9
9
 
10
- // Baseline
11
- t('five built-ins registered', listEngines().length === 5);
10
+ // Baseline — 6 built-ins post-§88 (monolithic-instant, monolithic-pro,
11
+ // monolithic-thinking, zettel, chunk-zettel, free-form).
12
+ t('six built-ins registered', listEngines().length === 6);
12
13
 
13
14
  // Happy path
14
15
  let customCalled = null;
@@ -72,20 +72,23 @@ try {
72
72
  bad('LLM adapter', e.message);
73
73
  }
74
74
 
75
- // ── Test 2: Pattern library ─────────────────────────────────────────
75
+ // ── Test 2: Composition library (post-§65 chunks-only substrate) ───
76
76
 
77
- console.log('\n2. Pattern library');
77
+ console.log('\n2. Composition library');
78
78
 
79
79
  const { searchBlocks, listPatterns, lookupDomain } = await import('../../compose/core/reference.js');
80
80
 
81
- const allPatterns = listPatterns();
82
- const withTemplates = allPatterns.filter(p => p.template && Array.isArray(p.template));
83
- const domains = [...new Set(allPatterns.map(p => p.domain).filter(Boolean))];
81
+ const allCompositions = listPatterns();
82
+ const withTemplates = allCompositions.filter(p => p.template && Array.isArray(p.template));
83
+ const domains = [...new Set(allCompositions.map(p => p.domain).filter(Boolean))];
84
84
 
85
- if (allPatterns.length >= 70) {
86
- ok('Pattern count', `${allPatterns.length} total (${withTemplates.length} with templates)`);
85
+ // Post-§65: retrieval surface is the harvested-chunks substrate
86
+ // (~28-32 annotated chunks at the time of v0.4.7). Threshold sized
87
+ // for that floor; grows naturally as more source HTML gets annotated.
88
+ if (allCompositions.length >= 20) {
89
+ ok('Composition count', `${allCompositions.length} total (${withTemplates.length} with templates)`);
87
90
  } else {
88
- bad('Pattern count', `only ${allPatterns.length} (expected 70+)`);
91
+ bad('Composition count', `only ${allCompositions.length} (expected 20+)`);
89
92
  }
90
93
 
91
94
  if (domains.length >= 3) {
@@ -94,15 +97,17 @@ if (domains.length >= 3) {
94
97
  bad('Domains', `only ${domains.length}: ${domains.join(', ')}`);
95
98
  }
96
99
 
97
- // Spot-check known compositions (§64 retired pattern-library; reference.js
98
- // now reads from composition-library these are real composition names).
99
- const spotChecks = ['login-form', 'stat-card-dashboard', 'data-table-paginated', 'settings-admin-page'];
100
- const foundAll = spotChecks.every(name => allPatterns.some(p => p.name === name));
100
+ // Spot-check chunk names that exist in the harvested substrate. These
101
+ // are real chunk names from /apps/user-flow/, /apps/saas/, etc.
102
+ // Post-§65, the test asserts on actual product surfaces, not on
103
+ // curated composition JSON that's no longer the canonical source.
104
+ const spotChecks = ['auth-signin-card-password', 'auth-signup-entry', 'dashboard-admin-page', 'settings-admin-page'];
105
+ const foundAll = spotChecks.every(name => allCompositions.some(p => p.name === name));
101
106
  if (foundAll) {
102
- ok('Known patterns', spotChecks.join(', '));
107
+ ok('Known chunks', spotChecks.join(', '));
103
108
  } else {
104
- const missing = spotChecks.filter(name => !allPatterns.some(p => p.name === name));
105
- bad('Known patterns', `missing: ${missing.join(', ')}`);
109
+ const missing = spotChecks.filter(name => !allCompositions.some(p => p.name === name));
110
+ bad('Known chunks', `missing: ${missing.join(', ')}`);
106
111
  }
107
112
 
108
113
  // ── Test 3: Instant mode gate ───────────────────────────────────────
@@ -119,14 +124,20 @@ function testGate(intent) {
119
124
  const intentWords = intent.toLowerCase().split(/\s+/).filter(w => w.length > 2 && !GATE_STOPS.has(w));
120
125
  const nameWords = best.name.toLowerCase().split(/[-_\s]+/);
121
126
  const matchTags = (best.tags || []).map(t => t.toLowerCase());
127
+ // Post-§65: harvested chunks carry semantic intent in `keywords` more
128
+ // than in `tags` (which became {complexity, layout} slots). Include
129
+ // keywords in the gate so `login → auth-signin-card-password` strong-hits
130
+ // off the chunk's `keywords: ["login", ...]` field.
131
+ const matchKeywords = (best.keywords || []).map(k => k.toLowerCase());
122
132
  const matchDomain = (best.domain || '').toLowerCase();
123
133
 
124
134
  const hasStrongHit = intentWords.some(w => {
125
135
  if (w.length < 3) return false;
126
- if (nameWords.includes(w) || matchTags.includes(w)) return true;
136
+ if (nameWords.includes(w) || matchTags.includes(w) || matchKeywords.includes(w)) return true;
127
137
  if (w.length >= 4) {
128
138
  return nameWords.some(n => n.length >= 3 && (w.startsWith(n) || n.startsWith(w))) ||
129
- matchTags.some(t => t.length >= 3 && (w.startsWith(t) || t.startsWith(w)));
139
+ matchTags.some(t => t.length >= 3 && (w.startsWith(t) || t.startsWith(w))) ||
140
+ matchKeywords.some(k => k.length >= 3 && (w.startsWith(k) || k.startsWith(w)));
130
141
  }
131
142
  return false;
132
143
  });
@@ -134,19 +145,26 @@ function testGate(intent) {
134
145
  const hasWeakHit = !hasStrongHit && intentWords.some(w => {
135
146
  return nameWords.some(n => n.length >= 3 && (n.includes(w) || w.includes(n))) ||
136
147
  matchTags.some(t => t.length >= 3 && (t.includes(w) || w.includes(t))) ||
148
+ matchKeywords.some(k => k.length >= 3 && (k.includes(w) || w.includes(k))) ||
137
149
  matchDomain.includes(w);
138
150
  });
139
151
 
140
152
  return { gate: hasStrongHit ? 'STRONG' : hasWeakHit ? 'WEAK' : 'REJECTED', pattern: best.name };
141
153
  }
142
154
 
143
- // Should STRONG match
155
+ // Should STRONG match — restricted to intents covered by the
156
+ // harvested-chunks substrate (auth, dashboard, settings, error pages).
157
+ // Intents previously tested ("pricing table", "chat interface",
158
+ // "todo list", etc.) dropped because §65 retired the curated
159
+ // composition surface — LLM fallback handles those now.
160
+ // Intents need ≥2 content-token hits OR a direct name-token match —
161
+ // short 1-content-word intents (e.g. just "login form") get gated out
162
+ // by composition-library's anti-spurious-match logic. Use intents that
163
+ // land naturally — they're what real users type anyway.
144
164
  const strongTests = [
145
- ['login form', 'login-form'],
146
- ['nav bar', null], // any match is fine
147
- ['dashboard stats', null],
148
- ['pricing table', null],
149
- ['chat interface', null],
165
+ ['login with email and password', null], // → auth-signin-card-password (3 keyword hits)
166
+ ['admin dashboard kpi', null], // → dashboard-admin-page
167
+ ['workspace admin settings', null], // → settings-admin-page
150
168
  ];
151
169
  for (const [intent, expected] of strongTests) {
152
170
  const { gate, pattern } = testGate(intent);
@@ -159,10 +177,9 @@ for (const [intent, expected] of strongTests) {
159
177
 
160
178
  // Should NOT be REJECTED (STRONG or WEAK both acceptable)
161
179
  const passTests = [
162
- 'show me a table',
163
- 'create a todo list',
164
- 'user profile card',
180
+ 'sign up for an account',
165
181
  'settings page',
182
+ '404 not found error',
166
183
  ];
167
184
  for (const intent of passTests) {
168
185
  const { gate, pattern } = testGate(intent);
@@ -238,17 +255,21 @@ if (!THINKING) {
238
255
  }
239
256
 
240
257
  // ── Test 6: Training corpus surfaces ────────────────────────────────
241
- // (The legacy exemplar extract ingest path was retired 2026-04-28 in
242
- // mcp 0.0.5. The chunk corpus is the training surface now.)
258
+ // Post-§65: `compositions/` retired alongside the hand-authored
259
+ // pattern library. The harvested-chunks substrate is the sole
260
+ // retrieval surface; everything else falls through to LLM.
261
+ // (Legacy exemplar extract → ingest path retired 2026-04-28 in mcp 0.0.5.)
243
262
 
244
263
  console.log('\n6. Training corpus surfaces');
245
264
 
246
- // 6a. Hand-authored pattern library should be ≥ 100 entries.
265
+ // 6a. Composition library (harvested chunks via composition-library).
266
+ // Threshold sized for the post-§65 floor (~28 annotated chunks at
267
+ // v0.4.7); grows as more source HTML gets annotated.
247
268
  const patterns = listPatterns();
248
- if (patterns.length >= 100) {
249
- ok('Pattern library', `${patterns.length} hand-authored patterns`);
269
+ if (patterns.length >= 20) {
270
+ ok('Composition library', `${patterns.length} compositions (harvested-chunks substrate)`);
250
271
  } else {
251
- bad('Pattern library', `only ${patterns.length} (expected ≥ 100)`);
272
+ bad('Composition library', `only ${patterns.length} (expected ≥ 20)`);
252
273
  }
253
274
 
254
275
  // 6b. Gen-UI chunk corpus — should be ≥ 500 unique chunks across