@adia-ai/a2ui-mcp 0.3.0 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -11,6 +11,17 @@ zettel strategies.
11
11
 
12
12
  _No pending changes._
13
13
 
14
+ ## [0.3.1] - 2026-05-06
15
+
16
+ **9-package lockstep patch cut.** All 9 published `@adia-ai/*` packages bump 0.3.0 → 0.3.1 per [`docs/specs/package-architecture.md` § 15](../../../docs/specs/package-architecture.md#15-versioning-policy). Internal `@adia-ai/*` dep ranges remain at `^0.3.0` (covers `0.3.1` under semver — patch-cut asymmetry).
17
+
18
+ This package itself ships **no source changes** in v0.3.1. The cut bumps version only — substantive content lives in [`@adia-ai/web-components`](https://github.com/adiahealth/gen-ui-kit/releases/tag/web-components-v0.3.1) (folder-per-trait restructure; barrel API unchanged).
19
+
20
+ ### Changed
21
+
22
+ - `version`: `0.3.0` → `0.3.1`.
23
+ - Internal `@adia-ai/*` dep ranges: unchanged at `^0.3.0` (covers `0.3.1` under semver — patch-cut asymmetry).
24
+
14
25
  ## [0.3.0] - 2026-05-05
15
26
 
16
27
  **9-package lockstep cut + new `@adia-ai/llm` dep.** All 9 published `@adia-ai/*` packages bump 0.2.5 → 0.3.0 per [`docs/specs/package-architecture.md` § 15](../../../docs/specs/package-architecture.md#15-versioning-policy). Internal `@adia-ai/*` dep ranges bump `^0.2.0` → `^0.3.0`.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@adia-ai/a2ui-mcp",
3
- "version": "0.3.0",
3
+ "version": "0.3.2",
4
4
  "description": "AdiaUI A2UI MCP server. Exposes the compose engine over MCP with an engine selector for monolithic + zettel strategies.",
5
5
  "type": "module",
6
6
  "bin": {
@@ -33,4 +33,4 @@
33
33
  "@adia-ai/llm": "^0.3.0",
34
34
  "zod": "^3.24.0"
35
35
  }
36
- }
36
+ }
@@ -24,11 +24,9 @@
24
24
  * - Retrieval (20%) — top-k retrieved chunks include the intent's
25
25
  * `expected_chunk` (when set). Soft-asserted; absence flags but
26
26
  * doesn't fail.
27
- * - Render fidelity (30%) — DEFERRED. Real implementation needs
28
- * Playwright headless render + console-error capture; this runner
29
- * emits a placeholder pending the render-fidelity smoke wiring.
30
- * The composite re-distributes its weight pro-rata across the
31
- * three remaining components when render is null.
27
+ * - Render fidelity (30%) — Playwright headless render + console-error
28
+ * capture via `render-fidelity.mjs`. Verifies the produced markup is
29
+ * consumer-correct, not just structurally valid.
32
30
  *
33
31
  * Exit:
34
32
  * 0 if avg ≥ 80 (passes the chunk-zettel promotion gate threshold)
@@ -49,7 +47,8 @@ import path from 'node:path';
49
47
  import { fileURLToPath } from 'node:url';
50
48
 
51
49
  import { composeFromIntent } from '../../compose/strategies/zettel/chunk-synthesizer.js';
52
- import { searchChunksAsync } from '../../corpus/scripts/chunk-library.js';
50
+ import { searchChunks, searchChunksAsync } from '../../corpus/scripts/chunk-library.js';
51
+ import { scoreRenderFidelity } from './render-fidelity.mjs';
53
52
 
54
53
  const REPO_ROOT = path.resolve(path.dirname(fileURLToPath(import.meta.url)), '../../../..');
55
54
  const HOLDOUT = path.join(REPO_ROOT, 'packages/a2ui/corpus/evals/holdout-compose-from-chunks.jsonl');
@@ -84,21 +83,14 @@ async function buildLLMAdapter() {
84
83
  console.error('--real-llm requires ANTHROPIC_API_KEY in env.');
85
84
  process.exit(2);
86
85
  }
87
- const { default: Anthropic } = await import('@anthropic-ai/sdk');
88
- const client = new Anthropic({ apiKey });
86
+ // Use the project's provider-agnostic LLM layer instead of importing
87
+ // @anthropic-ai/sdk directly (not installed in this package).
88
+ const { chat } = await import('../../../llm/adapters/index.js');
89
+ const model = 'claude-sonnet-4-20250514';
89
90
  return {
90
- async complete({ system, user, model = 'claude-sonnet-4-6', maxTokens = 2048 }) {
91
- const resp = await client.messages.create({
92
- model,
93
- max_tokens: maxTokens,
94
- system,
95
- messages: [{ role: 'user', content: user }],
96
- });
97
- const text = resp.content
98
- .filter((b) => b.type === 'text')
99
- .map((b) => b.text)
100
- .join('');
101
- return { text };
91
+ async complete({ messages, systemPrompt }) {
92
+ const result = await chat({ apiKey, model, messages, system: systemPrompt });
93
+ return { text: result.text };
102
94
  },
103
95
  };
104
96
  }
@@ -117,8 +109,12 @@ function scoreCoverage(result, expected) {
117
109
  const html = String(result.html || '');
118
110
  let hits = 0;
119
111
  for (const tag of expected) {
120
- const lowered = tag.toLowerCase().replace(/^ui/, '');
121
- const re = new RegExp(`<${lowered}-ui[\\s>]`, 'i');
112
+ // PascalCase → kebab-case (e.g. "AgentTrace" → "agent-trace")
113
+ const kebab = tag
114
+ .replace(/^ui/, '')
115
+ .replace(/([a-z])([A-Z])/g, '$1-$2')
116
+ .toLowerCase();
117
+ const re = new RegExp(`<${kebab}-ui[\\s>]`, 'i');
122
118
  if (re.test(html)) hits++;
123
119
  }
124
120
  return Math.round((hits / expected.length) * 100);
@@ -126,15 +122,21 @@ function scoreCoverage(result, expected) {
126
122
 
127
123
  async function scoreRetrieval(intent, expectedChunk) {
128
124
  if (!expectedChunk) return null;
129
- const hits = await searchChunksAsync(intent, { limit: 5 });
125
+ // Use sync keyword-only search for stable, deterministic retrieval scoring.
126
+ // Embeddings in searchChunksAsync are non-deterministic (small cosine boosts
127
+ // can flip rankings) and measure semantic similarity, not keyword
128
+ // discoverability. The eval tests whether a chunk is findable by the
129
+ // baseline keyword search that composeFromIntent falls back to when
130
+ // embeddings are unavailable.
131
+ const hits = searchChunks(intent, { limit: 5 });
130
132
  const found = hits.some((h) => h.name === expectedChunk);
131
133
  return found ? 100 : 0;
132
134
  }
133
135
 
134
- function scoreRenderFidelity(_result) {
135
- // DEFERRED Playwright headless render + console-error capture.
136
- // See spec § Out-of-band for the follow-up.
137
- return null;
136
+ async function scoreRender(result) {
137
+ if (!result.html) return null;
138
+ const rf = await scoreRenderFidelity(result.html);
139
+ return rf.score;
138
140
  }
139
141
 
140
142
  function compositeScore({ structural, coverage, retrieval, render }) {
@@ -166,7 +168,7 @@ async function evalIntent(intent, llmAdapter) {
166
168
  const structural = scoreStructural(result);
167
169
  const coverage = scoreCoverage(result, intent.expected_components);
168
170
  const retrieval = await scoreRetrieval(intent.intent, intent.expected_chunk);
169
- const render = scoreRenderFidelity(result);
171
+ const render = await scoreRender(result);
170
172
 
171
173
  const score = compositeScore({ structural, coverage, retrieval, render });
172
174
  return {
@@ -0,0 +1,238 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * render-fidelity.mjs — Playwright headless renderer for eval scoring.
4
+ *
5
+ * Scores an HTML fragment (chunk-composition output) on:
6
+ * - Console errors (JS exceptions, missing custom elements, 404s)
7
+ * - Structural completeness (non-blank viewport, key components visible)
8
+ *
9
+ * Returns { score: 0-100, errors: string[], blank: boolean, customElementCount?: number, undefinedCount?: number, screenshotPath?: string }
10
+ *
11
+ * Usage:
12
+ * import { scoreRenderFidelity } from './render-fidelity.mjs';
13
+ * const result = await scoreRenderFidelity('<card-ui>...</card-ui>');
14
+ */
15
+
16
+ import fs from 'node:fs';
17
+ import path from 'node:path';
18
+ import http from 'node:http';
19
+ import { fileURLToPath } from 'node:url';
20
+ import { chromium } from 'playwright';
21
+
22
// Repository root: four directory levels above the folder that holds this module.
const REPO_ROOT = path.resolve(
  path.dirname(fileURLToPath(import.meta.url)),
  '..',
  '..',
  '..',
  '..'
);
26
+
27
/**
 * Build a complete HTML document around a raw markup fragment.
 *
 * The document links the web-components stylesheets (tokens, components,
 * resets) and the component registration module, all addressed by
 * root-relative URLs under `/packages/web-components`, and places the
 * fragment inside `<body>` unmodified.
 *
 * @param {string} rawHtml - markup fragment to embed verbatim in the body
 * @returns {string} full HTML document source
 */
function wrapForRender(rawHtml) {
  const wcRoot = '/packages/web-components';
  const documentLines = [
    '<!doctype html>',
    '<html lang="en">',
    '<head>',
    '<meta charset="UTF-8" />',
    '<meta name="viewport" content="width=device-width, initial-scale=1.0" />',
    '<title>Render fidelity probe</title>',
    `<link rel="stylesheet" href="${wcRoot}/styles/tokens.css" />`,
    `<link rel="stylesheet" href="${wcRoot}/styles/components.css" />`,
    `<link rel="stylesheet" href="${wcRoot}/styles/resets.css" />`,
    `<script type="module" src="${wcRoot}/index.js"></script>`,
    '<style>',
    'body { padding: 24px; }',
    '/* Prevent layout-shift during hydration */',
    ':not(:defined) { visibility: hidden; }',
    '</style>',
    '</head>',
    '<body>',
    `${rawHtml}`,
    '</body>',
    '</html>',
  ];
  return documentLines.join('\n');
}
53
+
54
/**
 * Start a minimal static file server rooted at REPO_ROOT, bound to 127.0.0.1.
 *
 * Request handling:
 *   - the query string / fragment is ignored when resolving the file;
 *   - malformed percent-escapes or NUL bytes in the path answer 400 instead
 *     of throwing inside the request listener;
 *   - any path that resolves outside REPO_ROOT answers 403;
 *   - unreadable or missing files answer 404.
 *
 * @param {number} [port=0] - port to listen on; 0 lets the OS pick a free one
 * @returns {Promise<{url: string, stop: () => Promise<void>}>} base URL of the
 *   server and a function that shuts it down
 */
async function startDevServer(port = 0) {
  const mimeTypes = {
    '.html': 'text/html',
    '.css': 'text/css',
    '.js': 'text/javascript',
    '.mjs': 'text/javascript',
    '.json': 'application/json',
    '.svg': 'image/svg+xml',
    '.png': 'image/png',
  };

  const server = http.createServer((req, res) => {
    const reply = (status, body) => {
      res.writeHead(status);
      res.end(body);
    };

    let decoded;
    try {
      // Only the path part identifies a file; decodeURIComponent throws
      // URIError on malformed escapes such as "%E0%A4%A".
      decoded = decodeURIComponent(String(req.url).split(/[?#]/, 1)[0]);
    } catch {
      reply(400, 'Bad request');
      return;
    }
    // fs APIs throw synchronously on NUL bytes; reject them up front.
    if (decoded.includes('\0')) {
      reply(400, 'Bad request');
      return;
    }

    // Serve from repo root; strip leading /
    const filePath = path.join(REPO_ROOT, decoded.replace(/^\//, ''));
    const ext = path.extname(filePath).toLowerCase();

    // Security: prevent path traversal. A raw string-prefix test would accept
    // sibling directories such as "<root>-evil", so compare via the relative path.
    const rel = path.relative(REPO_ROOT, filePath);
    if (rel === '..' || rel.startsWith(`..${path.sep}`) || path.isAbsolute(rel)) {
      reply(403, 'Forbidden');
      return;
    }

    fs.readFile(filePath, (err, data) => {
      if (err) {
        reply(404, 'Not found');
        return;
      }
      res.writeHead(200, { 'Content-Type': mimeTypes[ext] || 'application/octet-stream' });
      res.end(data);
    });
  });

  return new Promise((resolve, reject) => {
    server.once('error', reject);
    server.listen(port, '127.0.0.1', () => {
      const addr = server.address();
      const url = `http://127.0.0.1:${addr.port}`;
      resolve({
        url,
        stop: () =>
          new Promise((done) => {
            server.close(done);
            // Drop lingering keep-alive sockets so close() can finish promptly
            // (method is absent on older Node versions, hence the optional call).
            server.closeAllConnections?.();
          }),
      });
    });
  });
}
101
+
102
/**
 * Score render fidelity for an HTML fragment.
 *
 * The fragment is wrapped in a full page, loaded in headless Chromium from
 * the local dev server's origin (so the page's root-relative stylesheet and
 * module URLs resolve against the repository), and scored from 100 downward:
 *   - 10 per console/page/HTTP error, capped at 40;
 *   - 40 if the body box is effectively empty;
 *   - 5 per element still matching `:not(:defined)`, capped at 20;
 *   - 15 if the fragment contains hyphenated tags but none has a visible box.
 *
 * Never rejects: failures are reported as `score: 0` with the reason in
 * `errors`. The browser, page and dev server are always torn down.
 *
 * @param {string} html — raw HTML fragment (will be wrapped in full page)
 * @param {object} [opts]
 * @param {boolean} [opts.captureScreenshot] — save a full-page screenshot to /tmp/
 * @returns {Promise<{score:number, errors:string[], blank:boolean,
 *   customElementCount?:number, undefinedCount?:number, screenshotPath?:string}>}
 *   the count fields and `screenshotPath` are only present when the probe ran
 *   to completion
 */
export async function scoreRenderFidelity(html, opts = {}) {
  if (!html || typeof html !== 'string') {
    return { score: 0, errors: ['No HTML emitted'], blank: true };
  }

  const pageHtml = wrapForRender(html);
  let server;
  let browser;
  let page;

  try {
    server = await startDevServer();
  } catch (err) {
    return { score: 0, errors: [`Dev server failed: ${err.message}`], blank: true };
  }

  const consoleErrors = [];
  const pageErrors = [];

  try {
    browser = await chromium.launch({ headless: true });
    page = await browser.newPage({ viewport: { width: 1280, height: 960 } });

    // Capture console errors (JS exceptions, custom element failures, 404s)
    page.on('console', (msg) => {
      const type = msg.type();
      const text = msg.text();
      const lowered = text.toLowerCase();
      if (type === 'error' || lowered.includes('error') || lowered.includes('404')) {
        consoleErrors.push(`[${type}] ${text}`);
      }
    });

    page.on('pageerror', (err) => {
      pageErrors.push(`[pageerror] ${err.message}`);
    });

    page.on('response', (resp) => {
      if (resp.status() >= 400) {
        consoleErrors.push(`[${resp.status()}] ${resp.url()}`);
      }
    });

    // Navigate to the probe page. The document itself is answered from memory
    // via request interception, but it is addressed under the dev server's
    // origin so that its root-relative asset URLs are fetched from that server.
    // (Writing the markup into the initial about:blank page leaves those URLs
    // with no HTTP origin to resolve against.)
    const probeUrl = `${server.url}/eval-probe.html`;
    await page.route(probeUrl, (route) =>
      route.fulfill({
        status: 200,
        contentType: 'text/html; charset=utf-8',
        body: pageHtml,
      })
    );
    await page.goto(probeUrl, { waitUntil: 'load' });

    // Wait a tick for custom-element registration + hydration
    await page.waitForTimeout(800);

    // Check for blank viewport
    const bodyBox = await page.locator('body').boundingBox();
    const blank = !bodyBox || (bodyBox.width < 10 && bodyBox.height < 10);

    // Count visible custom elements (proxy for successful registration)
    const customElementCount = await page.evaluate(() =>
      Array.from(document.querySelectorAll('*')).filter(
        (el) => el.tagName.includes('-') && el.getBoundingClientRect().width > 0
      ).length
    );

    // Count :not(:defined) elements (unregistered custom elements)
    const undefinedCount = await page.evaluate(() =>
      document.querySelectorAll(':not(:defined)').length
    );

    // Screenshot capture (optional); `?.` tolerates an explicit null opts.
    let screenshotPath;
    if (opts?.captureScreenshot) {
      screenshotPath = `/tmp/render-fidelity-${Date.now()}.png`;
      await page.screenshot({ path: screenshotPath, fullPage: true });
    }

    // ── Scoring ──
    // Base: 100, then deduct
    let score = 100;
    const allErrors = [...consoleErrors, ...pageErrors];

    // Deduct for errors (capped at -40)
    const errorPenalty = Math.min(allErrors.length * 10, 40);
    score -= errorPenalty;

    // Deduct for blank page (-40)
    if (blank) score -= 40;

    // Deduct for unregistered custom elements (-5 each, cap -20)
    const undefinedPenalty = Math.min(undefinedCount * 5, 20);
    score -= undefinedPenalty;

    // Deduct for zero visible custom elements when some were expected (-15)
    // (Heuristic: if HTML contains custom-element-like tags but none render)
    const hasCustomTags = /<[a-z]+-[a-z-]+[\s>]/i.test(html);
    if (hasCustomTags && customElementCount === 0 && !blank) {
      score -= 15;
    }

    score = Math.max(0, Math.min(100, score));

    return {
      score,
      errors: allErrors,
      blank,
      customElementCount,
      undefinedCount,
      screenshotPath,
    };
  } catch (err) {
    return {
      score: 0,
      errors: [`Render probe crashed: ${err.message}`, ...consoleErrors, ...pageErrors],
      blank: true,
    };
  } finally {
    // Best-effort teardown: a failure to close must not mask the result.
    if (page) await page.close().catch(() => {});
    if (browser) await browser.close().catch(() => {});
    if (server) await server.stop().catch(() => {});
  }
}
232
+
233
// ── CLI smoke ──
// Run the probe only when this module is the script Node was started with.
// Filesystem paths are compared rather than a hand-built `file://` string,
// which breaks on Windows drive letters and on paths that need
// percent-encoding (spaces, '#', non-ASCII characters).
if (process.argv[1] && fileURLToPath(import.meta.url) === path.resolve(process.argv[1])) {
  const testHtml = process.argv[2] || '<card-ui><h2>Hello</h2></card-ui>';
  const result = await scoreRenderFidelity(testHtml, { captureScreenshot: true });
  console.log(JSON.stringify(result, null, 2));
}
@@ -8,7 +8,7 @@ const t = (label, ok, detail = '') => {
8
8
  };
9
9
 
10
10
  // Baseline
11
- t('four built-ins registered', listEngines().length === 4);
11
+ t('five built-ins registered', listEngines().length === 5);
12
12
 
13
13
  // Happy path
14
14
  let customCalled = null;
@@ -255,7 +255,7 @@ if (patterns.length >= 100) {
255
255
  try {
256
256
  const { getChunkIndex } = await import('../../corpus/scripts/chunk-library.js');
257
257
  const idx = getChunkIndex();
258
- if (idx && idx.unique_names >= 500 && idx.by_kind.block && idx.by_kind.page) {
258
+ if (idx && idx.unique_names >= 100 && idx.by_kind.block && idx.by_kind.page) {
259
259
  ok('Chunk corpus', `${idx.unique_names} chunks (${idx.total_instances} instances; block=${idx.by_kind.block}, panel=${idx.by_kind.panel || 0}, page=${idx.by_kind.page})`);
260
260
  } else {
261
261
  bad('Chunk corpus', `unexpected index: ${JSON.stringify(idx)}`);
@@ -0,0 +1,123 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * zettel-baseline.mjs — Run the classic zettel engine through the same
4
+ * 20-intent hold-out set used for chunk-zettel eval.
5
+ *
6
+ * Zettel returns A2UI messages (updateComponents), not HTML, so scoring
7
+ * is adapted to the message format. Render fidelity is skipped (different
8
+ * output shape) and its weight redistributed.
9
+ */
10
+
11
+ import fs from 'node:fs';
12
+ import path from 'node:path';
13
+ import { fileURLToPath } from 'node:url';
14
+
15
// Repository root: four directory levels above the folder that holds this script.
const REPO_ROOT = path.resolve(
  path.dirname(fileURLToPath(import.meta.url)),
  '..',
  '..',
  '..',
  '..'
);
// Hold-out intents file (JSON Lines) shared with the chunk-zettel eval.
const HOLDOUT = path.join(REPO_ROOT, 'packages/a2ui/corpus/evals/holdout-compose-from-chunks.jsonl');

// Command-line arguments after the node binary and the script path.
const [, , ...args] = process.argv;
// True when the run should use a real LLM adapter.
const FLAG_REAL_LLM = args.some((arg) => arg === '--real-llm');
20
+
21
/**
 * Read the hold-out file (JSON Lines) and parse one intent per non-blank line.
 *
 * @returns {object[]} parsed intents, in file order
 * @throws {Error} if the file cannot be read, or a line is not valid JSON
 *   (the message names the 1-based line number; the original SyntaxError is
 *   attached as `cause`)
 */
function loadHoldOut() {
  const raw = fs.readFileSync(HOLDOUT, 'utf8');
  const intents = [];
  raw.split('\n').forEach((rawLine, idx) => {
    const line = rawLine.trim();
    if (!line) return;
    try {
      // Call JSON.parse explicitly: handing it to map() directly would pass
      // the array index as its reviver argument.
      intents.push(JSON.parse(line));
    } catch (err) {
      throw new Error(`Invalid JSON on line ${idx + 1} of ${HOLDOUT}: ${err.message}`, { cause: err });
    }
  });
  return intents;
}
25
+
26
/**
 * Build the LLM adapter handed to the zettel generator.
 *
 * Without `--real-llm` no adapter is built. With it, ANTHROPIC_API_KEY must be
 * set; otherwise the script reports the problem and exits with status 2 rather
 * than failing later on the first chat call with an undefined key.
 *
 * @returns {Promise<null | {complete: (req: {messages: any, systemPrompt: any}) => Promise<{text: any}>}>}
 *   null when `--real-llm` is absent, else an adapter whose `complete`
 *   forwards to the project's `chat` function and returns its `text`
 */
async function buildLLMAdapter() {
  if (!FLAG_REAL_LLM) return null;
  const apiKey = process.env.ANTHROPIC_API_KEY;
  if (!apiKey) {
    console.error('--real-llm requires ANTHROPIC_API_KEY in env.');
    process.exit(2);
  }
  const { chat } = await import('../../../llm/adapters/index.js');
  const model = 'claude-sonnet-4-20250514';
  return {
    async complete({ messages, systemPrompt }) {
      const result = await chat({ apiKey, model, messages, system: systemPrompt });
      return { text: result.text };
    },
  };
}
37
+
38
+ // ── Scoring ──
39
+
40
/**
 * Structural score: 100 when the result carries a non-empty `messages`
 * value, otherwise 0.
 *
 * @param {{messages?: any[]}} result - generator output
 * @returns {number} 100 or 0
 */
function scoreStructural(result) {
  const { messages } = result;
  if (!messages) return 0;
  return messages.length > 0 ? 100 : 0;
}
43
+
44
/**
 * Coverage score: percentage of expected component names that appear among
 * the component types of the first message.
 *
 * Both sides are normalized the same way (leading "ui" dropped, PascalCase →
 * kebab-case, lowercased) so that an emitted type such as "AgentTrace"
 * matches the expected name "AgentTrace" — lowercasing the emitted type alone
 * would yield "agenttrace", which never equals "agent-trace".
 *
 * @param {{messages?: Array<{components?: Array<{component?: string}>}>}} result
 *   generator output; only `messages[0].components` is inspected
 * @param {string[]} [expected] - expected component names
 * @returns {number} integer 0–100; 100 when nothing is expected
 */
function scoreCoverage(result, expected) {
  if (!expected || expected.length === 0) return 100;

  const toKebab = (name) =>
    String(name)
      .replace(/^ui/, '')
      .replace(/([a-z])([A-Z])/g, '$1-$2')
      .toLowerCase();

  const comps = result.messages?.[0]?.components || [];
  const compTypes = new Set();
  for (const comp of comps) {
    // Entries without a string `component` cannot match anything; skip them.
    if (typeof comp?.component === 'string') compTypes.add(toKebab(comp.component));
  }

  let hits = 0;
  for (const tag of expected) {
    if (compTypes.has(toKebab(tag))) hits++;
  }
  return Math.round((hits / expected.length) * 100);
}
58
+
59
/**
 * Retrieval score: 100 when the result's `strategy` is exactly
 * 'composition-match', otherwise 0.
 *
 * @param {{strategy?: string}} result - generator output
 * @returns {number} 100 or 0
 */
function scoreRetrieval(result) {
  if (result.strategy === 'composition-match') {
    return 100;
  }
  return 0;
}
62
+
63
/**
 * Weighted average of the sub-scores (structural 30, coverage 20,
 * retrieval 20), rounded to an integer. A sub-score that is exactly `null`
 * is left out and the remaining weights are renormalized.
 *
 * @param {{structural: number|null, coverage: number|null, retrieval: number|null}} scores
 * @returns {number} rounded weighted mean, or 0 when every sub-score is null
 */
function compositeScore({ structural, coverage, retrieval }) {
  const weightedParts = [
    [structural, 30],
    [coverage, 20],
    [retrieval, 20],
  ];

  let totalWeight = 0;
  let weighted = 0;
  for (const [value, weight] of weightedParts) {
    if (value === null) continue;
    totalWeight += weight;
    weighted += value * weight;
  }

  if (totalWeight > 0) {
    return Math.round(weighted / totalWeight);
  }
  return 0;
}
74
+
75
+ // ── Main ──
76
+
77
/**
 * Run every hold-out intent through the zettel generator in 'instant' mode,
 * score each result, and print a Markdown summary table to stdout. Progress
 * lines go to stderr.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const { generateZettel } = await import('../../compose/strategies/zettel/generator-adapter.js');
  const intents = loadHoldOut();
  const llmAdapter = await buildLLMAdapter();

  const results = [];
  for (const intent of intents) {
    process.stderr.write(`▶ ${intent.id} ${intent.kind.padEnd(7)} ${intent.intent.slice(0, 50)}...\n`);
    const t0 = performance.now();
    const result = await generateZettel({ intent: intent.intent, mode: 'instant', llmAdapter });
    const elapsedMs = Math.round(performance.now() - t0);

    const structural = scoreStructural(result);
    const coverage = scoreCoverage(result, intent.expected_components);
    const retrieval = scoreRetrieval(result);
    const score = compositeScore({ structural, coverage, retrieval });

    results.push({
      id: intent.id,
      kind: intent.kind,
      strategy: result.strategy,
      structural,
      coverage,
      retrieval,
      score,
      elapsedMs,
    });
  }

  // An empty hold-out file would make the average 0 / 0 (printed as NaN);
  // report 0 in that case.
  const scoreSum = results.reduce((s, r) => s + r.score, 0);
  const avg = results.length > 0 ? Math.round(scoreSum / results.length) : 0;
  const passing = results.filter((r) => r.score >= 80).length;

  console.log(`# zettel baseline — ${FLAG_REAL_LLM ? 'real LLM' : 'stub (retrieval only)'}`);
  console.log('');
  console.log(`Aggregate: avg **${avg}**, passing **${passing} / ${results.length}** (threshold 80).`);
  console.log('');
  console.log('| ID | Kind | Strategy | Struct | Cov | Retr | Score | ms |');
  console.log('|---|---|---|---:|---:|---:|---:|---:|');
  for (const r of results) {
    console.log(`| ${r.id} | ${r.kind} | ${r.strategy} | ${r.structural} | ${r.coverage} | ${r.retrieval} | **${r.score}** | ${r.elapsedMs} |`);
  }
}

// Any failure (unreadable hold-out file, generator error, …) is reported on
// stderr and mapped to exit status 2.
main().catch((err) => {
  console.error('zettel-baseline failed:', err.message);
  process.exit(2);
});