@adia-ai/a2ui-mcp 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -9,6 +9,44 @@ zettel strategies.
9
9
 
10
10
  ## [Unreleased]
11
11
 
12
+ _No pending changes._
13
+
14
+ ## [0.1.3] - 2026-05-02
15
+
16
+ Additive — `compose_from_chunks` eval runner + hold-out set. No
17
+ BREAKING changes.
18
+
19
+ ### Added
20
+
21
+ - **`compose_from_chunks` eval runner + hold-out set**. Closes the
22
+ spec at
23
+ [`docs/specs/compose-from-chunks-eval.md`](../../../docs/specs/compose-from-chunks-eval.md):
24
+ - `evals/compose-from-chunks-holdout.jsonl` — 20 intents (10
25
+ single-turn compose + 10 multi-turn refine) across 10
26
+ categories (data-display, forms, layout, data, data-viz,
27
+ agent, overlay, chat, settings, display).
28
+ - `scripts/eval-compose-from-chunks.mjs` — runner with `--stub`
29
+ (default; retrieval-only, no API calls) and `--real-llm`
30
+ (wires Anthropic SDK) modes. Composite scoring (structural
31
+ 30 + coverage 20 + retrieval 20 + render 30, with null-
32
+ pro-rata redistribution when render is deferred). `--json`
33
+ + `--report-file` outputs. Threshold 80; exits 1 if avg < 80.
34
+ - `npm run eval:compose-from-chunks` exposes the runner.
35
+ - Stub baseline: 9 of 20 intents retrievable directly (avg
36
+ ~50-70 each); 11 need synthesis (correctly warn without
37
+ `--real-llm`). Aggregate ~26 / 100 stub. Real-LLM mode is
38
+ the gating signal for the `chunk-zettel` engine promotion.
39
+
40
+ Render-fidelity component is DEFERRED — needs Playwright
41
+ headless render + console-error capture wiring; current
42
+ composite drops the render weight pro-rata across
43
+ structural/coverage/retrieval until that lands.
44
+
45
+ ### Changed
46
+
47
+ - `package.json` `files:` array now includes `evals/` so the hold-out
48
+ set ships in the published tarball.
49
+
12
50
  ---
13
51
 
14
52
  ## [0.1.2] - 2026-05-01
@@ -0,0 +1,20 @@
1
+ {"id":"intent-001","kind":"compose","category":"data-display","intent":"kpi grid with 4 stat cards: users, revenue, sessions, churn","expected_components":["Card","Stat","Grid"],"expected_chunk":"kpi-grid-4-card"}
2
+ {"id":"intent-002","kind":"compose","category":"forms","intent":"sign-in form with email + password + 'forgot password' link","expected_components":["Card","Input","Button","Field"],"expected_chunk":"auth-sign-in"}
3
+ {"id":"intent-003","kind":"compose","category":"layout","intent":"settings page with three tabs (general, integrations, billing)","expected_components":["Tabs","Tab","Card","Section"],"expected_chunk":"settings-tabs-3"}
4
+ {"id":"intent-004","kind":"compose","category":"data","intent":"data table of users with role badge + last-active timestamp","expected_components":["Table","Badge"],"expected_chunk":"users-table"}
5
+ {"id":"intent-005","kind":"compose","category":"data-viz","intent":"conversion funnel chart over 6 stages, with drop-off labels","expected_components":["Chart","Card","ChartLegend"],"expected_chunk":"conversion-funnel"}
6
+ {"id":"intent-006","kind":"compose","category":"agent","intent":"agent activity feed with reasoning steps + final artifact","expected_components":["AgentTrace","AgentReasoning","AgentArtifact"],"expected_chunk":"agent-activity-feed"}
7
+ {"id":"intent-007","kind":"compose","category":"layout","intent":"split-pane editor: code on the left, preview on the right","expected_components":["EditorShell","Pane","Code"],"expected_chunk":"editor-split"}
8
+ {"id":"intent-008","kind":"compose","category":"overlay","intent":"command palette modal with grouped results (recent, suggestions)","expected_components":["Command","Modal"],"expected_chunk":"command-grouped"}
9
+ {"id":"intent-009","kind":"compose","category":"forms","intent":"registration step 2 of 5 — profile setup with 4 fields","expected_components":["Card","StepProgress","Field","Input"],"expected_chunk":"reg-step-shell"}
10
+ {"id":"intent-010","kind":"compose","category":"layout","intent":"404 error page with breadcrumb + back-to-home link","expected_components":["Card","Breadcrumb","Button"],"expected_chunk":"error-404"}
11
+ {"id":"intent-011","kind":"refine","category":"data-display","intent":"dashboard for project metrics","refine":"add a date-range filter at the top","expected_components":["Card","Stat","Select"],"expected_chunk":"project-dashboard"}
12
+ {"id":"intent-012","kind":"refine","category":"display","intent":"user profile card","refine":"make the email editable inline","expected_components":["Card","Avatar","Input"],"expected_chunk":"user-profile-card"}
13
+ {"id":"intent-013","kind":"refine","category":"data","intent":"kanban board with 3 columns","refine":"add a count badge to each column header","expected_components":["Card","Badge","Header"],"expected_chunk":"kanban-3col"}
14
+ {"id":"intent-014","kind":"refine","category":"chat","intent":"chat surface with streaming reply","refine":"add a stop button while streaming","expected_components":["ChatShell","Button","ChatInput"],"expected_chunk":"chat-streaming"}
15
+ {"id":"intent-015","kind":"refine","category":"forms","intent":"sign-up form with email + password","refine":"add password strength meter","expected_components":["Card","Input","Progress"],"expected_chunk":"auth-sign-up"}
16
+ {"id":"intent-016","kind":"refine","category":"settings","intent":"settings tab for notifications","refine":"split email + push into separate sections","expected_components":["Card","Section","Switch"],"expected_chunk":"settings-notifications"}
17
+ {"id":"intent-017","kind":"refine","category":"data","intent":"table of orders","refine":"add a bulk-action toolbar above the table","expected_components":["Table","TableToolbar","Button"],"expected_chunk":"orders-table"}
18
+ {"id":"intent-018","kind":"refine","category":"agent","intent":"agent reasoning panel","refine":"collapse intermediate steps by default, expandable","expected_components":["AgentReasoning","Accordion"],"expected_chunk":"agent-reasoning-collapsed"}
19
+ {"id":"intent-019","kind":"refine","category":"overlay","intent":"modal confirming destructive action","refine":"require typing the resource name to confirm","expected_components":["Modal","Input","Button"],"expected_chunk":"destructive-confirm"}
20
+ {"id":"intent-020","kind":"refine","category":"display","intent":"marketing landing hero","refine":"add a secondary 'see demo' CTA","expected_components":["Card","Heading","Button"],"expected_chunk":"marketing-hero"}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@adia-ai/a2ui-mcp",
3
- "version": "0.1.2",
3
+ "version": "0.1.3",
4
4
  "description": "AdiaUI A2UI MCP server. Exposes the compose engine over MCP with an engine selector for monolithic + zettel strategies.",
5
5
  "type": "module",
6
6
  "bin": {
@@ -10,6 +10,7 @@
10
10
  "server.js",
11
11
  "tools/",
12
12
  "scripts/",
13
+ "evals/",
13
14
  "personas/",
14
15
  "README.md",
15
16
  "CHANGELOG.md"
@@ -0,0 +1,264 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * eval-compose-from-chunks.mjs — Hold-out eval for the chunk-aware
4
+ * synthesizer. Per `docs/specs/compose-from-chunks-eval.md`.
5
+ *
6
+ * Reads `packages/a2ui/mcp/evals/compose-from-chunks-holdout.jsonl`,
7
+ * runs each intent through `composeFromIntent`, and emits a per-intent
8
+ * + aggregate report.
9
+ *
10
+ * Two modes:
11
+ * --stub (default) — retrieval-only path, no LLM calls. Fast;
12
+ * produces a baseline that exercises the chunk-corpus +
13
+ * retrieval scoring without spending API budget. Use to
14
+ * verify the runner shape + the corpus surface.
15
+ * --real-llm — wires the LLM bridge for synthesis-tier composition.
16
+ * Costs ~$2 per full eval at Claude Sonnet 4.6 pricing.
17
+ * Requires `ANTHROPIC_API_KEY` in env.
18
+ *
19
+ * Scoring (composite, 0-100):
20
+ * - Structural (30%) — composition emitted (html non-null OR plan
21
+ * non-null).
22
+ * - Coverage (20%) — share of the intent's `expected_components`
23
+ * that appear in the emitted markup (proportional, 0-100).
24
+ * - Retrieval (20%) — top-k retrieved chunks include the intent's
25
+ * `expected_chunk` (when set). Soft-asserted; absence flags but
26
+ * doesn't fail.
27
+ * - Render fidelity (30%) — DEFERRED. Real implementation needs
28
+ * Playwright headless render + console-error capture; this runner
29
+ * emits a placeholder pending the render-fidelity smoke wiring.
30
+ * The composite re-distributes its weight pro-rata across the
31
+ * three remaining components when render is null.
32
+ *
33
+ * Exit:
34
+ * 0 if avg ≥ 80 (passes the chunk-zettel promotion gate threshold)
35
+ * 1 if avg < 80
36
+ *
37
+ * Usage:
38
+ * npm run eval:compose-from-chunks # stub mode
39
+ * npm run eval:compose-from-chunks -- --real-llm # real-LLM
40
+ * npm run eval:compose-from-chunks -- --limit 5 # first 5 intents
41
+ * npm run eval:compose-from-chunks -- --json # JSON report
42
+ * npm run eval:compose-from-chunks -- --report-file # write to docs/reports/
43
+ */
44
+
45
+ import '../../../../scripts/load-env.mjs';
46
+
47
+ import fs from 'node:fs';
48
+ import path from 'node:path';
49
+ import { fileURLToPath } from 'node:url';
50
+
51
+ import { composeFromIntent } from '../../compose/engines/zettel/chunk-synthesizer.js';
52
+ import { searchChunksAsync } from '../../corpus/scripts/chunk-library.js';
53
+
54
// Repository root: four directory levels above the folder holding this script.
const SCRIPT_DIR = path.dirname(fileURLToPath(import.meta.url));
const REPO_ROOT = path.resolve(SCRIPT_DIR, '../../../..');
// Hold-out intents, one JSON object per line.
const HOLDOUT = path.join(REPO_ROOT, 'packages/a2ui/mcp/evals/compose-from-chunks-holdout.jsonl');
// Minimum average composite score for a zero exit code.
const PASS_THRESHOLD = 80;

// Command-line tokens after `node <script>`.
const args = process.argv.slice(2);
const hasFlag = (flag) => args.includes(flag);
const FLAG_REAL_LLM = hasFlag('--real-llm');
const FLAG_JSON = hasFlag('--json');
const FLAG_REPORT = hasFlag('--report-file');
// `--limit N`: N is the token right after the flag; null when the flag is absent.
const limitIdx = args.indexOf('--limit');
const LIMIT = limitIdx < 0 ? null : parseInt(args[limitIdx + 1], 10);
64
+
65
+ // ─────────────────────────────────────────────────────────────────
66
+ // Hold-out loader
67
+ // ─────────────────────────────────────────────────────────────────
68
+
69
/**
 * Load the hold-out intents from the JSONL file at `HOLDOUT`.
 *
 * Each non-blank line is parsed as one JSON value. Blank lines (and
 * trailing whitespace such as `\r`) are ignored.
 *
 * @returns {Array<object>} Parsed intents, truncated to the first `LIMIT`
 *   entries when `LIMIT` is a truthy number.
 * @throws {Error} When the file cannot be read, or when a line is not valid
 *   JSON. The message names the file and the 1-based line number; the
 *   original SyntaxError is attached as `cause`.
 */
function loadHoldOut() {
  const raw = fs.readFileSync(HOLDOUT, 'utf8');
  const intents = [];
  const rows = raw.split('\n');
  for (let i = 0; i < rows.length; i++) {
    const row = rows[i].trim();
    if (!row) continue; // tolerate blank lines and a trailing newline
    try {
      intents.push(JSON.parse(row));
    } catch (err) {
      // A bare SyntaxError does not say which line of the file is broken.
      throw new Error(`${HOLDOUT}:${i + 1}: invalid JSON (${err.message})`, { cause: err });
    }
  }
  return LIMIT ? intents.slice(0, LIMIT) : intents;
}
75
+
76
+ // ─────────────────────────────────────────────────────────────────
77
+ // LLM adapter
78
+ // ─────────────────────────────────────────────────────────────────
79
+
80
/**
 * Build the adapter that is handed to `composeFromIntent` as `llmAdapter`.
 *
 * Without `--real-llm` this resolves to `null`. With it, the Anthropic SDK
 * is imported lazily and wrapped in an object exposing a single
 * `complete({ system, user, model, maxTokens })` method that resolves to
 * `{ text }`, the concatenation of all `text` blocks in the response.
 *
 * Exits the process with code 2 when `--real-llm` is set but
 * `ANTHROPIC_API_KEY` is missing from the environment.
 *
 * @returns {Promise<null | { complete: Function }>}
 */
async function buildLLMAdapter() {
  if (!FLAG_REAL_LLM) {
    return null;
  }

  const { ANTHROPIC_API_KEY: apiKey } = process.env;
  if (!apiKey) {
    console.error('--real-llm requires ANTHROPIC_API_KEY in env.');
    process.exit(2);
  }

  // Imported on demand so stub runs never load the SDK.
  const sdk = await import('@anthropic-ai/sdk');
  const Anthropic = sdk.default;
  const client = new Anthropic({ apiKey });

  const complete = async ({ system, user, model = 'claude-sonnet-4-6', maxTokens = 2048 }) => {
    const request = {
      model,
      max_tokens: maxTokens,
      system,
      messages: [{ role: 'user', content: user }],
    };
    const resp = await client.messages.create(request);
    // Keep only text blocks; other block types are dropped.
    const textBlocks = resp.content.filter((block) => block.type === 'text');
    const text = textBlocks.map((block) => block.text).join('');
    return { text };
  };

  return { complete };
}
105
+
106
+ // ─────────────────────────────────────────────────────────────────
107
+ // Scoring
108
+ // ─────────────────────────────────────────────────────────────────
109
+
110
/**
 * Structural score: did the composer emit anything at all?
 *
 * @param {{ html?: unknown, plan?: unknown }} result Composer output.
 * @returns {number} 100 when `result.html` or `result.plan` is truthy, else 0.
 */
function scoreStructural(result) {
  const emitted = Boolean(result.html) || Boolean(result.plan);
  return emitted ? 100 : 0;
}
114
+
115
/**
 * Coverage score: the share of expected components found in the markup.
 *
 * A component name is lower-cased, a leading `ui` is stripped, and the
 * markup is searched (case-insensitively) for an opening tag
 * `<name-ui` followed by whitespace or `>`.
 *
 * @param {{ html?: unknown }} result Composer output; a falsy `html` is
 *   treated as empty markup.
 * @param {string[] | null | undefined} expected Expected component names.
 * @returns {number} Rounded percentage 0-100; 100 when nothing is expected.
 */
function scoreCoverage(result, expected) {
  if (!expected || expected.length === 0) return 100;
  const html = String(result.html || '');
  const hits = expected.filter((tag) => {
    const stem = tag.toLowerCase().replace(/^ui/, '');
    // Escape regex metacharacters so a name is always matched literally:
    // an unescaped `+` or `(` would throw, and `.` would match any character.
    const literal = stem.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    return new RegExp(`<${literal}-ui[\\s>]`, 'i').test(html);
  }).length;
  return Math.round((hits / expected.length) * 100);
}
126
+
127
/**
 * Retrieval score: is the expected chunk among the top five search hits?
 *
 * @param {string} intent Intent text passed to `searchChunksAsync`.
 * @param {string | null | undefined} expectedChunk Chunk name to look for.
 * @returns {Promise<number | null>} `null` when no chunk is expected,
 *   100 when a hit's `name` equals `expectedChunk`, otherwise 0.
 */
async function scoreRetrieval(intent, expectedChunk) {
  if (expectedChunk) {
    const topHits = await searchChunksAsync(intent, { limit: 5 });
    const names = topHits.map((hit) => hit.name);
    return names.includes(expectedChunk) ? 100 : 0;
  }
  return null;
}
133
+
134
/**
 * Render-fidelity score — not implemented yet.
 *
 * Always returns `null`, which the composite treats as "component absent".
 * The real check is described in this file's header as a Playwright
 * headless render with console-error capture.
 *
 * @param {object} _result Composer output (currently unused).
 * @returns {null}
 */
function scoreRenderFidelity(_result) {
  // DEFERRED — see spec § Out-of-band for the follow-up.
  return null;
}
139
+
140
/**
 * Combine the per-component scores into one 0-100 composite.
 *
 * Weights: structural 30, coverage 20, retrieval 20, render 30. A component
 * whose value is `null` or `undefined` is left out, and the remaining
 * weights are renormalised so they share its weight pro-rata.
 *
 * @param {object} scores
 * @param {number | null} [scores.structural]
 * @param {number | null} [scores.coverage]
 * @param {number | null} [scores.retrieval]
 * @param {number | null} [scores.render]
 * @returns {number} Rounded weighted average, or 0 when every component is absent.
 */
function compositeScore({ structural, coverage, retrieval, render }) {
  const components = [
    { name: 'structural', value: structural, weight: 30 },
    { name: 'coverage', value: coverage, weight: 20 },
    { name: 'retrieval', value: retrieval, weight: 20 },
    { name: 'render', value: render, weight: 30 },
    // `!= null` drops undefined as well as null; an undefined value would
    // otherwise turn the whole composite into NaN.
  ].filter((c) => c.value != null);

  const totalWeight = components.reduce((sum, c) => sum + c.weight, 0);
  const weighted = components.reduce((sum, c) => sum + c.value * c.weight, 0);
  return totalWeight > 0 ? Math.round(weighted / totalWeight) : 0;
}
152
+
153
+ // ─────────────────────────────────────────────────────────────────
154
+ // Eval loop
155
+ // ─────────────────────────────────────────────────────────────────
156
+
157
/**
 * Run one hold-out intent through the composer and score the outcome.
 *
 * Only the `composeFromIntent` call is timed; the scoring steps, including
 * the separate retrieval lookup, are outside the measured window.
 *
 * @param {object} intent Hold-out row; reads `id`, `kind`, `intent`,
 *   `expected_components` and `expected_chunk`.
 * @param {object | null} llmAdapter Adapter forwarded to `composeFromIntent`.
 * @returns {Promise<object>} Row with `id`, `kind`, `intent`, `source`,
 *   `elapsedMs`, the four component scores, the composite `score`, and
 *   `warnings` (an empty array when the composer reported none).
 */
async function evalIntent(intent, llmAdapter) {
  const startedAt = performance.now();
  const result = await composeFromIntent({ intent: intent.intent, llmAdapter, maxAttempts: 2 });
  const elapsedMs = Math.round(performance.now() - startedAt);

  const scores = {
    structural: scoreStructural(result),
    coverage: scoreCoverage(result, intent.expected_components),
    retrieval: await scoreRetrieval(intent.intent, intent.expected_chunk),
    render: scoreRenderFidelity(result),
  };

  const { id, kind, intent: text } = intent;
  return {
    id,
    kind,
    intent: text,
    source: result.source,
    elapsedMs,
    ...scores,
    score: compositeScore(scores),
    warnings: result.warnings ?? [],
  };
}
186
+
187
+ // ─────────────────────────────────────────────────────────────────
188
+ // Reporter
189
+ // ─────────────────────────────────────────────────────────────────
190
+
191
/**
 * Render the eval results as a Markdown report.
 *
 * The report has a title, an aggregate line, one table row per intent, and
 * a `## Warnings` section when at least one intent carries warnings.
 * Missing `source`, `retrieval` or `render` values are shown as `—`.
 *
 * @param {Array<object>} results Rows as produced by `evalIntent`.
 * @param {string} mode `'real-llm'` or anything else (shown as stub).
 * @returns {string} Markdown text without a trailing newline.
 */
function reportText(results, mode) {
  const total = results.length;
  const scoreSum = results.reduce((sum, r) => sum + r.score, 0);
  // With no results the mean is 0/0; report 0 instead of printing "NaN".
  const avg = total > 0 ? Math.round(scoreSum / total) : 0;
  const passing = results.filter((r) => r.score >= PASS_THRESHOLD).length;

  const out = [
    `# compose_from_chunks eval — ${mode === 'real-llm' ? 'real LLM' : 'stub (retrieval only)'}`,
    '',
    `Aggregate: avg **${avg}**, passing **${passing} / ${total}** (threshold ${PASS_THRESHOLD}).`,
    '',
    '| ID | Kind | Source | Struct | Cov | Retr | Render | Score | ms |',
    '|---|---|---|---:|---:|---:|---:|---:|---:|',
  ];
  for (const r of results) {
    out.push(`| ${r.id} | ${r.kind} | ${r.source ?? '—'} | ${r.structural} | ${r.coverage} | ${r.retrieval ?? '—'} | ${r.render ?? '—'} | **${r.score}** | ${r.elapsedMs} |`);
  }

  const withWarnings = results.filter((r) => r.warnings.length > 0);
  if (withWarnings.length > 0) {
    out.push('', '## Warnings');
    for (const r of withWarnings) {
      out.push(`- **${r.id}**: ${r.warnings.join('; ')}`);
    }
  }
  return out.join('\n');
}
214
+
215
/**
 * Render the eval results as a pretty-printed JSON report.
 *
 * @param {Array<object>} results Rows as produced by `evalIntent`.
 * @param {string} mode Run mode, copied into the report unchanged.
 * @returns {string} JSON (2-space indent) with `mode`, `threshold`, `avg`,
 *   `passing`, `total` and the `results` array.
 */
function reportJSON(results, mode) {
  const total = results.length;
  const scoreSum = results.reduce((sum, r) => sum + r.score, 0);
  // With no results the mean is NaN, which JSON.stringify emits as null;
  // report a numeric 0 so `avg` is always a number.
  const avg = total > 0 ? Math.round(scoreSum / total) : 0;
  const passing = results.filter((r) => r.score >= PASS_THRESHOLD).length;
  const report = {
    mode,
    threshold: PASS_THRESHOLD,
    avg,
    passing,
    total,
    results,
  };
  return JSON.stringify(report, null, 2);
}
227
+
228
+ // ─────────────────────────────────────────────────────────────────
229
+ // Main
230
+ // ─────────────────────────────────────────────────────────────────
231
+
232
/**
 * Entry point: load the hold-out set, evaluate every intent, emit the
 * report, and exit with the gate status.
 *
 * Intents are evaluated one at a time, with a progress line on stderr
 * before each one. The report goes to stdout, or with `--report-file` to
 * `docs/reports/eval-compose-from-chunks-<YYYY-MM-DD>.<ext>` under the repo
 * root, where `<ext>` is `json` when `--json` is set and `md` otherwise.
 *
 * Exit code: 0 when the average score is at least `PASS_THRESHOLD`,
 * otherwise 1 (including when no intents were evaluated).
 *
 * @returns {Promise<void>} Does not resolve in practice; the process exits.
 */
async function main() {
  const intents = loadHoldOut();
  const llmAdapter = await buildLLMAdapter();
  const mode = FLAG_REAL_LLM ? 'real-llm' : 'stub';

  const results = [];
  for (const intent of intents) {
    process.stderr.write(`▶ ${intent.id} ${intent.kind.padEnd(7)} ${intent.intent.slice(0, 50)}...\n`);
    results.push(await evalIntent(intent, llmAdapter));
  }

  const output = FLAG_JSON ? reportJSON(results, mode) : reportText(results, mode);

  if (FLAG_REPORT) {
    const date = new Date().toISOString().slice(0, 10);
    const dir = path.join(REPO_ROOT, 'docs/reports');
    // A recursive mkdir is a no-op when the directory already exists.
    fs.mkdirSync(dir, { recursive: true });
    // Match the extension to the content: a JSON report is not Markdown.
    const ext = FLAG_JSON ? 'json' : 'md';
    const file = path.join(dir, `eval-compose-from-chunks-${date}.${ext}`);
    fs.writeFileSync(file, output + '\n');
    console.error(`\nReport written to ${path.relative(REPO_ROOT, file)}`);
  } else {
    console.log(output);
  }

  // Unrounded mean decides the gate; an empty run (NaN) fails it.
  const avg = results.reduce((sum, r) => sum + r.score, 0) / results.length;
  process.exit(avg >= PASS_THRESHOLD ? 0 : 1);
}
260
+
261
// Any rejection from main() is a runner failure, not an eval result: report
// it on stderr and exit with 2 (distinct from the 0/1 gate codes).
main().catch((err) => {
  // A rejection value need not be an Error; fall back to the value itself so
  // the handler cannot throw on null/undefined or print "undefined".
  console.error('eval-compose-from-chunks failed:', err?.message ?? err);
  process.exit(2);
});
@@ -138,7 +138,7 @@ function componentsToHTML(comps) {
138
138
  Stat: 'stat-ui', Table: 'table-ui', Chart: 'chart-ui',
139
139
  List: 'list-ui', Pagination: 'pagination-ui',
140
140
  // Navigation
141
- Tabs: 'tabs-ui', Tab: 'tab-ui', Nav: 'nav-n',
141
+ Tabs: 'tabs-ui', Tab: 'tab-ui', Nav: 'nav-ui',
142
142
  Breadcrumb: 'breadcrumb-ui', SegmentedControl: 'segmented-ui', Segment: 'segment-ui',
143
143
  // Overlay
144
144
  Modal: 'modal-ui', Drawer: 'drawer-ui', Popover: 'popover-ui',
@@ -170,7 +170,7 @@ function componentsToHTML(comps) {
170
170
  return `${indent}<${tag}${attrStr} nomargin>${c.textContent || ''}</${tag}>`;
171
171
  }
172
172
 
173
- const tag = TAG_MAP[c.component] || c.component.toLowerCase() + '-n';
173
+ const tag = TAG_MAP[c.component] || c.component.toLowerCase() + '-ui';
174
174
  const skip = new Set(['id', 'component', 'children', 'textContent']);
175
175
  const attrs = Object.entries(c)
176
176
  .filter(([k]) => !skip.has(k))