npm - @adia-ai/a2ui-mcp - Versions diffs - 0.1.2 → 0.1.3 - Mend

@adia-ai/a2ui-mcp 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

package/CHANGELOG.md +38 -0
package/evals/compose-from-chunks-holdout.jsonl +20 -0
package/package.json +2 -1
package/scripts/eval-compose-from-chunks.mjs +264 -0
package/scripts/generate.mjs +2 -2

package/CHANGELOG.md CHANGED Viewed

@@ -9,6 +9,44 @@ zettel strategies.
 ## [Unreleased]
+_No pending changes._
+## [0.1.3] - 2026-05-02
+Additive — `compose_from_chunks` eval runner + hold-out set. No
+BREAKING changes.
+### Added
+- **`compose_from_chunks` eval runner + hold-out set**. Closes the
+  spec at
+  [`docs/specs/compose-from-chunks-eval.md`](../../../docs/specs/compose-from-chunks-eval.md):
+  - `evals/compose-from-chunks-holdout.jsonl` — 20 intents (10
+    single-turn compose + 10 multi-turn refine) across 10
+    categories (data-display, forms, layout, data, data-viz,
+    agent, overlay, chat, settings, display).
+  - `scripts/eval-compose-from-chunks.mjs` — runner with `--stub`
+    (default; retrieval-only, no API calls) and `--real-llm`
+    (wires Anthropic SDK) modes. Composite scoring (structural
+    30 + coverage 20 + retrieval 20 + render 30, with null-
+    pro-rata redistribution when render is deferred). `--json`
+    + `--report-file` outputs. Threshold 80; exits 1 if avg < 80.
+  - `npm run eval:compose-from-chunks` exposes the runner.
+  - Stub baseline: 9 of 20 intents retrievable directly (avg
+    ~50-70 each); 11 need synthesis (correctly warn without
+    `--real-llm`). Aggregate ~26 / 100 stub. Real-LLM mode is
+    the gating signal for the `chunk-zettel` engine promotion.
+  Render-fidelity component is DEFERRED — needs Playwright
+  headless render + console-error capture wiring; current
+  composite drops the render weight pro-rata across
+  structural/coverage/retrieval until that lands.
+### Changed
+- `package.json` `files:` array now includes `evals/` so the hold-out
+  set ships in the published tarball.
 ---
 ## [0.1.2] - 2026-05-01

package/evals/compose-from-chunks-holdout.jsonl ADDED Viewed

@@ -0,0 +1,20 @@
+{"id":"intent-001","kind":"compose","category":"data-display","intent":"kpi grid with 4 stat cards: users, revenue, sessions, churn","expected_components":["Card","Stat","Grid"],"expected_chunk":"kpi-grid-4-card"}
+{"id":"intent-002","kind":"compose","category":"forms","intent":"sign-in form with email + password + 'forgot password' link","expected_components":["Card","Input","Button","Field"],"expected_chunk":"auth-sign-in"}
+{"id":"intent-003","kind":"compose","category":"layout","intent":"settings page with three tabs (general, integrations, billing)","expected_components":["Tabs","Tab","Card","Section"],"expected_chunk":"settings-tabs-3"}
+{"id":"intent-004","kind":"compose","category":"data","intent":"data table of users with role badge + last-active timestamp","expected_components":["Table","Badge"],"expected_chunk":"users-table"}
+{"id":"intent-005","kind":"compose","category":"data-viz","intent":"conversion funnel chart over 6 stages, with drop-off labels","expected_components":["Chart","Card","ChartLegend"],"expected_chunk":"conversion-funnel"}
+{"id":"intent-006","kind":"compose","category":"agent","intent":"agent activity feed with reasoning steps + final artifact","expected_components":["AgentTrace","AgentReasoning","AgentArtifact"],"expected_chunk":"agent-activity-feed"}
+{"id":"intent-007","kind":"compose","category":"layout","intent":"split-pane editor: code on the left, preview on the right","expected_components":["EditorShell","Pane","Code"],"expected_chunk":"editor-split"}
+{"id":"intent-008","kind":"compose","category":"overlay","intent":"command palette modal with grouped results (recent, suggestions)","expected_components":["Command","Modal"],"expected_chunk":"command-grouped"}
+{"id":"intent-009","kind":"compose","category":"forms","intent":"registration step 2 of 5 — profile setup with 4 fields","expected_components":["Card","StepProgress","Field","Input"],"expected_chunk":"reg-step-shell"}
+{"id":"intent-010","kind":"compose","category":"layout","intent":"404 error page with breadcrumb + back-to-home link","expected_components":["Card","Breadcrumb","Button"],"expected_chunk":"error-404"}
+{"id":"intent-011","kind":"refine","category":"data-display","intent":"dashboard for project metrics","refine":"add a date-range filter at the top","expected_components":["Card","Stat","Select"],"expected_chunk":"project-dashboard"}
+{"id":"intent-012","kind":"refine","category":"display","intent":"user profile card","refine":"make the email editable inline","expected_components":["Card","Avatar","Input"],"expected_chunk":"user-profile-card"}
+{"id":"intent-013","kind":"refine","category":"data","intent":"kanban board with 3 columns","refine":"add a count badge to each column header","expected_components":["Card","Badge","Header"],"expected_chunk":"kanban-3col"}
+{"id":"intent-014","kind":"refine","category":"chat","intent":"chat surface with streaming reply","refine":"add a stop button while streaming","expected_components":["ChatShell","Button","ChatInput"],"expected_chunk":"chat-streaming"}
+{"id":"intent-015","kind":"refine","category":"forms","intent":"sign-up form with email + password","refine":"add password strength meter","expected_components":["Card","Input","Progress"],"expected_chunk":"auth-sign-up"}
+{"id":"intent-016","kind":"refine","category":"settings","intent":"settings tab for notifications","refine":"split email + push into separate sections","expected_components":["Card","Section","Switch"],"expected_chunk":"settings-notifications"}
+{"id":"intent-017","kind":"refine","category":"data","intent":"table of orders","refine":"add a bulk-action toolbar above the table","expected_components":["Table","TableToolbar","Button"],"expected_chunk":"orders-table"}
+{"id":"intent-018","kind":"refine","category":"agent","intent":"agent reasoning panel","refine":"collapse intermediate steps by default, expandable","expected_components":["AgentReasoning","Accordion"],"expected_chunk":"agent-reasoning-collapsed"}
+{"id":"intent-019","kind":"refine","category":"overlay","intent":"modal confirming destructive action","refine":"require typing the resource name to confirm","expected_components":["Modal","Input","Button"],"expected_chunk":"destructive-confirm"}
+{"id":"intent-020","kind":"refine","category":"display","intent":"marketing landing hero","refine":"add a secondary 'see demo' CTA","expected_components":["Card","Heading","Button"],"expected_chunk":"marketing-hero"}

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@adia-ai/a2ui-mcp",
-  "version": "0.1.2",
+  "version": "0.1.3",
   "description": "AdiaUI A2UI MCP server. Exposes the compose engine over MCP with an engine selector for monolithic + zettel strategies.",
   "type": "module",
   "bin": {
@@ -10,6 +10,7 @@
     "server.js",
     "tools/",
     "scripts/",
+    "evals/",
     "personas/",
     "README.md",
     "CHANGELOG.md"

package/scripts/eval-compose-from-chunks.mjs ADDED Viewed

@@ -0,0 +1,264 @@
+#!/usr/bin/env node
+/**
+ * eval-compose-from-chunks.mjs — Hold-out eval for the chunk-aware
+ * synthesizer. Per `docs/specs/compose-from-chunks-eval.md`.
+ *
+ * Reads `packages/a2ui/mcp/evals/compose-from-chunks-holdout.jsonl`,
+ * runs each intent through `composeFromIntent`, and emits a per-intent
+ * + aggregate report.
+ *
+ * Two modes:
+ *   --stub      (default) — retrieval-only path, no LLM calls. Fast;
+ *                produces a baseline that exercises the chunk-corpus +
+ *                retrieval scoring without spending API budget. Use to
+ *                verify the runner shape + the corpus surface.
+ *   --real-llm  — wires the LLM bridge for synthesis-tier composition.
+ *                Costs ~$2 per full eval at Claude Sonnet 4.6 pricing.
+ *                Requires `ANTHROPIC_API_KEY` in env.
+ *
+ * Scoring (composite, 0-100):
+ *   - Structural (30%) — composition emitted (html non-null OR plan
+ *     non-null).
+ *   - Coverage   (20%) — percentage of the intent's
+ *     `expected_components` that appear in the emitted markup.
+ *   - Retrieval  (20%) — top-5 retrieved chunks include the intent's
+ *     `expected_chunk` (when set; otherwise this component is left
+ *     out of the composite). Soft-asserted; absence flags but
+ *     doesn't fail.
+ *   - Render fidelity (30%) — DEFERRED. Real implementation needs
+ *     Playwright headless render + console-error capture; this runner
+ *     emits a placeholder pending the render-fidelity smoke wiring.
+ *     The composite re-distributes its weight pro-rata across the
+ *     three remaining components when render is null.
+ *
+ * Exit:
+ *   0 if avg ≥ 80 (passes the chunk-zettel promotion gate threshold)
+ *   1 if avg < 80
+ *   2 on a fatal error, or when --real-llm is set without ANTHROPIC_API_KEY
+ *
+ * Usage:
+ *   npm run eval:compose-from-chunks                  # stub mode
+ *   npm run eval:compose-from-chunks -- --real-llm    # real-LLM
+ *   npm run eval:compose-from-chunks -- --limit 5     # first 5 intents
+ *   npm run eval:compose-from-chunks -- --json        # JSON report
+ *   npm run eval:compose-from-chunks -- --report-file # write to docs/reports/
+ */
+import '../../../../scripts/load-env.mjs';
+import fs from 'node:fs';
+import path from 'node:path';
+import { fileURLToPath } from 'node:url';
+import { composeFromIntent } from '../../compose/engines/zettel/chunk-synthesizer.js';
+import { searchChunksAsync } from '../../corpus/scripts/chunk-library.js';
+// Repo root is resolved four directories above this script's own directory.
+const REPO_ROOT = path.resolve(path.dirname(fileURLToPath(import.meta.url)), '../../../..');
+const HOLDOUT = path.join(REPO_ROOT, 'packages/a2ui/mcp/evals/compose-from-chunks-holdout.jsonl');
+// Minimum average composite score for a passing (exit 0) run.
+const PASS_THRESHOLD = 80;
+const args = process.argv.slice(2);
+const FLAG_REAL_LLM = args.includes('--real-llm');
+const FLAG_JSON     = args.includes('--json');
+const FLAG_REPORT   = args.includes('--report-file');
+// `--limit N` keeps only the first N intents. A missing, non-numeric, zero or
+// negative N is normalised to null ("no limit") so it can never reach
+// Array.prototype.slice as NaN or as a negative end index.
+const limitIdx = args.indexOf('--limit');
+const limitArg = limitIdx >= 0 ? Number.parseInt(args[limitIdx + 1], 10) : NaN;
+const LIMIT    = Number.isInteger(limitArg) && limitArg > 0 ? limitArg : null;
+// ─────────────────────────────────────────────────────────────────
+// Hold-out loader
+// ─────────────────────────────────────────────────────────────────
+/**
+ * Read the hold-out intents from the JSONL file at `HOLDOUT`.
+ *
+ * Each non-blank line is parsed as one JSON document; blank lines are
+ * skipped. The list is cut to the first `LIMIT` entries only when `LIMIT`
+ * is a positive integer.
+ *
+ * @returns {object[]} Parsed intents in file order.
+ * @throws {Error} When a line is not valid JSON. The message names the file
+ *   and the 1-based line number; the parser's error is attached as `cause`.
+ */
+function loadHoldOut() {
+  const raw = fs.readFileSync(HOLDOUT, 'utf8');
+  const intents = [];
+  for (const [idx, line] of raw.split('\n').entries()) {
+    const text = line.trim();
+    if (!text) continue;
+    try {
+      intents.push(JSON.parse(text));
+    } catch (err) {
+      // A bare SyntaxError gives no hint which of the rows is broken.
+      throw new Error(`${HOLDOUT}:${idx + 1}: invalid JSON in hold-out set (${err.message})`, { cause: err });
+    }
+  }
+  // Guard against NaN / negative limits: slice(0, -n) would silently drop
+  // rows from the end instead of limiting from the start.
+  return Number.isInteger(LIMIT) && LIMIT > 0 ? intents.slice(0, LIMIT) : intents;
+}
+// ─────────────────────────────────────────────────────────────────
+// LLM adapter
+// ─────────────────────────────────────────────────────────────────
+/**
+ * Create the adapter object that is passed to `composeFromIntent` as
+ * `llmAdapter`.
+ *
+ * Resolves to null when `--real-llm` was not given. Otherwise it needs
+ * `ANTHROPIC_API_KEY` in the environment; without it the process prints an
+ * error and exits with code 2.
+ *
+ * @returns {Promise<null | { complete: Function }>} null in stub mode, else
+ *   an object whose `complete({ system, user, model, maxTokens })` resolves
+ *   to `{ text }`.
+ */
+async function buildLLMAdapter() {
+  if (!FLAG_REAL_LLM) {
+    return null;
+  }
+  const { ANTHROPIC_API_KEY: apiKey } = process.env;
+  if (!apiKey) {
+    console.error('--real-llm requires ANTHROPIC_API_KEY in env.');
+    process.exit(2);
+  }
+  // Imported only on this path, after the stub-mode early return.
+  const sdk = await import('@anthropic-ai/sdk');
+  const Anthropic = sdk.default;
+  const client = new Anthropic({ apiKey });
+  const complete = async ({ system, user, model = 'claude-sonnet-4-6', maxTokens = 2048 }) => {
+    const request = {
+      model,
+      max_tokens: maxTokens,
+      system,
+      messages: [{ role: 'user', content: user }],
+    };
+    const resp = await client.messages.create(request);
+    // Keep only the text blocks of the reply and glue them together.
+    const pieces = [];
+    for (const block of resp.content) {
+      if (block.type === 'text') pieces.push(block.text);
+    }
+    return { text: pieces.join('') };
+  };
+  return { complete };
+}
+// ─────────────────────────────────────────────────────────────────
+// Scoring
+// ─────────────────────────────────────────────────────────────────
+/**
+ * Structural component of the composite score.
+ *
+ * @param {object} result Composition result.
+ * @returns {number} 100 when `result.html` or `result.plan` is truthy, else 0.
+ */
+function scoreStructural(result) {
+  const emitted = Boolean(result.html) || Boolean(result.plan);
+  return emitted ? 100 : 0;
+}
+/**
+ * Coverage component: the share of expected components whose tag shows up
+ * in the emitted markup.
+ *
+ * A component name is lower-cased, a leading `ui` is dropped, and the result
+ * is looked up as an opening `<name-ui` tag (case-insensitive) followed by
+ * whitespace, `>` or `/`.
+ *
+ * NOTE(review): this assumes every component renders as `<lowercased-ui>`;
+ * components whose real tag is not derived that way would never match —
+ * confirm against the tag map used by the generator.
+ *
+ * @param {object} result Composition result; only `result.html` is read.
+ * @param {string[]} [expected] Component names to look for.
+ * @returns {number} 0-100, rounded; 100 when nothing is expected.
+ */
+function scoreCoverage(result, expected) {
+  if (!expected || expected.length === 0) return 100;
+  const html = String(result.html || '');
+  let hits = 0;
+  for (const tag of expected) {
+    const lowered = String(tag).toLowerCase().replace(/^ui/, '');
+    // Escape regex metacharacters so a name such as "c++" or "a.b" is
+    // matched literally instead of throwing or matching unrelated tags.
+    const escaped = lowered.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
+    // `/` additionally accepts the `<x-ui/>` spelling.
+    const re = new RegExp(`<${escaped}-ui[\\s/>]`, 'i');
+    if (re.test(html)) hits++;
+  }
+  return Math.round((hits / expected.length) * 100);
+}
+/**
+ * Retrieval component: checks whether the expected chunk is among the
+ * results of `searchChunksAsync(intent, { limit: 5 })`.
+ *
+ * @param {string} intent Intent text used as the search query.
+ * @param {string} [expectedChunk] Chunk name that should be retrieved.
+ * @returns {Promise<number|null>} null when no chunk is expected, 100 when a
+ *   hit's `name` equals `expectedChunk`, otherwise 0.
+ */
+async function scoreRetrieval(intent, expectedChunk) {
+  if (!expectedChunk) {
+    return null;
+  }
+  const topChunks = await searchChunksAsync(intent, { limit: 5 });
+  const matched = topChunks.some(({ name }) => name === expectedChunk);
+  if (matched) {
+    return 100;
+  }
+  return 0;
+}
+/**
+ * Render-fidelity component — DEFERRED (Playwright headless render +
+ * console-error capture). See spec § Out-of-band for the follow-up.
+ *
+ * @param {object} _result Composition result (currently unused).
+ * @returns {null} Always null, i.e. "not measured".
+ */
+function scoreRenderFidelity(_result) {
+  const notMeasured = null;
+  return notMeasured;
+}
+/**
+ * Weighted average of the score components (structural 30, coverage 20,
+ * retrieval 20, render 30). Components whose value is null are left out and
+ * the average is taken over the weights that remain.
+ *
+ * @param {{structural: ?number, coverage: ?number, retrieval: ?number, render: ?number}} parts
+ * @returns {number} Rounded weighted average; 0 when every component is null.
+ */
+function compositeScore({ structural, coverage, retrieval, render }) {
+  const weightedParts = [
+    [structural, 30],
+    [coverage, 20],
+    [retrieval, 20],
+    [render, 30],
+  ];
+  let weightSum = 0;
+  let scoreSum = 0;
+  for (const [value, weight] of weightedParts) {
+    if (value === null) continue;
+    weightSum += weight;
+    scoreSum += value * weight;
+  }
+  if (weightSum > 0) {
+    return Math.round(scoreSum / weightSum);
+  }
+  return 0;
+}
+// ─────────────────────────────────────────────────────────────────
+// Eval loop
+// ─────────────────────────────────────────────────────────────────
+/**
+ * Run one hold-out intent through `composeFromIntent` and score the result.
+ *
+ * NOTE(review): only `intent.intent` is forwarded to the composer; a
+ * `refine` field on the intent record is not read here — confirm that is
+ * intended for refine-kind intents.
+ *
+ * @param {object} intent Hold-out record (`id`, `kind`, `intent`,
+ *   `expected_components`, `expected_chunk`).
+ * @param {object|null} llmAdapter Adapter passed straight to the composer.
+ * @returns {Promise<object>} Per-intent row: identifiers, `source`,
+ *   `elapsedMs` (compose call only), the four component scores, the
+ *   composite `score`, and `warnings` (empty array when the result has none).
+ */
+async function evalIntent(intent, llmAdapter) {
+  const {
+    id,
+    kind,
+    intent: prompt,
+    expected_components: expectedComponents,
+    expected_chunk: expectedChunk,
+  } = intent;
+  // Time just the composition call, not the scoring that follows.
+  const startedAt = performance.now();
+  const result = await composeFromIntent({ intent: prompt, llmAdapter, maxAttempts: 2 });
+  const elapsedMs = Math.round(performance.now() - startedAt);
+  const parts = {
+    structural: scoreStructural(result),
+    coverage: scoreCoverage(result, expectedComponents),
+    retrieval: await scoreRetrieval(prompt, expectedChunk),
+    render: scoreRenderFidelity(result),
+  };
+  return {
+    id,
+    kind,
+    intent: prompt,
+    source: result.source,
+    elapsedMs,
+    ...parts,
+    score: compositeScore(parts),
+    warnings: result.warnings ?? [],
+  };
+}
+// ─────────────────────────────────────────────────────────────────
+// Reporter
+// ─────────────────────────────────────────────────────────────────
+/**
+ * Render the eval results as a Markdown report: a heading, an aggregate
+ * line, one table row per intent, and a "Warnings" section when any result
+ * carries warnings.
+ *
+ * @param {object[]} results Rows produced by `evalIntent`.
+ * @param {string} mode `'real-llm'` or anything else (reported as stub).
+ * @returns {string} Markdown text without a trailing newline.
+ */
+function reportText(results, mode) {
+  // An empty result set would divide by zero and print "NaN"; report 0.
+  const total = results.reduce((s, r) => s + r.score, 0);
+  const avg = results.length > 0 ? Math.round(total / results.length) : 0;
+  const passing = results.filter((r) => r.score >= PASS_THRESHOLD).length;
+  const out = [];
+  out.push(`# compose_from_chunks eval — ${mode === 'real-llm' ? 'real LLM' : 'stub (retrieval only)'}`);
+  out.push('');
+  out.push(`Aggregate: avg **${avg}**, passing **${passing} / ${results.length}** (threshold ${PASS_THRESHOLD}).`);
+  out.push('');
+  out.push('| ID | Kind | Source | Struct | Cov | Retr | Render | Score | ms |');
+  out.push('|---|---|---|---:|---:|---:|---:|---:|---:|');
+  for (const r of results) {
+    // Null / undefined cells (unscored components, missing source) show as "—".
+    out.push(`| ${r.id} | ${r.kind} | ${r.source ?? '—'} | ${r.structural} | ${r.coverage} | ${r.retrieval ?? '—'} | ${r.render ?? '—'} | **${r.score}** | ${r.elapsedMs} |`);
+  }
+  if (results.some((r) => r.warnings.length > 0)) {
+    out.push('');
+    out.push('## Warnings');
+    for (const r of results) {
+      if (r.warnings.length === 0) continue;
+      out.push(`- **${r.id}**: ${r.warnings.join('; ')}`);
+    }
+  }
+  return out.join('\n');
+}
+/**
+ * Serialise the eval results as pretty-printed JSON.
+ *
+ * @param {object[]} results Rows produced by `evalIntent`.
+ * @param {string} mode Run mode, copied into the report as-is.
+ * @returns {string} JSON with `mode`, `threshold`, `avg`, `passing`,
+ *   `total` and the per-intent `results`.
+ */
+function reportJSON(results, mode) {
+  // With no results the average would be NaN, which JSON.stringify turns
+  // into `null`; report a numeric 0 instead.
+  const sum = results.reduce((s, r) => s + r.score, 0);
+  const avg = results.length > 0 ? Math.round(sum / results.length) : 0;
+  const passing = results.filter((r) => r.score >= PASS_THRESHOLD).length;
+  return JSON.stringify({
+    mode,
+    threshold: PASS_THRESHOLD,
+    avg,
+    passing,
+    total: results.length,
+    results,
+  }, null, 2);
+}
+// ─────────────────────────────────────────────────────────────────
+// Main
+// ─────────────────────────────────────────────────────────────────
+/**
+ * Entry point: load the hold-out set, evaluate every intent in order, emit
+ * the report, and exit with the gate status.
+ *
+ * Progress lines go to stderr. The report goes to stdout, or — with
+ * `--report-file` — to `docs/reports/eval-compose-from-chunks-<date>.<ext>`
+ * under the repo root, where `<ext>` is `json` with `--json` and `md`
+ * otherwise.
+ *
+ * Exits 0 when the unrounded average score is at least `PASS_THRESHOLD`,
+ * else 1 (an empty result set counts as average 0).
+ */
+async function main() {
+  const intents = loadHoldOut();
+  const llmAdapter = await buildLLMAdapter();
+  const mode = FLAG_REAL_LLM ? 'real-llm' : 'stub';
+  const results = [];
+  // Intents are evaluated one at a time, in file order.
+  for (const intent of intents) {
+    process.stderr.write(`▶ ${intent.id} ${intent.kind.padEnd(7)} ${intent.intent.slice(0, 50)}...\n`);
+    const result = await evalIntent(intent, llmAdapter);
+    results.push(result);
+  }
+  const output = FLAG_JSON ? reportJSON(results, mode) : reportText(results, mode);
+  if (FLAG_REPORT) {
+    const date = new Date().toISOString().slice(0, 10);
+    const dir = path.join(REPO_ROOT, 'docs/reports');
+    // `recursive: true` makes this a no-op when the directory already exists.
+    fs.mkdirSync(dir, { recursive: true });
+    // Match the extension to the payload so a JSON report is not saved as `.md`.
+    const ext = FLAG_JSON ? 'json' : 'md';
+    const file = path.join(dir, `eval-compose-from-chunks-${date}.${ext}`);
+    fs.writeFileSync(file, output + '\n');
+    console.error(`\nReport written to ${path.relative(REPO_ROOT, file)}`);
+  } else {
+    console.log(output);
+  }
+  // Avoid 0/0 = NaN when nothing was evaluated; treat it as a failing run.
+  const avg = results.length > 0
+    ? results.reduce((s, r) => s + r.score, 0) / results.length
+    : 0;
+  process.exit(avg >= PASS_THRESHOLD ? 0 : 1);
+}
+// Any failure that escapes main() is reported on stderr and mapped to exit
+// code 2. The rejection value may be something other than an Error (a
+// string, null, ...), so fall back to the value itself when it has no
+// `message` instead of throwing inside the handler or printing "undefined".
+main().catch((err) => {
+  console.error('eval-compose-from-chunks failed:', err?.message ?? err);
+  process.exit(2);
+});

package/scripts/generate.mjs CHANGED Viewed

@@ -138,7 +138,7 @@ function componentsToHTML(comps) {
     Stat: 'stat-ui', Table: 'table-ui', Chart: 'chart-ui',
     List: 'list-ui', Pagination: 'pagination-ui',
     // Navigation
-    Tabs: 'tabs-ui', Tab: 'tab-ui', Nav: 'nav-n',
+    Tabs: 'tabs-ui', Tab: 'tab-ui', Nav: 'nav-ui',
     Breadcrumb: 'breadcrumb-ui', SegmentedControl: 'segmented-ui', Segment: 'segment-ui',
     // Overlay
     Modal: 'modal-ui', Drawer: 'drawer-ui', Popover: 'popover-ui',
@@ -170,7 +170,7 @@ function componentsToHTML(comps) {
       return `${indent}<${tag}${attrStr} nomargin>${c.textContent || ''}</${tag}>`;
     }
-    const tag = TAG_MAP[c.component] || c.component.toLowerCase() + '-n';
+    const tag = TAG_MAP[c.component] || c.component.toLowerCase() + '-ui';
     const skip = new Set(['id', 'component', 'children', 'textContent']);
     const attrs = Object.entries(c)
       .filter(([k]) => !skip.has(k))