@urbicon-ui/mcp-server 6.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (130)
  1. package/README.md +161 -0
  2. package/dist/data/catalog-loader.d.ts +37 -0
  3. package/dist/data/catalog-loader.d.ts.map +1 -0
  4. package/dist/data/catalog-loader.js +15 -0
  5. package/dist/data/catalog-loader.js.map +1 -0
  6. package/dist/data/component-loader.d.ts +2 -0
  7. package/dist/data/component-loader.d.ts.map +1 -0
  8. package/dist/data/component-loader.js +17 -0
  9. package/dist/data/component-loader.js.map +1 -0
  10. package/dist/data/recipe-loader.d.ts +4 -0
  11. package/dist/data/recipe-loader.d.ts.map +1 -0
  12. package/dist/data/recipe-loader.js +102 -0
  13. package/dist/data/recipe-loader.js.map +1 -0
  14. package/dist/data/template-loader.d.ts +8 -0
  15. package/dist/data/template-loader.d.ts.map +1 -0
  16. package/dist/data/template-loader.js +33 -0
  17. package/dist/data/template-loader.js.map +1 -0
  18. package/dist/index.d.ts +3 -0
  19. package/dist/index.d.ts.map +1 -0
  20. package/dist/index.js +57 -0
  21. package/dist/index.js.map +1 -0
  22. package/dist/resources/catalog.d.ts +3 -0
  23. package/dist/resources/catalog.d.ts.map +1 -0
  24. package/dist/resources/catalog.js +20 -0
  25. package/dist/resources/catalog.js.map +1 -0
  26. package/dist/resources/component.d.ts +3 -0
  27. package/dist/resources/component.d.ts.map +1 -0
  28. package/dist/resources/component.js +29 -0
  29. package/dist/resources/component.js.map +1 -0
  30. package/dist/resources/guides.d.ts +3 -0
  31. package/dist/resources/guides.d.ts.map +1 -0
  32. package/dist/resources/guides.js +36 -0
  33. package/dist/resources/guides.js.map +1 -0
  34. package/dist/server.d.ts +3 -0
  35. package/dist/server.d.ts.map +1 -0
  36. package/dist/server.js.map +1 -0
  37. package/dist/tools/find-components.d.ts +3 -0
  38. package/dist/tools/find-components.d.ts.map +1 -0
  39. package/dist/tools/find-components.js +21 -0
  40. package/dist/tools/find-components.js.map +1 -0
  41. package/dist/tools/get-recipe.d.ts +3 -0
  42. package/dist/tools/get-recipe.d.ts.map +1 -0
  43. package/dist/tools/get-recipe.js +48 -0
  44. package/dist/tools/get-recipe.js.map +1 -0
  45. package/dist/tools/suggest-implementation.d.ts +3 -0
  46. package/dist/tools/suggest-implementation.d.ts.map +1 -0
  47. package/dist/tools/suggest-implementation.js +178 -0
  48. package/dist/tools/suggest-implementation.js.map +1 -0
  49. package/dist/transports/http.d.ts +2 -0
  50. package/dist/transports/http.d.ts.map +1 -0
  51. package/dist/transports/http.js +77 -0
  52. package/dist/transports/http.js.map +1 -0
  53. package/dist/transports/stdio.d.ts +3 -0
  54. package/dist/transports/stdio.d.ts.map +1 -0
  55. package/dist/transports/stdio.js +6 -0
  56. package/dist/transports/stdio.js.map +1 -0
  57. package/dist/tsconfig.tsbuildinfo +1 -0
  58. package/dist/utils/format-catalog.d.ts +7 -0
  59. package/dist/utils/format-catalog.d.ts.map +1 -0
  60. package/dist/utils/format-catalog.js +93 -0
  61. package/dist/utils/format-catalog.js.map +1 -0
  62. package/dist/utils/paths.d.ts +7 -0
  63. package/dist/utils/paths.d.ts.map +1 -0
  64. package/dist/utils/paths.js +23 -0
  65. package/dist/utils/paths.js.map +1 -0
  66. package/dist/utils/search.d.ts +3 -0
  67. package/dist/utils/search.d.ts.map +1 -0
  68. package/dist/utils/search.js +44 -0
  69. package/dist/utils/search.js.map +1 -0
  70. package/package.json +42 -0
  71. package/src/data/catalog-loader.test.ts +42 -0
  72. package/src/data/catalog-loader.ts +78 -0
  73. package/src/data/component-loader.ts +68 -0
  74. package/src/data/design-system-loader.test.ts +82 -0
  75. package/src/data/design-system-loader.ts +125 -0
  76. package/src/data/icon-loader.test.ts +85 -0
  77. package/src/data/icon-loader.ts +90 -0
  78. package/src/data/recipe-loader.test.ts +49 -0
  79. package/src/data/recipe-loader.ts +131 -0
  80. package/src/data/template-loader.ts +55 -0
  81. package/src/design-linter/heuristics.ts +162 -0
  82. package/src/design-linter/index.ts +14 -0
  83. package/src/design-linter/linter.test.ts +257 -0
  84. package/src/design-linter/linter.ts +62 -0
  85. package/src/design-linter/rules.ts +348 -0
  86. package/src/design-linter/tokens.test.ts +80 -0
  87. package/src/design-linter/tokens.ts +203 -0
  88. package/src/design-linter/types.ts +66 -0
  89. package/src/design-manifest/index.ts +20 -0
  90. package/src/design-manifest/manifest.test.ts +175 -0
  91. package/src/design-manifest/manifest.ts +250 -0
  92. package/src/design-manifest/scan.test.ts +51 -0
  93. package/src/design-manifest/scan.ts +74 -0
  94. package/src/design-manifest/types.ts +40 -0
  95. package/src/design-rubric/rubric.test.ts +43 -0
  96. package/src/design-rubric/rubric.ts +140 -0
  97. package/src/eval/briefs.ts +104 -0
  98. package/src/eval/eval.test.ts +99 -0
  99. package/src/eval/index.ts +11 -0
  100. package/src/eval/score.ts +112 -0
  101. package/src/index.ts +75 -0
  102. package/src/prompts/design-prompts.test.ts +51 -0
  103. package/src/prompts/design-prompts.ts +127 -0
  104. package/src/resources/catalog.ts +23 -0
  105. package/src/resources/guides.ts +60 -0
  106. package/src/server.test.ts +69 -0
  107. package/src/server.ts +48 -0
  108. package/src/tools/find-components.ts +83 -0
  109. package/src/tools/find-icons.ts +77 -0
  110. package/src/tools/get-checklist.ts +139 -0
  111. package/src/tools/get-component.ts +204 -0
  112. package/src/tools/get-css-reference.ts +446 -0
  113. package/src/tools/get-design-context.ts +43 -0
  114. package/src/tools/get-design-principles.ts +72 -0
  115. package/src/tools/get-pattern.ts +69 -0
  116. package/src/tools/get-recipe.ts +80 -0
  117. package/src/tools/record-design-decision.ts +99 -0
  118. package/src/tools/suggest-implementation.ts +251 -0
  119. package/src/tools/sync-design-manifest.ts +92 -0
  120. package/src/tools/validate-design.ts +84 -0
  121. package/src/transports/http.ts +79 -0
  122. package/src/transports/stdio.ts +7 -0
  123. package/src/utils/format-catalog.test.ts +144 -0
  124. package/src/utils/format-catalog.ts +130 -0
  125. package/src/utils/paths.test.ts +101 -0
  126. package/src/utils/paths.ts +78 -0
  127. package/src/utils/search.test.ts +141 -0
  128. package/src/utils/search.ts +106 -0
  129. package/tsconfig.json +27 -0
  130. package/vitest.config.ts +15 -0
@@ -0,0 +1,140 @@
1
+ /**
2
+ * The design-quality scoring rubric — the qualitative half of the design loop
3
+ * (docs/DESIGN-MCP.md, step 3). Where `validate_design` answers "is it correct?"
4
+ * deterministically, the rubric answers "is it good?" through a judge.
5
+ *
6
+ * The eight criteria have been validated empirically against design-quality
7
+ * comparisons, scoring each 1–5 and summing to /40. Keeping the same instrument
8
+ * means new evaluations are directly comparable to that baseline. This is the
9
+ * SINGLE SOURCE for the criteria: the
10
+ * `get_design_principles(as="rubric")` tool renders it to Markdown, and the
11
+ * eval-suite (WP5) imports the same constants to score programmatically.
12
+ */
13
+
14
+ export interface RubricCriterion {
15
+ id: string;
16
+ /** Display name for the criterion. */
17
+ name: string;
18
+ /** One line on what the criterion measures. */
19
+ measures: string;
20
+ /** Anchored descriptions for scores 1, 3 and 5 (the judge interpolates 2 and 4). */
21
+ anchors: { 1: string; 3: string; 5: string };
22
+ }
23
+
24
+ export const RUBRIC_CRITERIA: readonly RubricCriterion[] = [
25
+ {
26
+ id: 'distinctiveness',
27
+ name: 'Design Language Distinctiveness',
28
+ measures: 'Whether the page has its own visual identity or reads as a generic template.',
29
+ anchors: {
30
+ 1: 'The most common layout imaginable — a Tailwind-UI starter with no personality.',
31
+ 3: 'A few custom touches (one heading style, one composition) over conventional bones.',
32
+ 5: 'A coherent, deliberate identity: custom compositions over default components, a consistent typographic voice, signature moments.'
33
+ }
34
+ },
35
+ {
36
+ id: 'color',
37
+ name: 'Color Scheme Coherence',
38
+ measures: 'Whether colour carries meaning or merely decorates.',
39
+ anchors: {
40
+ 1: 'Decorative colour — a rainbow of intents; intent colours where neutral belongs.',
41
+ 3: 'Intent mapping mostly correct, but some decorative or noisy colour remains.',
42
+ 5: 'Neutral surfaces dominate (80–90%); intent colour appears only for genuine status, severity, or action.'
43
+ }
44
+ },
45
+ {
46
+ id: 'spacing',
47
+ name: 'Spacing Consistency',
48
+ measures: 'Whether spacing expresses hierarchy.',
49
+ anchors: {
50
+ 1: 'One uniform rhythm everywhere (e.g. all `space-y-6`).',
51
+ 3: 'Some variation, but no clear within-vs-between system.',
52
+ 5: 'A clear two-tier rhythm (tight within items, generous between sections), with data-driven variation where it helps.'
53
+ }
54
+ },
55
+ {
56
+ id: 'radius',
57
+ name: 'Radius & Shape Language',
58
+ measures: 'Whether shape is a deliberate choice.',
59
+ anchors: {
60
+ 1: 'Zero radius intent — component defaults only, no shape strategy.',
61
+ 3: 'Some radius use, but inconsistent (mixed methods, no hierarchy).',
62
+ 5: 'A deliberate radius hierarchy (e.g. hero > standard > compact) applied consistently via `class`/`slotClasses`.'
63
+ }
64
+ },
65
+ {
66
+ id: 'ux',
67
+ name: 'UX Pattern Originality',
68
+ measures: 'Whether interaction patterns go beyond the textbook.',
69
+ anchors: {
70
+ 1: 'Textbook only — divider lists, stacked buttons, defaults throughout.',
71
+ 3: 'A few genuine UX touches (a thoughtful empty state, a useful affordance).',
72
+ 5: 'Creative, effective patterns that serve the content — original compositions, state-driven layout.'
73
+ }
74
+ },
75
+ {
76
+ id: 'hierarchy',
77
+ name: 'Visual Hierarchy',
78
+ measures: 'Whether the eye is guided to what matters.',
79
+ anchors: {
80
+ 1: 'Everything equally weighted — nothing dominates; labels compete with data.',
81
+ 3: 'Some dominance, but flat regions remain.',
82
+ 5: 'Each section has one clearly dominant element; metadata is recessed; visual weight tracks importance.'
83
+ }
84
+ },
85
+ {
86
+ id: 'cohesion',
87
+ name: 'Overall Design Cohesion',
88
+ measures: 'Whether the page reads as one designed artifact.',
89
+ anchors: {
90
+ 1: 'Cohesive only through sameness, or parts feel grafted on / disconnected.',
91
+ 3: 'Mostly unified, with a section or two that drift.',
92
+ 5: 'A single design DNA — consistent radius, typographic voice, and component logic tie the whole page together.'
93
+ }
94
+ },
95
+ {
96
+ id: 'correctness',
97
+ name: 'Technical Correctness',
98
+ measures: 'Whether the code is valid and uses real component APIs and design tokens.',
99
+ anchors: {
100
+ 1: 'Hallucinated tokens, broken dynamic classes, or wrong component APIs — would not render as intended.',
101
+ 3: 'Largely correct with a few token or API slips.',
102
+ 5: 'Valid semantic tokens, correct Svelte 5 and component APIs, no broken classes. Anchor this with `validate_design` — a passing linter (0 errors/warnings) puts this at 4–5.'
103
+ }
104
+ }
105
+ ];
106
+
107
+ export const MAX_RUBRIC_SCORE = RUBRIC_CRITERIA.length * 5;
108
+
109
/**
 * Build the Markdown version of the rubric that a judge reads
 * (served by `get_design_principles(as="rubric")`).
 *
 * Layout: an intro with the scoring instructions, one numbered section per entry
 * in `RUBRIC_CRITERIA` listing its 1 / 3 / 5 anchors, then a "Using the rubric"
 * guide. The criterion count and the maximum total are interpolated from
 * `RUBRIC_CRITERIA.length` and `MAX_RUBRIC_SCORE`.
 */
export function renderRubric(): string {
	const intro = [
		'# Design-Quality Rubric\n\n',
		`Score a generated UI on each of the ${RUBRIC_CRITERIA.length} criteria from **1 to 5**, then sum to **/${MAX_RUBRIC_SCORE}**. `,
		'For every score, cite specific evidence from the code (a class, a component, a layout choice) — a number without a reason is not a judgement.\n\n',
		'**Before scoring, run `validate_design` on the code.** It deterministically catches the ',
		'correctness failures (hallucinated tokens, broken dynamic classes) that a judge tends to miss, and it anchors the *Technical Correctness* criterion.\n\n'
	];

	// One section per criterion, numbered from 1 in declaration order.
	const sections = RUBRIC_CRITERIA.map((criterion, index) =>
		[
			`## ${index + 1}. ${criterion.name}\n\n`,
			`*${criterion.measures}*\n\n`,
			`- **1** — ${criterion.anchors[1]}\n`,
			`- **3** — ${criterion.anchors[3]}\n`,
			`- **5** — ${criterion.anchors[5]}\n\n`
		].join('')
	);

	const usage = [
		'---\n\n',
		'## Using the rubric\n\n',
		`- **As a single judge:** score all criteria, sum to /${MAX_RUBRIC_SCORE}, and list the two lowest as the concrete revision targets.\n`,
		'- **As a panel (recommended for variant selection):** run one judge per *lens* — correctness, hierarchy, paradigm-fidelity, distinctiveness — rather than N identical judges. Diversity of lens catches failures redundancy cannot.\n',
		'- **For N variants:** score each, pick the winner, then graft the best ideas from the runners-up before a final `validate_design` pass.\n',
		'- **Reward deviation within the rules.** A safe, generic page should not outscore a distinctive one that stays inside the paradigm. Penalise AI-slop sameness on *Distinctiveness* and *UX Pattern Originality*.\n'
	];

	return [...intro, ...sections, ...usage].join('');
}
@@ -0,0 +1,104 @@
1
+ /**
2
+ * The eval-suite design briefs — the standing benchmark for "does a change to the
3
+ * design layers actually improve generation, or just lengthen the context?"
4
+ * (docs/DESIGN-MCP.md, cross-cutting eval). Systematises earlier one-off
5
+ * design-quality comparisons into a repeatable set.
6
+ *
7
+ * Each brief is generated under two conditions (baseline vs. design-MCP loop),
8
+ * then scored by `scoreImplementation` (deterministic linter) and the judge
9
+ * rubric (get_design_principles(as="rubric")). The brief spread covers the page
10
+ * archetypes the composition patterns target, plus a couple of component-dense
11
+ * surfaces where token hallucination tends to spike.
12
+ */
13
+
14
+ export interface EvalBrief {
15
+ id: string;
16
+ title: string;
17
+ /** The prompt handed to the generating instance. */
18
+ prompt: string;
19
+ /** Suggested composition pattern, when one fits (for the design-MCP condition). */
20
+ pattern?: string;
21
+ tags: string[];
22
+ }
23
+
24
+ export const EVAL_BRIEFS: readonly EvalBrief[] = [
25
+ {
26
+ id: 'ops-dashboard',
27
+ title: 'Ops Dashboard',
28
+ prompt:
29
+ 'Build an operations dashboard for a SaaS product: a row of KPI tiles (users, revenue, error rate, latency), a primary chart area, a recent-activity feed, and a quick-actions panel. Some metrics are healthy, some are degraded.',
30
+ pattern: 'dashboard',
31
+ tags: ['dashboard', 'data-dense', 'status']
32
+ },
33
+ {
34
+ id: 'account-settings',
35
+ title: 'Account Settings',
36
+ prompt:
37
+ 'Build an account settings page with sections for Profile, Notifications, and Security. Booleans use toggles, enums use selects, and there is a destructive "Delete account" action separated at the bottom.',
38
+ pattern: 'settings-page',
39
+ tags: ['settings', 'form', 'destructive']
40
+ },
41
+ {
42
+ id: 'signup-wizard',
43
+ title: 'Multi-Step Signup',
44
+ prompt:
45
+ 'Build a three-step signup wizard (account → organisation → review) with inline field validation, a progress indicator, and a sticky continue/back action bar.',
46
+ pattern: 'form-page',
47
+ tags: ['form', 'wizard', 'validation']
48
+ },
49
+ {
50
+ id: 'pricing',
51
+ title: 'Pricing Page',
52
+ prompt:
53
+ 'Build a pricing page with three tiers (Starter, Pro, Enterprise), one highlighted as recommended, each with a feature list and a call-to-action. Include a monthly/yearly toggle.',
54
+ tags: ['marketing', 'cards', 'cta']
55
+ },
56
+ {
57
+ id: 'users-table',
58
+ title: 'Users Admin Table',
59
+ prompt:
60
+ 'Build a users administration screen: a searchable, filterable table of users with avatar, name, role badge, status, and per-row actions (edit, suspend). Include a header with a search field and a "Add user" button.',
61
+ tags: ['data-table', 'admin', 'data-dense']
62
+ },
63
+ {
64
+ id: 'empty-inbox',
65
+ title: 'Empty Inbox State',
66
+ prompt:
67
+ 'Build an empty state for an inbox that has no messages yet: an illustration or icon, a friendly headline, a one-line explanation, and a primary action to compose the first message.',
68
+ tags: ['empty-state', 'feedback']
69
+ },
70
+ {
71
+ id: 'login',
72
+ title: 'Login Page',
73
+ prompt:
74
+ 'Build a login page with email and password fields, a "forgot password" link, a primary sign-in button, and a secondary "Sign in with a passkey" option. Show inline validation on the fields.',
75
+ pattern: 'form-page',
76
+ tags: ['auth', 'form']
77
+ },
78
+ {
79
+ id: 'profile-card',
80
+ title: 'Profile Card',
81
+ prompt:
82
+ 'Build a user profile card: avatar, name, role, a short bio, three stat figures (followers, following, posts), and follow / message actions.',
83
+ tags: ['identity', 'card']
84
+ },
85
+ {
86
+ id: 'checkout-summary',
87
+ title: 'Checkout Summary',
88
+ prompt:
89
+ 'Build a checkout order summary: a list of line items with quantity and price, a promo-code field, a subtotal/tax/total breakdown, and a sticky "Pay now" button. One item is on sale.',
90
+ tags: ['commerce', 'data-dense', 'sticky']
91
+ },
92
+ {
93
+ id: 'notification-center',
94
+ title: 'Notification Center',
95
+ prompt:
96
+ 'Build a notification center panel: notifications grouped by Today / Earlier, each with an icon by type (mention, system, billing), a timestamp, read/unread state, and a dismiss control. Include a "Mark all as read" action.',
97
+ pattern: 'tab-navigation',
98
+ tags: ['feedback', 'list', 'state-driven']
99
+ }
100
+ ];
101
+
102
/** Look up a brief by its `id`; returns `undefined` when no entry in `EVAL_BRIEFS` has that id. */
export function getBriefById(id: string): EvalBrief | undefined {
	for (const brief of EVAL_BRIEFS) {
		if (brief.id === id) {
			return brief;
		}
	}
	return undefined;
}
@@ -0,0 +1,99 @@
1
+ import { describe, expect, it } from 'vitest';
2
+ import { RUBRIC_CRITERIA } from '../design-rubric/rubric.js';
3
+ import { EVAL_BRIEFS, getBriefById } from './briefs.js';
4
+ import type { EvalEntry } from './score.js';
5
+ import { aggregateRubric, formatAbReport, scoreImplementation } from './score.js';
6
+
7
+ describe('eval briefs', () => {
8
+ it('provides ~10 briefs with unique ids', () => {
9
+ expect(EVAL_BRIEFS.length).toBeGreaterThanOrEqual(10);
10
+ expect(new Set(EVAL_BRIEFS.map((b) => b.id)).size).toBe(EVAL_BRIEFS.length);
11
+ });
12
+ it('only references real composition patterns', () => {
13
+ const known = new Set([
14
+ 'dashboard',
15
+ 'form-page',
16
+ 'settings-page',
17
+ 'tab-navigation',
18
+ 'onboarding-guide'
19
+ ]);
20
+ for (const b of EVAL_BRIEFS) {
21
+ if (b.pattern) expect(known, `${b.id} → ${b.pattern}`).toContain(b.pattern);
22
+ }
23
+ });
24
+ it('looks up by id', () => {
25
+ expect(getBriefById('ops-dashboard')?.title).toBe('Ops Dashboard');
26
+ expect(getBriefById('nope')).toBeUndefined();
27
+ });
28
+ });
29
+
30
+ describe('scoreImplementation', () => {
31
+ it('scores clean code 100 with no findings', () => {
32
+ const s = scoreImplementation('<div class="bg-surface-base text-text-primary">ok</div>');
33
+ expect(s.score).toBe(100);
34
+ expect(s.errors).toBe(0);
35
+ });
36
+ it('penalises hallucinated tokens and raw colours', () => {
37
+ const s = scoreImplementation('<div class="bg-blue-500 text-status-bad">x</div>');
38
+ expect(s.score).toBeLessThan(100);
39
+ expect(s.errors + s.warnings).toBeGreaterThan(0);
40
+ });
41
+ });
42
+
43
+ describe('aggregateRubric', () => {
44
+ const full = Object.fromEntries(RUBRIC_CRITERIA.map((c) => [c.id, 4]));
45
+
46
+ it('sums a complete 1–5 score set', () => {
47
+ expect(aggregateRubric(full)).toBe(RUBRIC_CRITERIA.length * 4);
48
+ });
49
+ it('throws on a missing, out-of-range, or non-finite criterion', () => {
50
+ expect(() => aggregateRubric({ ...full, correctness: 7 })).toThrow();
51
+ expect(() => aggregateRubric({ ...full, correctness: NaN })).toThrow();
52
+ const { correctness, ...missing } = full;
53
+ void correctness;
54
+ expect(() => aggregateRubric(missing)).toThrow();
55
+ });
56
+ });
57
+
58
+ describe('formatAbReport edge cases', () => {
59
+ it('reports a clean-baseline error delta as "—", not a false percentage', () => {
60
+ const entries: EvalEntry[] = [
61
+ {
62
+ briefId: 'x',
63
+ condition: 'baseline',
64
+ score: { linter: { score: 100, errors: 0, warnings: 0, infos: 0 } }
65
+ },
66
+ {
67
+ briefId: 'x',
68
+ condition: 'design-mcp',
69
+ score: { linter: { score: 100, errors: 0, warnings: 0, infos: 0 } }
70
+ }
71
+ ];
72
+ const report = formatAbReport(entries, 'baseline', 'design-mcp');
73
+ expect(report).not.toContain('-100.0%');
74
+ });
75
+ });
76
+
77
+ describe('formatAbReport', () => {
78
+ const entries: EvalEntry[] = [
79
+ {
80
+ briefId: 'a',
81
+ condition: 'baseline',
82
+ score: { linter: { score: 70, errors: 1, warnings: 2, infos: 0 }, rubricTotal: 22 }
83
+ },
84
+ {
85
+ briefId: 'a',
86
+ condition: 'design-mcp',
87
+ score: { linter: { score: 95, errors: 0, warnings: 0, infos: 1 }, rubricTotal: 31 }
88
+ }
89
+ ];
90
+ const report = formatAbReport(entries, 'baseline', 'design-mcp');
91
+
92
+ it('renders per-brief and aggregate sections with a delta', () => {
93
+ expect(report).toContain('## Per-brief');
94
+ expect(report).toContain('## Aggregate');
95
+ expect(report).toContain('baseline');
96
+ expect(report).toContain('design-mcp');
97
+ expect(report).toContain('%'); // a percentage delta is present
98
+ });
99
+ });
@@ -0,0 +1,11 @@
1
+ /**
2
+ * Public API of the eval-suite (docs/DESIGN-MCP.md, cross-cutting eval). The
3
+ * briefs and the deterministic scorer are reusable across runs; the LLM
4
+ * generation and rubric judging are injected by whoever drives a run (an agent
5
+ * harness this round, an API loop later) so the suite stays automation-friendly.
6
+ */
7
+
8
+ export type { EvalBrief } from './briefs.js';
9
+ export { EVAL_BRIEFS, getBriefById } from './briefs.js';
10
+ export type { EvalEntry, ImplementationScore, LinterScore } from './score.js';
11
+ export { aggregateRubric, formatAbReport, scoreImplementation } from './score.js';
@@ -0,0 +1,112 @@
1
+ /**
2
+ * Scoring + A/B reporting for the eval-suite. The linter half is deterministic
3
+ * (no LLM): `scoreImplementation` runs the design-linter and returns the score
4
+ * and finding counts. The rubric half is judge-supplied (1–5 per criterion);
5
+ * `aggregateRubric` validates and sums it. `formatAbReport` renders the
6
+ * baseline-vs-treatment comparison in the shape of experiment/round-2's summary
7
+ * tables, so a new run is directly comparable to that +33.8% baseline.
8
+ */
9
+
10
+ import { lintDesign } from '../design-linter/index.js';
11
+ import { MAX_RUBRIC_SCORE, RUBRIC_CRITERIA } from '../design-rubric/rubric.js';
12
+
13
+ export interface LinterScore {
14
+ /** 0–100 deterministic design-linter score. */
15
+ score: number;
16
+ errors: number;
17
+ warnings: number;
18
+ infos: number;
19
+ }
20
+
21
+ export interface ImplementationScore {
22
+ linter: LinterScore;
23
+ /** Judge rubric total /MAX_RUBRIC_SCORE, when a rubric pass was run. */
24
+ rubricTotal?: number;
25
+ }
26
+
27
+ export interface EvalEntry {
28
+ briefId: string;
29
+ /** e.g. "baseline" or "design-mcp". */
30
+ condition: string;
31
+ score: ImplementationScore;
32
+ }
33
+
34
/**
 * Run the design-linter over one generated implementation and flatten its result
 * into a `LinterScore`: the overall score plus the error / warning / info counts.
 */
export function scoreImplementation(code: string): LinterScore {
	const { score, counts } = lintDesign(code);
	return {
		score,
		errors: counts.error,
		warnings: counts.warning,
		infos: counts.info
	};
}
44
+
45
/**
 * Validate a judge's per-criterion scores and add them up.
 *
 * Every id in `RUBRIC_CRITERIA` must map to a finite number between 1 and 5
 * inclusive. Keys that match no criterion are never read, so they are ignored.
 *
 * @param perCriterion - scores keyed by criterion id
 * @returns the sum over all criteria (at most `MAX_RUBRIC_SCORE`)
 * @throws Error at the first criterion whose score is missing, not a number,
 *   non-finite, or outside 1–5
 */
export function aggregateRubric(perCriterion: Record<string, number>): number {
	return RUBRIC_CRITERIA.reduce((sum, criterion) => {
		const value = perCriterion[criterion.id];
		if (typeof value !== 'number' || !Number.isFinite(value) || value < 1 || value > 5) {
			throw new Error(`Rubric score for "${criterion.id}" must be a finite 1–5, got ${value}`);
		}
		return sum + value;
	}, 0);
}
57
+
58
/** Arithmetic mean of `xs`; an empty list yields 0 rather than NaN. */
function mean(xs: number[]): number {
	if (xs.length === 0) {
		return 0;
	}
	let sum = 0;
	for (const x of xs) {
		sum += x;
	}
	return sum / xs.length;
}
61
+
62
/**
 * Relative change from `from` to `to`, as a percentage with one decimal place.
 * Non-negative changes carry an explicit "+" (e.g. "+35.7%"). A zero `from` has
 * no defined relative change and yields "—".
 */
function pct(from: number, to: number): string {
	if (from === 0) {
		return '—';
	}
	const change = ((to - from) / from) * 100;
	const sign = change >= 0 ? '+' : '';
	return sign + change.toFixed(1) + '%';
}
67
+
68
/**
 * Render a baseline-vs-treatment A/B report as Markdown.
 *
 * The report has a per-brief table (linter score and rubric total under each
 * condition) followed by an aggregate table: mean linter score, mean rubric
 * total (only when both conditions have at least one rubric total), and total
 * linter errors+warnings.
 *
 * A cell is rendered as "—" whenever there is nothing to report: a brief that is
 * missing under one condition, a condition with no entries at all, or a delta
 * whose baseline value is zero. This keeps an absent condition from showing up
 * as a fabricated `0.0` mean / `0` findings and a false `-100.0%` delta.
 *
 * @param entries - scored implementations; when several share a brief and a
 *   condition, the first one is used for the per-brief row and the means
 * @param baseline - condition label used as the "from" side of every delta
 * @param treatment - condition label used as the "to" side of every delta
 * @returns the Markdown report
 */
export function formatAbReport(entries: EvalEntry[], baseline: string, treatment: string): string {
	const briefIds = [...new Set(entries.map((e) => e.briefId))];
	const pick = (briefId: string, condition: string) =>
		entries.find((e) => e.briefId === briefId && e.condition === condition)?.score;
	const rub = (s?: ImplementationScore) =>
		s?.rubricTotal !== undefined ? `${s.rubricTotal}/${MAX_RUBRIC_SCORE}` : '—';
	// An empty sample has neither a mean nor a delta; show "—" rather than mean()'s 0 fallback.
	const meanCell = (xs: number[]) => (xs.length === 0 ? '—' : mean(xs).toFixed(1));
	const deltaCell = (from: number[], to: number[]) =>
		from.length === 0 || to.length === 0 ? '—' : pct(mean(from), mean(to));

	let md = `# Eval A/B — ${baseline} vs ${treatment}\n\n`;
	md += '## Per-brief\n\n';
	md += `| Brief | ${baseline} linter | ${treatment} linter | ${baseline} rubric | ${treatment} rubric |\n`;
	md += '|---|---|---|---|---|\n';

	const baseLint: number[] = [];
	const treatLint: number[] = [];
	const baseRub: number[] = [];
	const treatRub: number[] = [];

	for (const id of briefIds) {
		const b = pick(id, baseline);
		const t = pick(id, treatment);
		if (b) baseLint.push(b.linter.score);
		if (t) treatLint.push(t.linter.score);
		if (b?.rubricTotal !== undefined) baseRub.push(b.rubricTotal);
		if (t?.rubricTotal !== undefined) treatRub.push(t.rubricTotal);
		md += `| ${id} | ${b?.linter.score ?? '—'} | ${t?.linter.score ?? '—'} | ${rub(b)} | ${rub(t)} |\n`;
	}

	md += '\n## Aggregate\n\n';
	md += `| Metric | ${baseline} | ${treatment} | Δ |\n|---|---|---|---|\n`;
	md += `| Mean linter score | ${meanCell(baseLint)} | ${meanCell(treatLint)} | ${deltaCell(baseLint, treatLint)} |\n`;
	if (baseRub.length && treatRub.length) {
		md += `| Mean rubric /${MAX_RUBRIC_SCORE} | ${meanCell(baseRub)} | ${meanCell(treatRub)} | ${deltaCell(baseRub, treatRub)} |\n`;
	}

	// Findings are summed over every entry of a condition (not only the first per
	// brief); `undefined` marks a condition that has no entries at all.
	const totalFindings = (cond: string): number | undefined => {
		const own = entries.filter((e) => e.condition === cond);
		if (own.length === 0) return undefined;
		return own.reduce((a, e) => a + e.score.linter.errors + e.score.linter.warnings, 0);
	};
	const baseFindings = totalFindings(baseline);
	const treatFindings = totalFindings(treatment);
	const findingsDelta =
		baseFindings === undefined || treatFindings === undefined
			? '—'
			: pct(baseFindings, treatFindings);
	md += `| Total linter errors+warnings | ${baseFindings ?? '—'} | ${treatFindings ?? '—'} | ${findingsDelta} |\n`;

	return md;
}
package/src/index.ts ADDED
@@ -0,0 +1,75 @@
1
+ #!/usr/bin/env node
2
+
3
+ import { loadCatalog } from './data/catalog-loader.js';
4
+ import { loadPatterns, loadPrinciples } from './data/design-system-loader.js';
5
+ import { loadRecipes } from './data/recipe-loader.js';
6
+ import { loadTemplateSections } from './data/template-loader.js';
7
+ import { createServer } from './server.js';
8
+ import { startHttpTransport } from './transports/http.js';
9
+ import { startStdioTransport } from './transports/stdio.js';
10
+
11
+ interface CliArgs {
12
+ transport: 'stdio' | 'http';
13
+ port: number;
14
+ dataDir?: string;
15
+ }
16
+
17
/**
 * Parse the server's command-line flags.
 *
 * Recognised flags, each taking the following argument as its value:
 * - `--transport <stdio|http>` (default `stdio`)
 * - `--port <n>` (default `3001`), an integer in 0–65535
 * - `--data-dir <path>` (no default)
 *
 * Unrecognised arguments and flags without a value are ignored. An unsupported
 * transport or an unusable port keeps the default and prints a warning to
 * stderr, so a typo can no longer store `NaN` as the port or be dropped silently.
 *
 * @param args - the arguments after the script path (e.g. `process.argv.slice(2)`)
 * @returns the resolved options
 */
function parseArgs(args: string[]): CliArgs {
	const result: CliArgs = {
		transport: 'stdio',
		port: 3001
	};

	for (let i = 0; i < args.length; i++) {
		const arg = args[i];
		const next = args[i + 1];

		if (arg === '--transport' && next) {
			if (next === 'stdio' || next === 'http') {
				result.transport = next;
			} else {
				console.error(
					`Warning: Unsupported --transport "${next}", keeping "${result.transport}"`
				);
			}
			i++;
		} else if (arg === '--port' && next) {
			const port = Number.parseInt(next, 10);
			// parseInt yields NaN for non-numeric input; isInteger rejects that too.
			if (Number.isInteger(port) && port >= 0 && port <= 65535) {
				result.port = port;
			} else {
				console.error(`Warning: Invalid --port "${next}", keeping ${result.port}`);
			}
			i++;
		} else if (arg === '--data-dir' && next) {
			result.dataDir = next;
			i++;
		}
	}

	return result;
}
43
+
44
/**
 * CLI entry point: parse the flags, export `--data-dir` as `DATA_DIR`, pre-load
 * the data sets, then start the requested transport.
 *
 * Pre-loading is best-effort: a loader failure is logged as a warning and the
 * server still starts.
 */
async function main(): Promise<void> {
	const args = parseArgs(process.argv.slice(2));

	if (args.dataDir) {
		process.env.DATA_DIR = args.dataDir;
	}

	// Pre-load cached data. allSettled waits for every loader and lets each
	// failure be reported; Promise.all would give up at the first rejection and
	// start the transport while the remaining loads were still in flight.
	try {
		const results = await Promise.allSettled([
			loadCatalog(),
			loadTemplateSections(),
			loadRecipes(),
			loadPrinciples(),
			loadPatterns()
		]);
		for (const result of results) {
			if (result.status === 'rejected') {
				console.error('Warning: Failed to pre-load some data:', result.reason);
			}
		}
	} catch (err) {
		// Only reachable if a loader throws synchronously instead of rejecting.
		console.error('Warning: Failed to pre-load some data:', err);
	}

	if (args.transport === 'http') {
		await startHttpTransport(args.port);
	} else {
		const server = createServer();
		await startStdioTransport(server);
	}
}
71
+
72
+ main().catch((err) => {
73
+ console.error('Fatal error:', err);
74
+ process.exit(1);
75
+ });
@@ -0,0 +1,51 @@
1
+ import { describe, expect, it } from 'vitest';
2
+ import { designPagePrompt, redesignPrompt, variantCount } from './design-prompts.js';
3
+
4
+ describe('variantCount', () => {
5
+ it('defaults to 3 for missing or non-numeric input', () => {
6
+ expect(variantCount(undefined)).toBe(3);
7
+ expect(variantCount('abc')).toBe(3);
8
+ });
9
+ it('clamps to the 2–5 range', () => {
10
+ expect(variantCount('1')).toBe(2);
11
+ expect(variantCount('9')).toBe(5);
12
+ expect(variantCount('4')).toBe(4);
13
+ });
14
+ });
15
+
16
+ describe('designPagePrompt', () => {
17
+ it('embeds the brief and drives the full loop in order', () => {
18
+ const p = designPagePrompt('a billing settings page', undefined, '4');
19
+ expect(p).toContain('a billing settings page');
20
+ for (const tool of [
21
+ 'get_design_context',
22
+ 'get_design_principles',
23
+ 'validate_design',
24
+ 'get_design_principles(as="rubric")',
25
+ 'sync_design_manifest'
26
+ ]) {
27
+ expect(p, tool).toContain(tool);
28
+ }
29
+ expect(p).toContain('Generate 4 variants');
30
+ });
31
+
32
+ it('pins a specific pattern when given one', () => {
33
+ expect(designPagePrompt('x', 'dashboard', undefined)).toContain('get_pattern("dashboard")');
34
+ });
35
+ it('offers pattern discovery when none is given', () => {
36
+ expect(designPagePrompt('x', undefined, undefined)).toContain('if a composition pattern fits');
37
+ });
38
+ });
39
+
40
+ describe('redesignPrompt', () => {
41
+ it('is diagnosis-first and preserves behaviour', () => {
42
+ const p = redesignPrompt('the dashboard feels flat', undefined, undefined);
43
+ expect(p).toContain('Diagnose');
44
+ expect(p).toContain('validate_design');
45
+ expect(p).toContain('two lowest-scoring criteria');
46
+ expect(p).toContain('read the current implementation');
47
+ });
48
+ it('inlines provided code', () => {
49
+ expect(redesignPrompt('x', '<div>old</div>', undefined)).toContain('<div>old</div>');
50
+ });
51
+ });