@adia-ai/a2ui-validator 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +36 -0
- package/README.md +54 -0
- package/catalog-validator.js +162 -0
- package/index.js +11 -0
- package/package.json +35 -0
- package/semantic/cache.js +54 -0
- package/semantic/index.js +163 -0
- package/semantic/judge.js +180 -0
- package/validator.js +1074 -0
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* LLM judge for semantic A2UI validation (Phase 1: Anthropic only).
|
|
3
|
+
*
|
|
4
|
+
* Implements the rubric from docs/specs/semantic-validator.md §4 (recommendation
|
|
5
|
+
* is pure 3a LLM-judge as first phase). Provider-agnostic shape per §5.3;
|
|
6
|
+
* only Anthropic wired here.
|
|
7
|
+
*
|
|
8
|
+
* Contract:
|
|
9
|
+
* callJudge({ intent, componentSummary, rubricVersion, timeoutMs, model }) →
|
|
10
|
+
* { raw, parsed: SemanticVerdict, usage, latencyMs, provider, model, rubricVersion }
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
// Identifier of the rubric revision; returned by getRubric() and used as the
// default `rubricVersion` that callJudge() echoes back in its result.
const RUBRIC_VERSION = 'v1';
|
|
14
|
+
|
|
15
|
+
// Judge rubric, sent verbatim as the `system` prompt by callJudge(). It defines
// the three scoring axes, the verdict thresholds, the weighted final score and
// the exact JSON shape the model must return.
// NOTE(review): presumably any edit to this text should be accompanied by a
// RUBRIC_VERSION bump so previously stored verdicts are not mixed with new
// ones — confirm against the cache keying.
const RUBRIC_TEXT = `You are a senior UI reviewer scoring whether an emitted UI actually
|
|
16
|
+
satisfies a user's intent. You are NOT scoring visual polish or copy.
|
|
17
|
+
You are scoring intent/output alignment on three axes.
|
|
18
|
+
|
|
19
|
+
Score each axis 0-100. Cite evidence from the component summary.
|
|
20
|
+
|
|
21
|
+
Axes:
|
|
22
|
+
1. dominantPattern: does the top-level pattern match what the intent asked
|
|
23
|
+
for? (form | data-display | navigation | marketing | feedback | settings
|
|
24
|
+
| chat | media | dashboard | detail | list | auth | other). A pricing
|
|
25
|
+
card in response to "signup form" is a dominant-pattern mismatch — score
|
|
26
|
+
very low (<15) regardless of filler components.
|
|
27
|
+
2. requiredCapabilities: does the output contain the specific controls /
|
|
28
|
+
affordances the intent requires? (e.g. "login form" requires Input(email
|
|
29
|
+
or username) + Input(password) + Button(submit); "users table" requires
|
|
30
|
+
a Table or tabular Grid with rows). List what was expected and what was
|
|
31
|
+
missing.
|
|
32
|
+
3. forbiddenNoise: are there prominent components that are off-topic for
|
|
33
|
+
this intent? (e.g. pricing tiers in a signup form, avatars in a data
|
|
34
|
+
table). 100 = nothing off-topic; drop for each noisy element.
|
|
35
|
+
|
|
36
|
+
Overall verdict:
|
|
37
|
+
aligned — all three axes ≥ 75
|
|
38
|
+
partial — one axis < 60 OR overall score 40-74
|
|
39
|
+
misaligned — dominantPattern < 40 OR two axes < 60
|
|
40
|
+
off-topic — dominantPattern < 15
|
|
41
|
+
|
|
42
|
+
Final score = round(0.5*dominantPattern + 0.35*requiredCapabilities + 0.15*forbiddenNoise).
|
|
43
|
+
|
|
44
|
+
Return ONLY valid JSON, no prose, no markdown fences:
|
|
45
|
+
{
|
|
46
|
+
"axes": {
|
|
47
|
+
"dominantPattern": { "expected": "<string>", "observed": "<string>", "score": <0-100>, "evidence": "<short string>" },
|
|
48
|
+
"requiredCapabilities": { "expected": ["..."], "missing": ["..."], "score": <0-100> },
|
|
49
|
+
"forbiddenNoise": { "observed": ["..."], "score": <0-100> }
|
|
50
|
+
},
|
|
51
|
+
"verdict": "aligned" | "partial" | "misaligned" | "off-topic",
|
|
52
|
+
"score": <0-100>,
|
|
53
|
+
"rationale": "<<= 240 chars>",
|
|
54
|
+
"evidence": ["<component id or short quote>", ...]
|
|
55
|
+
}`;
|
|
56
|
+
|
|
57
|
+
/**
 * Expose the rubric currently used by the judge.
 *
 * @returns {{ version: string, text: string }} The rubric version identifier
 *   together with the full rubric prompt text.
 */
export function getRubric() {
  const version = RUBRIC_VERSION;
  const text = RUBRIC_TEXT;
  return { version, text };
}
|
|
60
|
+
|
|
61
|
+
/**
 * Summarize emitted A2UI messages into a compact human-readable string
 * (§5.4) suitable for inclusion in the judge prompt.
 *
 * Two message shapes are recognised:
 *   - monolithic: a message with a `components` array; each entry is read as
 *     `{ component, props? }` (when `props` is absent the entry itself is
 *     searched for label fields);
 *   - zettel: a flat stream of messages carrying `componentType`; messages
 *     whose `messageType` is 'endComponent' are skipped so that a begin/end
 *     pair is counted once.
 *
 * The reported root is simply the first component encountered. Up to three
 * sample labels (each truncated to 32 UTF-16 code units) are kept per
 * component type, taken from the first of `label`, `text`, `title`,
 * `placeholder` that holds a non-empty string.
 *
 * @param {unknown} messages - Array of A2UI messages; any non-array value is
 *   treated as an empty list.
 * @returns {string} Three lines: `Root: …`, `Total components: …` and
 *   `Breakdown: …` (types ordered by descending count).
 */
export function summarizeA2UI(messages) {
  const LABEL_FIELDS = ['label', 'text', 'title', 'placeholder'];
  const MAX_LABELS_PER_TYPE = 3;
  const MAX_LABEL_LENGTH = 32;

  const msgs = Array.isArray(messages) ? messages : [];
  const counts = new Map();
  const samples = new Map(); // component type -> [labels]
  let rootType = null;
  let totalComponents = 0;

  // Pick the first label-like field that is a usable string. A non-string
  // value (e.g. a data-binding object) must not hide a later string field.
  const pickLabel = (props) => {
    for (const field of LABEL_FIELDS) {
      const value = props?.[field];
      if (typeof value === 'string' && value !== '') return value;
    }
    return null;
  };

  const record = (type, props) => {
    if (!type) return;
    totalComponents += 1;
    counts.set(type, (counts.get(type) ?? 0) + 1);
    if (rootType === null) rootType = type;

    const label = pickLabel(props);
    if (label === null) return;
    const labels = samples.get(type) ?? [];
    if (labels.length < MAX_LABELS_PER_TYPE) labels.push(label.slice(0, MAX_LABEL_LENGTH));
    samples.set(type, labels);
  };

  for (const m of msgs) {
    if (Array.isArray(m?.components)) {
      // Tolerate null/undefined holes in the component list instead of throwing.
      for (const c of m.components) record(c?.component, c?.props || c);
    } else if (m?.componentType && m?.messageType !== 'endComponent') {
      record(m.componentType, m.props || m);
    }
  }

  // Most frequent types first; Array.prototype.sort is stable, so ties keep
  // first-seen order.
  const parts = [...counts.entries()]
    .sort((a, b) => b[1] - a[1])
    .map(([type, n]) => {
      const labels = samples.get(type);
      const suffix = labels && labels.length
        ? ` (labels: ${labels.map((x) => JSON.stringify(x)).join(', ')})`
        : '';
      return `${type}×${n}${suffix}`;
    });

  return `Root: ${rootType || 'none'}\nTotal components: ${totalComponents}\nBreakdown: ${parts.join('; ') || '(empty)'}`;
}
|
|
102
|
+
|
|
103
|
+
/**
 * Parse a JSON object out of a model reply that may be wrapped in prose or
 * markdown fences.
 *
 * Strategy, in order:
 *   1. If the trimmed reply starts with `{` and parses as a whole, return it.
 *      This runs before fence stripping so that a valid reply whose string
 *      values contain ``` is not mangled.
 *   2. Otherwise strip a markdown fence if one is present.
 *   3. Scan for the first brace-balanced top-level `{ ... }` block that
 *      parses; quoted strings are honoured so braces inside values do not
 *      confuse the scan, and trailing prose containing `}` is ignored.
 *   4. As a last resort parse from the first `{` to the last `}` so the
 *      caller receives the native JSON.parse error.
 *
 * @param {unknown} raw - Raw reply text.
 * @returns {any} The parsed JSON value.
 * @throws {Error} 'empty judge response' when `raw` is falsy, or
 *   'no JSON object in judge response' when no `{ ... }` span exists.
 * @throws {SyntaxError} When a span exists but is not valid JSON.
 */
function parseJSONLoose(raw) {
  if (!raw) throw new Error('empty judge response');
  let text = String(raw).trim();

  if (text.startsWith('{')) {
    try {
      return JSON.parse(text);
    } catch {
      // Not pure JSON (trailing prose, truncation, …); try the lenient paths.
    }
  }

  // Strip markdown fences if present.
  const fence = text.match(/```(?:json)?\s*([\s\S]*?)\s*```/);
  if (fence) text = fence[1].trim();

  const first = text.indexOf('{');
  const last = text.lastIndexOf('}');
  if (first < 0 || last < 0 || last <= first) throw new Error('no JSON object in judge response');

  // Try each top-level balanced block in turn. An unbalanced block means the
  // reply was cut off, so stop rather than descend into nested fragments
  // (returning an inner object as if it were the verdict would be wrong).
  let start = first;
  while (start >= 0) {
    const end = findBalancedObjectEnd(text, start);
    if (end < 0) break;
    try {
      return JSON.parse(text.slice(start, end + 1));
    } catch {
      // This block was prose that merely looked like an object; move past it.
      start = text.indexOf('{', end + 1);
    }
  }

  // Greedy fallback: surfaces the JSON.parse SyntaxError to the caller.
  return JSON.parse(text.slice(first, last + 1));
}

/**
 * Find the index of the `}` that closes the `{` at `start`, ignoring braces
 * that appear inside double-quoted strings. Returns -1 when unbalanced.
 */
function findBalancedObjectEnd(text, start) {
  let depth = 0;
  let inString = false;
  let escaped = false;
  for (let i = start; i < text.length; i += 1) {
    const ch = text[i];
    if (inString) {
      if (escaped) escaped = false;
      else if (ch === '\\') escaped = true;
      else if (ch === '"') inString = false;
    } else if (ch === '"') {
      inString = true;
    } else if (ch === '{') {
      depth += 1;
    } else if (ch === '}') {
      depth -= 1;
      if (depth === 0) return i;
    }
  }
  return -1;
}
|
|
116
|
+
|
|
117
|
+
/**
 * Call the LLM judge. Anthropic-only in Phase 1.
 *
 * Sends the rubric as the system prompt and the intent plus component summary
 * as a single user message to the Anthropic Messages endpoint, then parses the
 * reply text as a verdict object.
 *
 * @param {object} [options]
 * @param {string} options.intent - User intent; interpolated verbatim into the prompt.
 * @param {string} options.componentSummary - Text summary of the emitted UI;
 *   interpolated verbatim into the prompt.
 * @param {string} [options.rubricVersion] - Echoed back in the result only; it
 *   does not select a different rubric text.
 * @param {number} [options.timeoutMs=15000] - Abort budget covering both the
 *   request and reading the response body.
 * @param {string} [options.model] - Anthropic model identifier.
 * @returns {Promise<{ raw: string, parsed: any,
 *   usage: { inputTokens: number, outputTokens: number },
 *   latencyMs: number, provider: string, model: string, rubricVersion: string }>}
 *   `latencyMs` is the time until the fetch promise resolved (response
 *   headers received).
 * @throws {Error} When ANTHROPIC_API_KEY is unset, on a non-2xx response, or
 *   when the reply contains no parseable JSON object. The fetch abort error
 *   propagates when the timeout fires.
 */
export async function callJudge({
  intent,
  componentSummary,
  rubricVersion = RUBRIC_VERSION,
  timeoutMs = 15000,
  model = 'claude-haiku-4-5-20251001',
} = {}) {
  const key = process.env.ANTHROPIC_API_KEY;
  if (!key) throw new Error('ANTHROPIC_API_KEY missing — cannot call semantic judge');

  const userBlock = `Intent: ${intent}\n\nEmitted UI summary:\n${componentSummary}\n\nRespond with JSON only.`;

  const ac = new AbortController();
  const timer = setTimeout(() => ac.abort(), timeoutMs);
  const started = Date.now();
  let data;
  let latencyMs;
  try {
    const res = await fetch('https://api.anthropic.com/v1/messages', {
      method: 'POST',
      headers: {
        'content-type': 'application/json',
        'x-api-key': key,
        'anthropic-version': '2023-06-01',
      },
      body: JSON.stringify({
        model,
        max_tokens: 1024,
        temperature: 0,
        system: RUBRIC_TEXT,
        messages: [{ role: 'user', content: userBlock }],
      }),
      signal: ac.signal,
    });
    latencyMs = Date.now() - started;

    if (!res.ok) {
      const body = await res.text().catch(() => '');
      throw new Error(`Anthropic judge ${res.status}: ${body.slice(0, 200)}`);
    }
    // Read the body while the abort timer is still armed; otherwise a stalled
    // body stream could hang past timeoutMs.
    data = await res.json();
  } finally {
    clearTimeout(timer);
  }

  // Concatenate every block that carries text rather than assuming the first
  // content block is the text one.
  const blocks = Array.isArray(data?.content) ? data.content : [];
  const raw = blocks
    .map((block) => block?.text)
    .filter((text) => typeof text === 'string')
    .join('');
  const parsed = parseJSONLoose(raw);

  return {
    raw,
    parsed,
    usage: {
      inputTokens: data?.usage?.input_tokens ?? 0,
      outputTokens: data?.usage?.output_tokens ?? 0,
    },
    latencyMs,
    provider: 'anthropic',
    model,
    rubricVersion,
  };
}
|