@adia-ai/a2ui-mcp 0.0.5 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,113 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * Compare two eval-diff run artifacts (JSON) and surface verdict-distribution
4
+ * + pass-rate deltas. Used to verify "no unexplained regressions" between
5
+ * baseline and a candidate gating-mode run, per Phase 2 exit criteria of
6
+ * `docs/specs/semantic-validator.md`.
7
+ *
8
+ * Usage:
9
+ * node packages/a2ui/mcp/scripts/semantic-stats.mjs <baseline.json> <candidate.json>
10
+ *
11
+ * Examples:
12
+ * # Compare structural-only baseline to combined-gating candidate
13
+ * node packages/a2ui/mcp/scripts/semantic-stats.mjs \
14
+ * evals/mcp/runs/<baseline-stamp>/zettel.json \
15
+ * evals/mcp/runs/<candidate-stamp>/zettel.json
16
+ *
17
+ * The script is read-only; it writes nothing to disk. Output is markdown
18
+ * to stdout — pipe into a file or a journal entry as needed.
19
+ */
20
+ import { readFile } from 'node:fs/promises';
21
+
22
// Exactly two positional arguments are required: the baseline artifact and
// the candidate artifact. Anything else prints usage and exits with code 2.
const cliArgs = process.argv.slice(2);
if (cliArgs.length !== 2) {
  console.error('Usage: semantic-stats.mjs <baseline.json> <candidate.json>');
  process.exit(2);
}

const [basePath, candPath] = cliArgs;

// Read one run artifact from disk and parse it as JSON.
const readRun = async (path) => JSON.parse(await readFile(path, 'utf8'));

// Loaded one after the other, baseline first.
const base = await readRun(basePath);
const cand = await readRun(candPath);
31
+
32
/**
 * Render a value for a table cell.
 *
 * @param {*} v - any value.
 * @returns {string} an em dash for null/undefined, otherwise `String(v)`.
 */
const fmt = (v) => {
  if (v === null || v === undefined) return '—';
  return String(v);
};
33
/**
 * Format the signed difference `b - a`, rounded to one decimal place.
 *
 * @param {*} a - baseline value.
 * @param {*} b - candidate value.
 * @returns {string} `'—'` when either side is null/undefined or the
 *   difference is not a finite number; otherwise the rounded difference,
 *   prefixed with `+` when positive.
 */
const delta = (a, b) => {
  if (a == null || b == null) return '—';
  const d = Math.round((b - a) * 10) / 10;
  // A non-numeric metric would otherwise surface as the literal "NaN" in the
  // table; use the same placeholder as a missing value instead.
  if (!Number.isFinite(d)) return '—';
  return d > 0 ? `+${d}` : String(d);
};
38
+
39
// Report header: which two artifacts are being compared.
console.log(`# Semantic stats — baseline vs candidate\n`);
console.log(`- Baseline: \`${basePath}\` (label=${fmt(base.label)})`);
console.log(`- Candidate: \`${candPath}\` (label=${fmt(cand.label)})\n`);

console.log(`## Aggregate deltas\n`);
console.log(`| metric | baseline | candidate | delta |`);
console.log(`|---|---:|---:|---:|`);

// Print one metric row. Raw values go through fmt so a metric missing from an
// artifact renders as '—' (matching the delta column) rather than "undefined".
const metricRow = (label, a, b) => {
  console.log(`| ${label} | ${fmt(a)} | ${fmt(b)} | ${delta(a, b)} |`);
};

metricRow('total', base.total, cand.total);
metricRow('coverage %', base.coverage, cand.coverage);
// "emitted" shows emitted/total in the value cells but the delta is on emitted alone.
console.log(`| emitted | ${fmt(base.emitted)}/${fmt(base.total)} | ${fmt(cand.emitted)}/${fmt(cand.total)} | ${delta(base.emitted, cand.emitted)} |`);
metricRow('avgScoreWhenEmitted', base.avgScoreWhenEmitted, cand.avgScoreWhenEmitted);
metricRow('avgF1WhenEmitted', base.avgF1WhenEmitted, cand.avgF1WhenEmitted);
metricRow('pass rate %', base.passRate, cand.passRate);

// Only present when the candidate artifact carries a structural-only pass rate;
// it is compared against the baseline's overall pass rate.
if (cand.passRateStructural != null) {
  metricRow('pass rate % (structural-only on candidate)', base.passRate, cand.passRateStructural);
}

// Semantic averages are shown as soon as either run has a `semantic` section.
if (base.semantic || cand.semantic) {
  metricRow('avgSemanticScore', base.semantic?.avgSemanticScore, cand.semantic?.avgSemanticScore);
  metricRow('avgCombinedScore', base.semantic?.avgCombinedScore, cand.semantic?.avgCombinedScore);
}
console.log();
62
+
63
+ // Verdict distribution (when --semantic was used in either run)
64
/**
 * Pull the verdict breakdown out of a run artifact.
 *
 * @param {object} run - parsed run artifact.
 * @returns {*} `run.semantic.verdictBreakdown` when it is truthy, else null.
 */
function verdicts(run) {
  const breakdown = run.semantic?.verdictBreakdown;
  if (breakdown) return breakdown;
  return null;
}
67
const baseVerdicts = verdicts(base);
const candVerdicts = verdicts(cand);

// The section is printed when at least one run has a verdict breakdown.
if (baseVerdicts || candVerdicts) {
  console.log(`## Verdict distribution\n`);

  // Union of verdict names seen in either run; a verdict absent from one side counts as 0.
  const verdictNames = new Set([
    ...Object.keys(baseVerdicts ?? {}),
    ...Object.keys(candVerdicts ?? {}),
  ]);

  console.log(`| verdict | baseline | candidate | delta |`);
  console.log(`|---|---:|---:|---:|`);
  Array.from(verdictNames).sort().forEach((name) => {
    const before = baseVerdicts?.[name] ?? 0;
    const after = candVerdicts?.[name] ?? 0;
    console.log(`| ${name} | ${fmt(before)} | ${fmt(after)} | ${delta(before, after)} |`);
  });
  console.log();
}
79
+
80
+ // Per-row pass-flip surface — which intents flipped pass/fail between the two runs?
81
/**
 * Index a run's result rows by their `id`.
 *
 * @param {object} run - parsed run artifact; a missing/falsy `results` is treated as empty.
 * @returns {Map<*, object>} id → row, in row order; a later row with the same id replaces an earlier one.
 */
function indexById(run) {
  const rows = run.results || [];
  return rows.reduce((byId, row) => byId.set(row.id, row), new Map());
}
84
const baseById = indexById(base);
const candById = indexById(cand);
const flips = { pass_to_fail: [], fail_to_pass: [] };

// Build the record kept for one flipped row: identity and score come from the
// baseline row, combined/semantic score and verdict from the candidate row.
const describeFlip = (id, before, after) => ({
  id,
  intent: before.intent,
  baseScore: before.validationScore,
  candCombined: after.combinedScore,
  candSemantic: after.semanticScore,
  candVerdict: after.semanticVerdict,
});

// Walk the baseline rows; rows with no candidate counterpart are skipped.
for (const [id, before] of baseById) {
  const after = candById.get(id);
  if (!after) continue;

  const passedBefore = Boolean(before.pass);
  const passedAfter = Boolean(after.pass);
  if (passedBefore === passedAfter) continue;

  const bucket = passedBefore ? flips.pass_to_fail : flips.fail_to_pass;
  bucket.push(describeFlip(id, before, after));
}

console.log(`## Pass-flip diagnostics\n`);
console.log(`- pass → fail (regressions to investigate): **${flips.pass_to_fail.length}**`);
console.log(`- fail → pass (improvements): **${flips.fail_to_pass.length}**\n`);
97
+
98
/**
 * Print one pass-flip table as markdown to stdout.
 *
 * Prints nothing when `rows` is empty. At most the first 20 rows are listed;
 * any remainder is summarised in a trailing "more rows omitted" note.
 *
 * @param {Array<object>} rows - flip records with `intent`, `baseScore`,
 *   `candCombined`, `candSemantic` and `candVerdict`.
 * @param {string} header - section title, rendered as a level-3 heading.
 * @returns {void}
 */
function flipTable(rows, header) {
  if (!rows.length) return;

  const MAX_ROWS = 20;

  // Cell renderer: null/undefined become an em dash. Pipes are escaped and
  // line breaks flattened because intents are free text and would otherwise
  // split the markdown row.
  const cell = (value) => {
    if (value == null) return '—';
    return String(value).replace(/\|/g, '\\|').replace(/\r\n?|\n/g, ' ');
  };

  console.log(`### ${header}\n`);
  console.log(`| intent | base validationScore | cand combinedScore | cand semanticScore | cand verdict |`);
  console.log(`|---|---:|---:|---:|---|`);
  for (const r of rows.slice(0, MAX_ROWS)) {
    console.log(`| ${cell(r.intent)} | ${cell(r.baseScore)} | ${cell(r.candCombined)} | ${cell(r.candSemantic)} | ${cell(r.candVerdict)} |`);
  }
  if (rows.length > MAX_ROWS) console.log(`\n_(${rows.length - MAX_ROWS} more rows omitted)_`);
  console.log();
}
109
// Regressions are listed before improvements; each table is skipped by
// flipTable itself when it has no rows.
const flipSections = [
  [flips.pass_to_fail, 'Regressions (pass → fail)'],
  [flips.fail_to_pass, 'Improvements (fail → pass)'],
];
for (const [sectionRows, heading] of flipSections) {
  flipTable(sectionRows, heading);
}

// Footer: horizontal rule plus a provenance line.
console.log(`---\n`);
console.log(`_Generated by \`packages/a2ui/mcp/scripts/semantic-stats.mjs\`_`);
@@ -0,0 +1,266 @@
1
+ #!/usr/bin/env node
2
+ // Smoke test: issue-reporter (write, trace attach, auto-fire, coalesce, evalMode).
3
+ // Spec: docs/specs/genui-multiturn-architecture.md §3.5 + §4.6 + §6.4 + §11.
4
+
5
+ import {
6
+ reportIssue,
7
+ autoReport,
8
+ attachTrace,
9
+ createIssueAccumulator,
10
+ AUTO_FIRE_POLICY,
11
+ } from '../../compose/engines/zettel/issue-reporter.js';
12
+ import { StateCache } from '../../compose/engines/zettel/state-cache.js';
13
+ import { mkdtemp, readFile, rm, stat } from 'node:fs/promises';
14
+ import { tmpdir } from 'node:os';
15
+ import { join } from 'node:path';
16
+
17
// Running tallies of passed / failed checks, reported at the end of the run.
let pass = 0;
let fail = 0;

/**
 * Record one check: log a tick or a cross and bump the matching counter.
 *
 * @param {string} label - description of the check.
 * @param {*} ok - truthy when the check passed.
 * @param {string} [detail=''] - extra text appended to the failure line only.
 */
const t = (label, ok, detail = '') => {
  if (!ok) {
    console.log(` ✗ ${label} ${detail}`);
    fail++;
    return;
  }
  console.log(` ✓ ${label}`);
  pass++;
};
22
+
23
+ const TMP = await mkdtemp(join(tmpdir(), 'a2ui-issues-'));
24
+ const ctx = {
25
+ storageRoot: TMP,
26
+ versionInfo: { mcp: '0.1.0', corpus: '0.0.6', engine: 'zettel', llm_adapter: 'stub', model: 'test' },
27
+ };
28
+
29
+ console.log(`Storage root: ${TMP}`);
30
+ console.log('\n=== reportIssue: basic write ===');
31
+
32
+ const r1 = await reportIssue({
33
+ type: 'bug',
34
+ severity: 'drift',
35
+ title: 'Test issue with simple title',
36
+ body: 'Test body content.',
37
+ }, { ...ctx, reporter: 'user' });
38
+
39
+ t('returns issue_id', !!r1.issue_id);
40
+ t('issue-id format YYYY-MM-DD-slug-rand4', /^\d{4}-\d{2}-\d{2}-[a-z0-9-]+-[a-f0-9]{4}$/.test(r1.issue_id));
41
+ t('returns ack: logged', r1.ack === 'logged');
42
+ t('returns absolute path', r1.path.startsWith(TMP));
43
+
44
+ const file1 = JSON.parse(await readFile(r1.path, 'utf8'));
45
+ t('written file has expected type', file1.type === 'bug');
46
+ t('written file has expected severity', file1.severity === 'drift');
47
+ t('written file has status open', file1.status === 'open');
48
+ t('written file has reporter.kind', file1.reporter.kind === 'user');
49
+ t('written file has environment', file1.environment?.mcp === '0.1.0');
50
+ t('written file has linked_specs default', Array.isArray(file1.linked_specs) && file1.linked_specs.length > 0);
51
+ t('related_issue_ids defaults to empty array', Array.isArray(file1.related_issue_ids) && file1.related_issue_ids.length === 0);
52
+ t('tags defaults to empty array', Array.isArray(file1.tags));
53
+ t('suggested_owner defaults to "unknown"', file1.suggested_owner === 'unknown');
54
+
55
console.log('\n=== validation guards ===');

// Resolve to true only when reportIssue rejects AND the error message matches
// `expected`; a successful call or a non-matching message resolves to false.
const rejectsWith = async (payload, expected) => {
  try {
    await reportIssue(payload, ctx);
  } catch (err) {
    return expected.test(err.message);
  }
  return false;
};

// `threw` stays a module-level binding because later sections reassign it.
let threw = await rejectsWith({ type: 'invalid', severity: 'drift', title: 't', body: 'b' }, /type must be/);
t('rejects invalid type', threw);

threw = await rejectsWith({ type: 'bug', severity: 'oops', title: 't', body: 'b' }, /severity must be/);
t('rejects invalid severity', threw);

threw = await rejectsWith({ type: 'bug', severity: 'drift', title: 'x'.repeat(81), body: 'b' }, /≤ 80 chars/);
t('rejects title > 80 chars', threw);

threw = await rejectsWith({ type: 'bug', severity: 'drift', title: 't', body: 'b', trace: 'partial' }, /trace must be/);
t('rejects invalid trace depth', threw);

threw = await rejectsWith({ type: 'bug', severity: 'drift', title: 't', body: 'b', tags: 'not-an-array' }, /tags must be an array/);
t('rejects non-array tags', threw);
81
+
82
+ console.log('\n=== state_id trace attachment ===');
83
+
84
+ const cache = new StateCache({ maxSize: 10 });
85
+ cache.set('dash-3f9a-v1-26042817', {
86
+ state_id: 'dash-3f9a-v1-26042817',
87
+ intent: 'admin dashboard',
88
+ tool: 'compose_from_chunks',
89
+ input: { intent: 'admin dashboard' },
90
+ output: { html: '<dashboard/>', plan: { page: 'dashboard-admin-page' } },
91
+ ops_history: [
92
+ { type: 'createSurface', surfaceId: 'main' },
93
+ { type: 'updateComponents', surfaceId: 'main', components: [] },
94
+ ],
95
+ delta_summary: 'Created admin dashboard',
96
+ warnings: [],
97
+ duration_ms: 1234,
98
+ internal: {
99
+ locator_prompt: 'PROMPT_LOCATOR_v1',
100
+ locator_response: 'RESPONSE_LOCATOR_v1',
101
+ modifier_prompt: 'PROMPT_MODIFIER_v1',
102
+ modifier_response: 'RESPONSE_MODIFIER_v1',
103
+ validator_results: [{ ok: true }],
104
+ retries: 0,
105
+ },
106
+ });
107
+
108
+ const traceSummary = await attachTrace('dash-3f9a-v1-26042817', 'summary', cache);
109
+ t('summary trace populates state_id', traceSummary?.state_id === 'dash-3f9a-v1-26042817');
110
+ t('summary trace populates input', traceSummary?.input?.intent === 'admin dashboard');
111
+ t('summary trace populates output.ops', Array.isArray(traceSummary?.output?.ops));
112
+ t('summary trace omits internal field', traceSummary?.internal === undefined);
113
+
114
+ const traceFull = await attachTrace('dash-3f9a-v1-26042817', 'full', cache);
115
+ t('full trace includes internal.locator_prompt', traceFull?.internal?.locator_prompt === 'PROMPT_LOCATOR_v1');
116
+ t('full trace includes internal.modifier_response', traceFull?.internal?.modifier_response === 'RESPONSE_MODIFIER_v1');
117
+ t('full trace includes validator_results', Array.isArray(traceFull?.internal?.validator_results));
118
+
119
+ const traceMiss = await attachTrace('not-a-real-id', 'summary', cache);
120
+ t('attachTrace returns null on cache miss', traceMiss === null);
121
+
122
+ const traceNoCache = await attachTrace('dash-3f9a-v1-26042817', 'summary', null);
123
+ t('attachTrace returns null when no cache', traceNoCache === null);
124
+
125
+ // reportIssue with state_id integrates trace
126
+ const r2 = await reportIssue({
127
+ type: 'bug',
128
+ severity: 'drift',
129
+ title: 'Issue tied to a state',
130
+ body: 'Reproducible on dashboard generation.',
131
+ state_id: 'dash-3f9a-v1-26042817',
132
+ trace: 'full',
133
+ }, { ...ctx, cache });
134
+ const file2 = JSON.parse(await readFile(r2.path, 'utf8'));
135
+ t('reportIssue with state_id+full attaches trace.input', file2.trace?.input?.intent === 'admin dashboard');
136
+ t('reportIssue with full trace attaches internal', file2.trace?.internal?.locator_prompt === 'PROMPT_LOCATOR_v1');
137
+
138
+ // peek does not touch recency — verify the cache state didn't bump dash- to most-recent.
139
+ // Insert a few more entries, ensure dash- stays the LRU candidate.
140
+ // (This is exercised indirectly: state-cache smoke covers peek-recency directly.)
141
+
142
console.log('\n=== oversized trace spills to sidecar ===');

// Cache a state whose `internal` section carries a 300 KiB string, then file
// an issue against it with a full trace.
cache.set('big-state', {
  state_id: 'big-state',
  intent: 'big',
  tool: 'compose',
  input: {},
  ops_history: [],
  internal: { huge_dump: 'x'.repeat(300 * 1024) },
});
const r3 = await reportIssue({
  type: 'bug',
  severity: 'drift',
  title: 'Big trace test',
  body: 'Has an oversized trace.',
  state_id: 'big-state',
  trace: 'full',
}, { ...ctx, cache });
const file3 = JSON.parse(await readFile(r3.path, 'utf8'));

// Read the pointer once, tolerating a missing `trace`, so a reporter that did
// not spill produces failed checks instead of a TypeError that aborts the run
// before the summary and the temp-dir cleanup.
const sidecarRel = file3.trace?.sidecar;
t('oversized trace replaced by sidecar pointer', !!sidecarRel);
t('sidecar path has expected shape', /^traces\/.+\.trace\.json$/.test(sidecarRel));

let sidecarSize = 0;
let sidecarDetail = '';
if (typeof sidecarRel === 'string') {
  try {
    sidecarSize = (await stat(join(TMP, sidecarRel))).size;
  } catch (err) {
    // stat rejects when the file is absent; surface the reason on the failure line.
    sidecarDetail = `(${err.message})`;
  }
} else {
  sidecarDetail = '(no sidecar path recorded)';
}
t('sidecar file exists with non-zero size', sidecarSize > 0, sidecarDetail);
165
+
166
+ console.log('\n=== autoReport: policy lookup ===');
167
+
168
+ const a1 = await autoReport('validator-exhausted', { tool: 'refine_composition' }, ctx);
169
+ const file_a1 = JSON.parse(await readFile(a1.path, 'utf8'));
170
+ t('validator-exhausted: type=bug', file_a1.type === 'bug');
171
+ t('validator-exhausted: severity=blocker', file_a1.severity === 'blocker');
172
+ t('validator-exhausted: suggested_owner=validator', file_a1.suggested_owner === 'validator');
173
+ t('validator-exhausted: reporter.kind=auto', file_a1.reporter.kind === 'auto');
174
+ t('validator-exhausted: reporter.context=validator-exhausted', file_a1.reporter.context === 'validator-exhausted');
175
+ t('validator-exhausted: tags include "auto-fire"', file_a1.tags.includes('auto-fire'));
176
+
177
+ const a2 = await autoReport('retrieval-zero-then-synthesis-fail', { intent: 'pricing page' }, ctx);
178
+ const file_a2 = JSON.parse(await readFile(a2.path, 'utf8'));
179
+ t('retrieval-zero-then-synthesis-fail: type=training-gap', file_a2.type === 'training-gap');
180
+ t('retrieval-zero-then-synthesis-fail: suggested_owner=chunk-corpus', file_a2.suggested_owner === 'chunk-corpus');
181
+ t('retrieval-zero-then-synthesis-fail: title carries intent', /pricing page/.test(file_a2.title));
182
+
183
+ const a3 = await autoReport('cache-miss-on-known-state', { state_id: 'gone-12ab-v1-0' }, ctx);
184
+ const file_a3 = JSON.parse(await readFile(a3.path, 'utf8'));
185
+ t('cache-miss-on-known-state: severity=nit', file_a3.severity === 'nit');
186
+
187
+ threw = false;
188
+ try { await autoReport('unknown-reason', {}, ctx); }
189
+ catch (e) { threw = /unknown reason/.test(e.message); }
190
+ t('autoReport rejects unknown reason', threw);
191
+
192
+ console.log('\n=== evalMode suppresses auto-fire ===');
193
+
194
+ const evalCtx = { ...ctx, evalMode: true };
195
+ const aSuppressed = await autoReport('validator-exhausted', { tool: 'refine_composition' }, evalCtx);
196
+ t('autoReport returns null when evalMode=true', aSuppressed === null);
197
+
198
+ // Manual reportIssue still writes during evalMode (eval-suppression is auto-fire only)
199
+ const aManual = await reportIssue({
200
+ type: 'bug',
201
+ severity: 'blocker',
202
+ title: 'Manual call during evalMode',
203
+ body: 'should still write',
204
+ }, evalCtx);
205
+ t('manual reportIssue ignores evalMode', !!aManual.issue_id);
206
+ const file_manual = JSON.parse(await readFile(aManual.path, 'utf8'));
207
+ t('manual call during evalMode writes file', file_manual.title === 'Manual call during evalMode');
208
+
209
+ console.log('\n=== coalescing accumulator ===');
210
+
211
+ const acc = createIssueAccumulator();
212
+ t('empty accumulator size 0', acc.size() === 0);
213
+ const flushEmpty = await acc.flush(ctx);
214
+ t('empty accumulator flush returns null', flushEmpty === null);
215
+
216
+ acc.add('locator-empty-targets', { intent: 'change title' });
217
+ t('single-entry accumulator size 1', acc.size() === 1);
218
+ const flushOne = await acc.flush(ctx);
219
+ const file_flush_one = JSON.parse(await readFile(flushOne.path, 'utf8'));
220
+ t('single-entry flush writes normal auto-issue', file_flush_one.reporter.kind === 'auto' && file_flush_one.reporter.context === 'locator-empty-targets');
221
+ t('single-entry flush resets accumulator', acc.size() === 0);
222
+
223
+ acc.add('locator-empty-targets', { intent: 'change title' });
224
+ acc.add('validator-exhausted', { tool: 'refine_composition' });
225
+ acc.add('ops-failed-after-apply', {});
226
+ t('three-entry accumulator size 3', acc.size() === 3);
227
+ const flushThree = await acc.flush(ctx);
228
+ const file_flush_three = JSON.parse(await readFile(flushThree.path, 'utf8'));
229
+ t('coalesced issue: severity=blocker (highest of three)', file_flush_three.severity === 'blocker');
230
+ t('coalesced issue: type=bug', file_flush_three.type === 'bug');
231
+ t('coalesced issue: reporter.context=coalesced', file_flush_three.reporter.context === 'coalesced');
232
+ t('coalesced issue: tags include "coalesced"', file_flush_three.tags.includes('coalesced'));
233
+ t('coalesced issue: tags include all reasons',
234
+ ['locator-empty-targets', 'validator-exhausted', 'ops-failed-after-apply'].every((r) => file_flush_three.tags.includes(r))
235
+ );
236
+ t('coalesced issue: body lists every reason',
237
+ file_flush_three.body.includes('locator-empty-targets') &&
238
+ file_flush_three.body.includes('validator-exhausted') &&
239
+ file_flush_three.body.includes('ops-failed-after-apply')
240
+ );
241
+ t('coalesced flush resets accumulator', acc.size() === 0);
242
+
243
+ // evalMode + coalesce → no write
244
+ const accEval = createIssueAccumulator();
245
+ accEval.add('validator-exhausted', {});
246
+ accEval.add('ops-failed-after-apply', {});
247
+ const flushEval = await accEval.flush({ ...ctx, evalMode: true });
248
+ t('coalesce flush returns null when evalMode=true', flushEval === null);
249
+
250
+ threw = false;
251
+ try { acc.add('not-a-real-reason'); }
252
+ catch (e) { threw = /unknown reason/.test(e.message); }
253
+ t('accumulator.add rejects unknown reason', threw);
254
+
255
+ console.log('\n=== AUTO_FIRE_POLICY exported ===');
256
+
257
+ t('AUTO_FIRE_POLICY exports expected reasons',
258
+ ['synthesizer-exhausted', 'validator-exhausted', 'locator-empty-targets',
259
+ 'retrieval-zero-then-synthesis-fail', 'cache-miss-on-known-state',
260
+ 'ops-failed-after-apply'].every((r) => AUTO_FIRE_POLICY[r])
261
+ );
262
+
263
+ await rm(TMP, { recursive: true, force: true });
264
+
265
+ console.log(`\n${pass} passed, ${fail} failed`);
266
+ process.exit(fail ? 1 : 0);