@adia-ai/a2ui-mcp 0.4.1 → 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +12 -0
- package/package.json +1 -1
- package/scripts/shadow-compare-classify.mjs +139 -0
package/CHANGELOG.md
CHANGED
|
@@ -11,6 +11,18 @@ zettel strategies.
|
|
|
11
11
|
|
|
12
12
|
_No pending changes._
|
|
13
13
|
|
|
14
|
+
## [0.4.2] - 2026-05-11
|
|
15
|
+
|
|
16
|
+
### Added
|
|
17
|
+
|
|
18
|
+
- `scripts/shadow-compare-classify.mjs` — Phase 3 shadow-compare classifier tool (139 lines). Compares the deterministic `classifyIntent` (from `@adia-ai/a2ui-validator` Phase 3 foundation, shipped in v0.4.1) against the LLM judge baseline; the companion `scoreAgainstSpec` scorer is not exercised yet because eval-diff archives do not store the emitted A2UI tree. Used to identify IntentSpecs eligible for fast-path promotion (`≥ 95%` agreement threshold per spec § 3c). First report at `docs/reports/semantic-shadow-compare-2026-05-10.md` flagged `auth.signin` + `auth.signup` as eligible. Surfaces alongside `npm run semantic:shadow-compare`.
|
|
19
|
+
|
|
20
|
+
### Ride-along (otherwise no source changes)
|
|
21
|
+
|
|
22
|
+
Lockstep PATCH cut alongside `@adia-ai/web-components@0.4.2` (`<input-ui type="number">` rewrite drops native `<input type=number>` wrapping) + `@adia-ai/web-modules@0.4.2` (`<editor-sidebar>` grid-track width-mirror fix + new `<theme-panel>` module). Apart from the script above, source byte-identical to v0.4.1.
|
|
23
|
+
|
|
24
|
+
Internal `@adia-ai/*` dep ranges stay at `^0.4.0` (patch-cut asymmetry — `^0.4.0` covers `0.4.x` under semver). See root [CHANGELOG.md `## [0.4.2]`](../../../CHANGELOG.md) for the cut narrative.
|
|
25
|
+
|
|
14
26
|
## [0.4.1] - 2026-05-10
|
|
15
27
|
|
|
16
28
|
### Ride-along (no source changes)
|
package/package.json
CHANGED
package/scripts/shadow-compare-classify.mjs
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* shadow-compare-classify.mjs — Phase 3 shadow-compare tooling
|
|
4
|
+
*
|
|
5
|
+
* For each row in an eval-diff run JSON that has semanticVerdict (LLM judge):
|
|
6
|
+
* 1. Run classifyIntent() from the rule classifier on the row's intent.
|
|
7
|
+
* 2. Bucket the LLM verdict + rule verdict + intent-spec match.
|
|
8
|
+
* 3. Report agreement rate per spec + overall.
|
|
9
|
+
*
|
|
10
|
+
* Goal: identify which IntentSpecs the rule classifier agrees with the LLM
|
|
11
|
+
* judge on ≥ 95% of the time. Those graduate to fast-path per
|
|
12
|
+
* docs/specs/semantic-validator.md § Phase 3 exit criteria.
|
|
13
|
+
*
|
|
14
|
+
* Note: the rule SCORER (scoreAgainstSpec) needs the emitted A2UI tree,
|
|
15
|
+
* which is not stored in eval-diff archives. This tool measures only
|
|
16
|
+
* **intent classification agreement** — does the regex hit the intent
|
|
17
|
+
* class, and is that class consistent with the LLM's verdict on the row?
|
|
18
|
+
*
|
|
19
|
+
* Future: when eval-diff starts archiving emitted_a2ui, extend this script
|
|
20
|
+
* to score the tree against the spec + compare full verdicts.
|
|
21
|
+
*
|
|
22
|
+
* Usage:
|
|
23
|
+
* node packages/a2ui/mcp/scripts/shadow-compare-classify.mjs <run.json>
|
|
24
|
+
*
|
|
25
|
+
* # Or multiple runs aggregated:
|
|
26
|
+
* node packages/a2ui/mcp/scripts/shadow-compare-classify.mjs \
|
|
27
|
+
* evals/mcp/runs/2026-04-19T-mcp.json evals/mcp/runs/2026-05-10T-mcp.json
|
|
28
|
+
*
|
|
29
|
+
* Output: markdown report to stdout — agreement table per spec + overall.
|
|
30
|
+
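* Exit 0 once the report is printed (this is read-only diagnostics); exit 2 when no run file is given. An unreadable or malformed run file is not caught and aborts with Node's default non-zero exit.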
|
|
31
|
+
*/
|
|
32
|
+
import { readFile } from 'node:fs/promises';
|
|
33
|
+
import { classifyIntent } from '../../validator/semantic/classify-intent.js';
|
|
34
|
+
|
|
35
|
+
// Positional CLI arguments: everything after the node binary and script path.
// Each one is treated as the path of an eval-diff run JSON file.
const [, , ...args] = process.argv;
if (args.length < 1) {
  // Nothing to compare — show usage on stderr and stop with status 2.
  console.error('Usage: shadow-compare-classify.mjs <run.json> [<run2.json> ...]');
  process.exit(2);
}
|
|
40
|
+
|
|
41
|
+
/**
 * Aggregator: specId → { agreed, disagreed, partial, total, examples }.
 *
 * `agreed` / `disagreed` / `partial` count LLM verdicts of "aligned",
 * "misaligned" and anything else respectively; `total` is their sum and
 * `examples` holds a few sample disagreement rows for the report.
 */
const stats = new Map();
/** Rows seen across all run files, with or without an LLM verdict. */
let totalRows = 0;
/** Rows that carried a `semanticVerdict` and were therefore compared. */
let rowsWithSemantic = 0;
/** Compared rows that the rule classifier matched to an IntentSpec. */
let rowsWithIntentMatch = 0;

/**
 * Return the tally bucket for `id`, creating a zeroed one on first use.
 *
 * The same object is returned on every call for a given id, so callers
 * mutate it in place.
 *
 * @param {string} id - IntentSpec id, or the sentinel used for unmatched rows.
 * @returns {{agreed: number, disagreed: number, partial: number, total: number, examples: object[]}}
 */
function ensureBucket(id) {
  let bucket = stats.get(id);
  if (bucket === undefined) {
    bucket = { agreed: 0, disagreed: 0, partial: 0, total: 0, examples: [] };
    stats.set(id, bucket);
  }
  return bucket;
}
|
|
53
|
+
|
|
54
|
+
/** Classifier results below this confidence are treated as "no spec match". */
const MIN_CLASSIFIER_CONFIDENCE = 0.9;
/** Cap on the disagreement samples kept per spec for the report. */
const MAX_EXAMPLES_PER_SPEC = 3;

/**
 * Fold one LLM verdict into a bucket.
 *
 * Agreement heuristic: when the rule classifier recognizes the intent class
 * it "votes" pass, so an "aligned" LLM verdict counts as agreement and a
 * "misaligned" verdict as disagreement. Any other verdict is neither and is
 * counted as partial.
 *
 * @param {{agreed: number, disagreed: number, partial: number, total: number}} bucket
 * @param {string} verdict - The row's `semanticVerdict`.
 */
function tallyVerdict(bucket, verdict) {
  bucket.total += 1;
  if (verdict === 'aligned') bucket.agreed += 1;
  else if (verdict === 'misaligned') bucket.disagreed += 1;
  else bucket.partial += 1;
}

for (const path of args) {
  // Read/parse failures are deliberately not caught: a broken input aborts
  // the run instead of silently skewing the agreement numbers.
  const raw = await readFile(path, 'utf8');
  const data = JSON.parse(raw);
  // Skip documents without a results array; a `null` document or a non-array
  // `results` would otherwise throw (or iterate nonsense) in the loop below.
  if (!Array.isArray(data?.results)) continue;

  for (const row of data.results) {
    totalRows += 1;
    if (!row.semanticVerdict) continue; // no LLM verdict to compare
    rowsWithSemantic += 1;

    const { spec, confidence } = classifyIntent(row.intent);
    if (!spec || confidence < MIN_CLASSIFIER_CONFIDENCE) {
      // Rule classifier didn't recognize the intent — count under "unmatched".
      tallyVerdict(ensureBucket('__UNMATCHED__'), row.semanticVerdict);
      continue;
    }

    rowsWithIntentMatch += 1;
    const bucket = ensureBucket(spec.id);
    tallyVerdict(bucket, row.semanticVerdict);

    // Keep a few truncated samples of disagreements for the report.
    if (row.semanticVerdict === 'misaligned' && bucket.examples.length < MAX_EXAMPLES_PER_SPEC) {
      bucket.examples.push({
        id: row.id,
        // Coerce before slicing so a non-string intent/rationale cannot throw.
        intent: String(row.intent ?? '').slice(0, 80),
        rationale: String(row.semanticRationale || '').slice(0, 200),
      });
    }
  }
}
|
|
94
|
+
|
|
95
|
+
// ── Render report ───────────────────────────────────────────────────
/** Bucket key used for rows the rule classifier could not match to a spec. */
const UNMATCHED_ID = '__UNMATCHED__';
/** A spec needs at least this many compared rows before it can be promoted. */
const MIN_ROWS_FOR_PROMOTION = 5;

/**
 * Whole-number agreement percentage, for display only.
 *
 * @param {number} agreed
 * @param {number} total
 * @returns {number} Rounded percentage, or 0 when `total` is 0.
 */
function agreementPct(agreed, total) {
  return total > 0 ? Math.round((agreed / total) * 100) : 0;
}

/**
 * Text for the "promotion eligible" column of one table row.
 *
 * Eligibility is decided on the exact ratio, not the rounded percentage:
 * rounding first would promote e.g. 189/200 (94.5%, displayed as 95%) even
 * though it is below the ≥ 95% threshold. Integer arithmetic avoids
 * floating-point error at the boundary.
 *
 * @param {string} id - Spec id or the unmatched sentinel.
 * @param {{agreed: number, total: number}} bucket
 * @returns {string}
 */
function promotionStatus(id, bucket) {
  if (bucket.total < MIN_ROWS_FOR_PROMOTION) return '— insufficient data';
  if (id === UNMATCHED_ID) return '—';
  return bucket.agreed * 100 >= bucket.total * 95 ? '✅ promote' : '—';
}

console.log(`# Semantic-validator Phase 3 — shadow-compare report\n`);
console.log(`Compared **${args.length}** eval run(s); ${rowsWithSemantic}/${totalRows} rows had semanticVerdict; ${rowsWithIntentMatch} rows matched a known IntentSpec.\n`);

// Overall figures include the unmatched bucket, as the label below states.
let overallAgreed = 0;
let overallTotal = 0;
for (const bucket of stats.values()) {
  overallAgreed += bucket.agreed;
  overallTotal += bucket.total;
}
const overallPct = agreementPct(overallAgreed, overallTotal);
console.log(`**Overall classifier agreement** (matched + unmatched): ${overallAgreed}/${overallTotal} = **${overallPct}%**\n`);

console.log(`## Per-spec agreement\n`);
console.log(`| spec | agreed | disagreed | partial | total | agreement % | promotion eligible (≥95%)? |`);
console.log(`|---|---:|---:|---:|---:|---:|:---:|`);

const sorted = [...stats.entries()].sort((a, b) => {
  // Unmatched last; otherwise descending by total
  if (a[0] === UNMATCHED_ID) return 1;
  if (b[0] === UNMATCHED_ID) return -1;
  return b[1].total - a[1].total;
});

for (const [id, b] of sorted) {
  const pct = agreementPct(b.agreed, b.total);
  const eligible = promotionStatus(id, b);
  console.log(`| \`${id === UNMATCHED_ID ? '(no spec match)' : id}\` | ${b.agreed} | ${b.disagreed} | ${b.partial} | ${b.total} | ${pct}% | ${eligible} |`);
}

// ── Disagreement samples ────────────────────────────────────────────
const withDisagreements = sorted.filter(([id, b]) => id !== UNMATCHED_ID && b.examples.length > 0);
if (withDisagreements.length > 0) {
  console.log(`\n## Disagreement samples\n`);
  console.log(`Rows where the rule classifier matched a known IntentSpec but the LLM judged the row 'misaligned'. Investigate before promoting these specs.\n`);
  for (const [id, b] of withDisagreements) {
    console.log(`### ${id}\n`);
    for (const ex of b.examples) {
      console.log(`- **${ex.id}** \`"${ex.intent}"\``);
      if (ex.rationale) console.log(`  - LLM rationale: ${ex.rationale}`);
    }
    console.log();
  }
}

console.log(`\n---\n`);
console.log(`*Generated by \`shadow-compare-classify.mjs\` from \`packages/a2ui/validator/semantic/classify-intent.js\`. See [docs/specs/semantic-validator.md § Phase 3](../../../docs/specs/semantic-validator.md) for exit criteria.*`);

// NOTE(review): process.exit() ends the process without waiting for pending
// asynchronous stdout writes, which Node documents can truncate output on
// some platforms when stdout is a pipe. If nothing imported here keeps the
// event loop alive, `process.exitCode = 0` would be the safer form — confirm
// before changing.
process.exit(0);
|