@adia-ai/a2ui-mcp 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +142 -0
- package/package.json +1 -1
- package/scripts/eval-diff.mjs +68 -6
- package/scripts/semantic-stats.mjs +113 -0
package/CHANGELOG.md
CHANGED
|
@@ -11,6 +11,148 @@ zettel strategies.
|
|
|
11
11
|
|
|
12
12
|
---
|
|
13
13
|
|
|
14
|
+
## [0.1.2] - 2026-05-01
|
|
15
|
+
|
|
16
|
+
Phase 2 of [`docs/specs/semantic-validator.md`](../../../docs/specs/semantic-validator.md)
|
|
17
|
+
**promoted to default**. When `--semantic` is set, `eval-diff.mjs`
|
|
18
|
+
now gates on the combined score at threshold 80 by default, instead
|
|
19
|
+
of running in Phase 1 shadow-mode. **Behavior change** — see § Opt
|
|
20
|
+
back into Phase 1 below.
|
|
21
|
+
|
|
22
|
+
### Changed (default behavior when `--semantic` is set)
|
|
23
|
+
|
|
24
|
+
- **`--gate-mode` default flipped from `structural` to `combined`
|
|
25
|
+
when `--semantic` is set.** `npm run eval:diff -- --engine zettel
|
|
26
|
+
--semantic` now produces a combined-gated `passRate` (gate on
|
|
27
|
+
`0.6 × validationScore + 0.4 × semanticScore`) instead of the
|
|
28
|
+
structural-only Phase 1 shadow output. The structural pass survives
|
|
29
|
+
as `row.passStructural` + `runObj.passRateStructural` for diagnostic
|
|
30
|
+
comparison.
|
|
31
|
+
- **`--gate-threshold` default raised from 70 to 80.** Chosen via
|
|
32
|
+
the 2026-05-01 sweep — 70 produced zero pass-flips on the current
|
|
33
|
+
zettel engine output (cosmetic flip with no signal); 80 catches the
|
|
34
|
+
4 partial-verdict cases the spec was designed to surface; 85
|
|
35
|
+
over-aggressively rejects legit aligned items. See spec § Phase 2
|
|
36
|
+
status (2026-05-01) for the sweep table.
|
|
37
|
+
- **Without `--semantic`, the default stays `structural`.** Semantic
|
|
38
|
+
work remains opt-in; the no-flag `npm run eval:diff` shape is
|
|
39
|
+
unchanged.
|
|
40
|
+
|
|
41
|
+
### Opt back into Phase 1 shadow-mode
|
|
42
|
+
|
|
43
|
+
- `--gate-mode structural` reverts the default flip — `row.pass`
|
|
44
|
+
gates on `validationScore` alone; semantic verdicts are annotation-
|
|
45
|
+
only.
|
|
46
|
+
- `--gate-threshold 70` reverts the threshold change.
|
|
47
|
+
- Combine both for full v0.1.1 default behavior:
|
|
48
|
+
```bash
|
|
49
|
+
node packages/a2ui/mcp/scripts/eval-diff.mjs --engine zettel \
|
|
50
|
+
--semantic --gate-mode structural --gate-threshold 70
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
### Why the promotion is safe
|
|
54
|
+
|
|
55
|
+
Per the Phase 2 exit criterion ("no unexplained regressions in first
|
|
56
|
+
two full `eval-diff` runs"), zero pass-flips were observed at
|
|
57
|
+
threshold=70 (the v0.1.1 default). At threshold=80, the four
|
|
58
|
+
pass→fail flips are all `partial`-verdict items that fit the spec's
|
|
59
|
+
motivating failure mode (structural=86–88 + semantic=54–69) — the
|
|
60
|
+
intents the structural-only validator scored highly but the LLM
|
|
61
|
+
judge correctly flagged as "wrong UI for this intent":
|
|
62
|
+
|
|
63
|
+
- `user profile card with avatar` — structural 88, semantic 54
|
|
64
|
+
- `calendar month view` — structural 86, semantic 65
|
|
65
|
+
- `social media post card` — structural 86, semantic 69
|
|
66
|
+
- `user onboarding checklist` — structural 87, semantic 67
|
|
67
|
+
|
|
68
|
+
Pass rate drops from 83 → 79 (4 percentage points), all justified.
|
|
69
|
+
|
|
70
|
+
### Updated baseline thresholds
|
|
71
|
+
|
|
72
|
+
Per [`docs/specs/semantic-validator.md`](../../../docs/specs/semantic-validator.md)
|
|
73
|
+
§ Phase 2 status (2026-05-01):
|
|
74
|
+
|
|
75
|
+
- `avgScore ≥ 88` (structural — unchanged)
|
|
76
|
+
- `avgSemanticScore ≥ 85` (new)
|
|
77
|
+
- `avgCombined ≥ 87` (new)
|
|
78
|
+
- `passRate ≥ 78%` at combined-gate threshold=80 (new; was 83%
|
|
79
|
+
structural-only)
|
|
80
|
+
- `verdict aligned ≥ 84%` of judged items
|
|
81
|
+
|
|
82
|
+
### Implementation references
|
|
83
|
+
|
|
84
|
+
- [`scripts/eval-diff.mjs`](scripts/eval-diff.mjs) — default flip
|
|
85
|
+
- [`scripts/semantic-stats.mjs`](scripts/semantic-stats.mjs) —
|
|
86
|
+
unchanged from v0.1.1; remains the validation tool for future
|
|
87
|
+
threshold revalidation passes
|
|
88
|
+
|
|
89
|
+
## [0.1.1] - 2026-05-01
|
|
90
|
+
|
|
91
|
+
Phase 2 of [`docs/specs/semantic-validator.md`](../../../docs/specs/semantic-validator.md)
|
|
92
|
+
— opt-in combined-gating in `eval-diff.mjs` + new
|
|
93
|
+
`semantic-stats.mjs` companion script. **No breaking changes.**
|
|
94
|
+
Default `eval-diff` behavior unchanged — Phase 1 shadow-mode is
|
|
95
|
+
still the default; combined gating is opt-in via flags.
|
|
96
|
+
|
|
97
|
+
### Added (`scripts/eval-diff.mjs` — Phase 2 gating flags)
|
|
98
|
+
|
|
99
|
+
- **`--gate-mode {structural|combined}`** — `structural` (default)
|
|
100
|
+
preserves Phase 1 shadow behavior: `row.pass` gates on
|
|
101
|
+
`validationScore` alone; semantic verdicts are annotation-only.
|
|
102
|
+
`combined` flips `row.pass` to gate on the combined score
|
|
103
|
+
(`round(0.6 × validationScore + 0.4 × semanticScore)`); preserves
|
|
104
|
+
the pre-flip pass as `row.passStructural`; recomputes
|
|
105
|
+
`runObj.passRate` + carries `runObj.passRateStructural` (baseline)
|
|
106
|
+
alongside; records `runObj.gateMode` + `runObj.gateThreshold`;
|
|
107
|
+
`diff.md` gains structural-baseline + avgSemantic + avgCombined
|
|
108
|
+
rows.
|
|
109
|
+
- **`--gate-threshold N`** — combined-mode threshold; default 70 to
|
|
110
|
+
match the existing structural threshold. Override per-run for
|
|
111
|
+
sweep-style tuning.
|
|
112
|
+
- **Validation gate** — combined-mode requires `--semantic`; the
|
|
113
|
+
script rejects the flag combination at startup so the operator
|
|
114
|
+
never silently ships the gating change without the scores it
|
|
115
|
+
needs.
|
|
116
|
+
|
|
117
|
+
### Added (`scripts/semantic-stats.mjs` — companion stats script)
|
|
118
|
+
|
|
119
|
+
- **New** — read-only; takes two run JSON paths
|
|
120
|
+
(`evals/mcp/runs/<stamp>/{mcp,zettel}.json`); emits markdown to
|
|
121
|
+
stdout with **verdict-distribution deltas + per-intent pass-flip
|
|
122
|
+
diagnostics** (which intents flipped pass→fail or fail→pass
|
|
123
|
+
between baseline and candidate). The tooling that satisfies the
|
|
124
|
+
"no unexplained regressions" exit criterion of Phase 2 §
|
|
125
|
+
Rollout before promoting combined gating to default.
|
|
126
|
+
|
|
127
|
+
### Procedure for promotion (deferred)
|
|
128
|
+
|
|
129
|
+
Promotion to default is deferred until two full eval-diff runs
|
|
130
|
+
(structural-only baseline + combined-gating candidate) have been
|
|
131
|
+
compared via `semantic-stats.mjs` and the regression count
|
|
132
|
+
justifies it. Procedure:
|
|
133
|
+
|
|
134
|
+
```bash
|
|
135
|
+
# 1. Capture structural-only baseline (default behavior)
|
|
136
|
+
node packages/a2ui/mcp/scripts/eval-diff.mjs --engine zettel --semantic
|
|
137
|
+
|
|
138
|
+
# 2. Run the candidate (combined gating)
|
|
139
|
+
node packages/a2ui/mcp/scripts/eval-diff.mjs --engine zettel --semantic --gate-mode combined
|
|
140
|
+
|
|
141
|
+
# 3. Compare
|
|
142
|
+
node packages/a2ui/mcp/scripts/semantic-stats.mjs \
|
|
143
|
+
evals/mcp/runs/<baseline-stamp>/zettel.json \
|
|
144
|
+
evals/mcp/runs/<candidate-stamp>/zettel.json > /tmp/semantic-stats.md
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
### Implementation references
|
|
148
|
+
|
|
149
|
+
- [`scripts/eval-diff.mjs`](scripts/eval-diff.mjs)
|
|
150
|
+
- [`scripts/semantic-stats.mjs`](scripts/semantic-stats.mjs)
|
|
151
|
+
|
|
152
|
+
### Commits
|
|
153
|
+
|
|
154
|
+
- `8415ff9e` — `feat(validator): semantic Phase 2 — opt-in combined-gating + drift cleanup`
|
|
155
|
+
|
|
14
156
|
## [0.1.0] - 2026-04-28
|
|
15
157
|
|
|
16
158
|
**Multi-turn gen-UI tool surface (Phase A code-complete).** Adds three new
|
package/package.json
CHANGED
package/scripts/eval-diff.mjs
CHANGED
|
@@ -18,6 +18,9 @@
|
|
|
18
18
|
* node packages/a2ui/mcp/scripts/eval-diff.mjs --engine zettel # fragment-graph only
|
|
19
19
|
* node packages/a2ui/mcp/scripts/eval-diff.mjs --limit 20
|
|
20
20
|
* node packages/a2ui/mcp/scripts/eval-diff.mjs --domain forms
|
|
21
|
+
* node packages/a2ui/mcp/scripts/eval-diff.mjs --semantic # Phase 2 default: gates on combined score (>= 80)
|
|
22
|
+
* node packages/a2ui/mcp/scripts/eval-diff.mjs --semantic --gate-mode structural # Phase 1 shadow-mode (opt-back-in)
|
|
23
|
+
* node packages/a2ui/mcp/scripts/eval-diff.mjs --semantic --gate-threshold 75 # Override default threshold
|
|
21
24
|
*/
|
|
22
25
|
import '../../../../scripts/load-env.mjs';
|
|
23
26
|
|
|
@@ -42,8 +45,30 @@ const opt = (k) => {
|
|
|
42
45
|
const engine = opt('engine') || 'all';
|
|
43
46
|
const limit = opt('limit') ? Number(opt('limit')) : undefined;
|
|
44
47
|
const domain = opt('domain');
|
|
45
|
-
// Shadow-mode semantic validator (Phase 1). Opt-in; zero effect on gating
|
|
48
|
+
// Shadow-mode semantic validator (Phase 1). Opt-in; zero effect on gating
|
|
49
|
+
// when --gate-mode=structural (must be passed explicitly alongside --semantic since v0.1.2).
|
|
46
50
|
const semanticEnabled = args.includes('--semantic');
|
|
51
|
+
// Phase 2 (gating mode). Default depends on --semantic:
|
|
52
|
+
// without --semantic → 'structural' (no semantic work; Phase 1 baseline).
|
|
53
|
+
// with --semantic → 'combined' since v0.1.2 (Phase 2 promotion); was
|
|
54
|
+
// 'structural' / shadow-mode in v0.1.1.
|
|
55
|
+
// Opt-back-in to shadow with --gate-mode structural.
|
|
56
|
+
// Combined mode gates on (0.6 * validationScore + 0.4 * semanticScore) at
|
|
57
|
+
// threshold 80 — chosen via the 2026-05-01 sweep (70 produced zero
|
|
58
|
+
// pass-flips; 80 catches the 4 partial-verdict cases the spec was designed
|
|
59
|
+
// to surface; 85 over-aggressively rejects aligned items).
|
|
60
|
+
// Override threshold with --gate-threshold N.
|
|
61
|
+
const gateModeDefault = args.includes('--semantic') ? 'combined' : 'structural';
|
|
62
|
+
const gateMode = opt('gate-mode') || gateModeDefault;
|
|
63
|
+
const gateThreshold = opt('gate-threshold') ? Number(opt('gate-threshold')) : 80;
|
|
64
|
+
if (!['structural', 'combined'].includes(gateMode)) {
|
|
65
|
+
console.error(`[eval-diff] --gate-mode must be one of: structural | combined (got: ${gateMode})`);
|
|
66
|
+
process.exit(2);
|
|
67
|
+
}
|
|
68
|
+
if (gateMode === 'combined' && !semanticEnabled) {
|
|
69
|
+
console.error(`[eval-diff] --gate-mode=combined requires --semantic (semantic scores must be computed before gating on them)`);
|
|
70
|
+
process.exit(2);
|
|
71
|
+
}
|
|
47
72
|
|
|
48
73
|
if (!['mcp', 'zettel', 'all'].includes(engine)) {
|
|
49
74
|
console.error(`[eval-diff] --engine must be one of: mcp | zettel | all (got: ${engine})`);
|
|
@@ -144,7 +169,12 @@ async function annotateSemantic(runObj, label) {
|
|
|
144
169
|
row.rubricVersion = v.rubricVersion;
|
|
145
170
|
const structural = row.validationScore ?? 0;
|
|
146
171
|
row.combinedScore = Math.round(0.6 * structural + 0.4 * v.score);
|
|
147
|
-
//
|
|
172
|
+
// Phase 2: when gateMode === 'combined', flip row.pass to gate on the
|
|
173
|
+
// combined score. Preserves the structural pass for diagnostic purposes.
|
|
174
|
+
if (gateMode === 'combined') {
|
|
175
|
+
row.passStructural = row.pass;
|
|
176
|
+
row.pass = row.combinedScore >= gateThreshold;
|
|
177
|
+
}
|
|
148
178
|
if (!v.error) {
|
|
149
179
|
semSum += v.score;
|
|
150
180
|
semN += 1;
|
|
@@ -164,7 +194,8 @@ async function annotateSemantic(runObj, label) {
|
|
|
164
194
|
}
|
|
165
195
|
runObj.semantic = {
|
|
166
196
|
enabled: true,
|
|
167
|
-
mode: 'shadow',
|
|
197
|
+
mode: gateMode === 'combined' ? 'gating' : 'shadow',
|
|
198
|
+
gateThreshold: gateMode === 'combined' ? gateThreshold : null,
|
|
168
199
|
judged: semN,
|
|
169
200
|
errors,
|
|
170
201
|
cached,
|
|
@@ -174,14 +205,31 @@ async function annotateSemantic(runObj, label) {
|
|
|
174
205
|
tokens: { input: tokensIn, output: tokensOut },
|
|
175
206
|
rubricVersion: 'v1',
|
|
176
207
|
};
|
|
177
|
-
|
|
208
|
+
// Phase 2: when gateMode === 'combined', recompute pass aggregates so
|
|
209
|
+
// runObj.passRate / runObj.pass reflect the new gate. Capture the
|
|
210
|
+
// structural-only pass count alongside for diagnostic comparison.
|
|
211
|
+
if (gateMode === 'combined') {
|
|
212
|
+
const structuralPassCount = runObj.results.filter((r) => r.passStructural).length;
|
|
213
|
+
const combinedPassCount = runObj.results.filter((r) => r.pass).length;
|
|
214
|
+
runObj.passStructural = structuralPassCount;
|
|
215
|
+
runObj.passRateStructural = Math.round((structuralPassCount / (runObj.results.length || 1)) * 100);
|
|
216
|
+
runObj.pass = combinedPassCount;
|
|
217
|
+
runObj.passRate = Math.round((combinedPassCount / (runObj.results.length || 1)) * 100);
|
|
218
|
+
runObj.gateMode = 'combined';
|
|
219
|
+
runObj.gateThreshold = gateThreshold;
|
|
220
|
+
}
|
|
221
|
+
const modeLabel = gateMode === 'combined' ? `gating(>=${gateThreshold})` : 'shadow';
|
|
222
|
+
console.error(`[semantic:${label}] mode=${modeLabel} judged=${semN} avgSem=${runObj.semantic.avgSemanticScore} avgCombined=${runObj.semantic.avgCombinedScore} cached=${cached} errors=${errors} tokens=${tokensIn}+${tokensOut}`);
|
|
178
223
|
}
|
|
179
224
|
|
|
180
225
|
if (semanticEnabled) {
|
|
181
226
|
if (!process.env.ANTHROPIC_API_KEY) {
|
|
182
227
|
console.error('[eval-diff] --semantic requested but ANTHROPIC_API_KEY missing; skipping.');
|
|
183
228
|
} else {
|
|
184
|
-
|
|
229
|
+
const modeNote = gateMode === 'combined'
|
|
230
|
+
? `gating mode (combined threshold=${gateThreshold})`
|
|
231
|
+
: 'shadow mode';
|
|
232
|
+
console.error(`[eval-diff] running semantic validator (${modeNote})…`);
|
|
185
233
|
if (mcp) await annotateSemantic(mcp, 'mcp');
|
|
186
234
|
if (zettel) await annotateSemantic(zettel, 'zettel');
|
|
187
235
|
}
|
|
@@ -209,7 +257,11 @@ md += `# Engine Eval ${mcp && zettel ? 'Diff' : 'Report'}\n\n`;
|
|
|
209
257
|
md += `- Run: \`${stamp}\`\n`;
|
|
210
258
|
md += `- Engine(s): ${engine}\n`;
|
|
211
259
|
md += `- Intents: ${(mcp || zettel).total}${domain ? ` (domain: ${domain})` : ''}${limit ? ` (limit: ${limit})` : ''}\n`;
|
|
212
|
-
md += `- Mode: instant\n
|
|
260
|
+
md += `- Mode: instant\n`;
|
|
261
|
+
if (semanticEnabled) {
|
|
262
|
+
md += `- Semantic: ${gateMode === 'combined' ? `gating (threshold=${gateThreshold})` : 'shadow'}\n`;
|
|
263
|
+
}
|
|
264
|
+
md += `\n`;
|
|
213
265
|
|
|
214
266
|
md += `## Aggregates\n\n`;
|
|
215
267
|
if (mcp && zettel) {
|
|
@@ -219,6 +271,11 @@ if (mcp && zettel) {
|
|
|
219
271
|
md += `| avgScore (emitted only) | ${mcp.avgScoreWhenEmitted} | ${zettel.avgScoreWhenEmitted} |\n`;
|
|
220
272
|
md += `| avgF1 (emitted only) | ${mcp.avgF1WhenEmitted} | ${zettel.avgF1WhenEmitted} |\n`;
|
|
221
273
|
md += `| pass rate % | ${mcp.passRate} | ${zettel.passRate} |\n`;
|
|
274
|
+
if (mcp.gateMode === 'combined' || zettel.gateMode === 'combined') {
|
|
275
|
+
md += `| pass rate % (structural-only baseline) | ${fmt(mcp.passRateStructural)} | ${fmt(zettel.passRateStructural)} |\n`;
|
|
276
|
+
md += `| avgSemanticScore | ${fmt(mcp.semantic?.avgSemanticScore)} | ${fmt(zettel.semantic?.avgSemanticScore)} |\n`;
|
|
277
|
+
md += `| avgCombinedScore | ${fmt(mcp.semantic?.avgCombinedScore)} | ${fmt(zettel.semantic?.avgCombinedScore)} |\n`;
|
|
278
|
+
}
|
|
222
279
|
md += `| retrieval MRR | ${fmt(mcp.retrievalMRR)} | ${fmt(zettel.retrievalMRR)} |\n\n`;
|
|
223
280
|
} else {
|
|
224
281
|
const e = mcp || zettel;
|
|
@@ -229,6 +286,11 @@ if (mcp && zettel) {
|
|
|
229
286
|
md += `| avgScore (emitted only) | ${e.avgScoreWhenEmitted} |\n`;
|
|
230
287
|
md += `| avgF1 (emitted only) | ${e.avgF1WhenEmitted} |\n`;
|
|
231
288
|
md += `| pass rate % | ${e.passRate} |\n`;
|
|
289
|
+
if (e.gateMode === 'combined') {
|
|
290
|
+
md += `| pass rate % (structural-only baseline) | ${fmt(e.passRateStructural)} |\n`;
|
|
291
|
+
md += `| avgSemanticScore | ${fmt(e.semantic?.avgSemanticScore)} |\n`;
|
|
292
|
+
md += `| avgCombinedScore | ${fmt(e.semantic?.avgCombinedScore)} |\n`;
|
|
293
|
+
}
|
|
232
294
|
md += `| retrieval MRR | ${fmt(e.retrievalMRR)} |\n\n`;
|
|
233
295
|
}
|
|
234
296
|
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* Compare two eval-diff run artifacts (JSON) and surface verdict-distribution
|
|
4
|
+
* + pass-rate deltas. Used to verify "no unexplained regressions" between
|
|
5
|
+
* baseline and a candidate gating-mode run, per Phase 2 exit criteria of
|
|
6
|
+
* `docs/specs/semantic-validator.md`.
|
|
7
|
+
*
|
|
8
|
+
* Usage:
|
|
9
|
+
* node packages/a2ui/mcp/scripts/semantic-stats.mjs <baseline.json> <candidate.json>
|
|
10
|
+
*
|
|
11
|
+
* Examples:
|
|
12
|
+
* # Compare structural-only baseline to combined-gating candidate
|
|
13
|
+
* node packages/a2ui/mcp/scripts/semantic-stats.mjs \
|
|
14
|
+
* evals/mcp/runs/<baseline-stamp>/zettel.json \
|
|
15
|
+
* evals/mcp/runs/<candidate-stamp>/zettel.json
|
|
16
|
+
*
|
|
17
|
+
* The script is read-only; it writes nothing to disk. Output is markdown
|
|
18
|
+
* to stdout — pipe into a file or a journal entry as needed.
|
|
19
|
+
*/
|
|
20
|
+
import { readFile } from 'node:fs/promises';
|
|
21
|
+
|
|
22
|
+
const args = process.argv.slice(2);
|
|
23
|
+
if (args.length !== 2) {
|
|
24
|
+
console.error('Usage: semantic-stats.mjs <baseline.json> <candidate.json>');
|
|
25
|
+
process.exit(2);
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
const [basePath, candPath] = args;
|
|
29
|
+
const base = JSON.parse(await readFile(basePath, 'utf8'));
|
|
30
|
+
const cand = JSON.parse(await readFile(candPath, 'utf8'));
|
|
31
|
+
|
|
32
|
+
const fmt = (v) => v == null ? '—' : String(v);
|
|
33
|
+
const delta = (a, b) => {
|
|
34
|
+
if (a == null || b == null) return '—';
|
|
35
|
+
const d = Math.round((b - a) * 10) / 10;
|
|
36
|
+
return d > 0 ? `+${d}` : String(d);
|
|
37
|
+
};
|
|
38
|
+
|
|
39
|
+
console.log(`# Semantic stats — baseline vs candidate\n`);
|
|
40
|
+
console.log(`- Baseline: \`${basePath}\` (label=${fmt(base.label)})`);
|
|
41
|
+
console.log(`- Candidate: \`${candPath}\` (label=${fmt(cand.label)})\n`);
|
|
42
|
+
|
|
43
|
+
console.log(`## Aggregate deltas\n`);
|
|
44
|
+
console.log(`| metric | baseline | candidate | delta |`);
|
|
45
|
+
console.log(`|---|---:|---:|---:|`);
|
|
46
|
+
console.log(`| total | ${base.total} | ${cand.total} | ${delta(base.total, cand.total)} |`);
|
|
47
|
+
console.log(`| coverage % | ${base.coverage} | ${cand.coverage} | ${delta(base.coverage, cand.coverage)} |`);
|
|
48
|
+
console.log(`| emitted | ${base.emitted}/${base.total} | ${cand.emitted}/${cand.total} | ${delta(base.emitted, cand.emitted)} |`);
|
|
49
|
+
console.log(`| avgScoreWhenEmitted | ${base.avgScoreWhenEmitted} | ${cand.avgScoreWhenEmitted} | ${delta(base.avgScoreWhenEmitted, cand.avgScoreWhenEmitted)} |`);
|
|
50
|
+
console.log(`| avgF1WhenEmitted | ${base.avgF1WhenEmitted} | ${cand.avgF1WhenEmitted} | ${delta(base.avgF1WhenEmitted, cand.avgF1WhenEmitted)} |`);
|
|
51
|
+
console.log(`| pass rate % | ${base.passRate} | ${cand.passRate} | ${delta(base.passRate, cand.passRate)} |`);
|
|
52
|
+
|
|
53
|
+
if (cand.passRateStructural != null) {
|
|
54
|
+
console.log(`| pass rate % (structural-only on candidate) | ${base.passRate} | ${cand.passRateStructural} | ${delta(base.passRate, cand.passRateStructural)} |`);
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
if (base.semantic || cand.semantic) {
|
|
58
|
+
console.log(`| avgSemanticScore | ${fmt(base.semantic?.avgSemanticScore)} | ${fmt(cand.semantic?.avgSemanticScore)} | ${delta(base.semantic?.avgSemanticScore, cand.semantic?.avgSemanticScore)} |`);
|
|
59
|
+
console.log(`| avgCombinedScore | ${fmt(base.semantic?.avgCombinedScore)} | ${fmt(cand.semantic?.avgCombinedScore)} | ${delta(base.semantic?.avgCombinedScore, cand.semantic?.avgCombinedScore)} |`);
|
|
60
|
+
}
|
|
61
|
+
console.log();
|
|
62
|
+
|
|
63
|
+
// Verdict distribution (when --semantic was used in either run)
|
|
64
|
+
function verdicts(run) {
|
|
65
|
+
return run.semantic?.verdictBreakdown || null;
|
|
66
|
+
}
|
|
67
|
+
const bv = verdicts(base);
|
|
68
|
+
const cv = verdicts(cand);
|
|
69
|
+
if (bv || cv) {
|
|
70
|
+
console.log(`## Verdict distribution\n`);
|
|
71
|
+
const allKeys = new Set([...(bv ? Object.keys(bv) : []), ...(cv ? Object.keys(cv) : [])]);
|
|
72
|
+
console.log(`| verdict | baseline | candidate | delta |`);
|
|
73
|
+
console.log(`|---|---:|---:|---:|`);
|
|
74
|
+
for (const k of [...allKeys].sort()) {
|
|
75
|
+
console.log(`| ${k} | ${fmt(bv?.[k] ?? 0)} | ${fmt(cv?.[k] ?? 0)} | ${delta(bv?.[k] ?? 0, cv?.[k] ?? 0)} |`);
|
|
76
|
+
}
|
|
77
|
+
console.log();
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
// Per-row pass-flip surface — which intents flipped pass/fail between the two runs?
|
|
81
|
+
function indexById(run) {
|
|
82
|
+
return new Map((run.results || []).map((r) => [r.id, r]));
|
|
83
|
+
}
|
|
84
|
+
const baseById = indexById(base);
|
|
85
|
+
const candById = indexById(cand);
|
|
86
|
+
const flips = { pass_to_fail: [], fail_to_pass: [] };
|
|
87
|
+
for (const [id, b] of baseById) {
|
|
88
|
+
const c = candById.get(id);
|
|
89
|
+
if (!c) continue;
|
|
90
|
+
if (b.pass && !c.pass) flips.pass_to_fail.push({ id, intent: b.intent, baseScore: b.validationScore, candCombined: c.combinedScore, candSemantic: c.semanticScore, candVerdict: c.semanticVerdict });
|
|
91
|
+
else if (!b.pass && c.pass) flips.fail_to_pass.push({ id, intent: b.intent, baseScore: b.validationScore, candCombined: c.combinedScore, candSemantic: c.semanticScore, candVerdict: c.semanticVerdict });
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
console.log(`## Pass-flip diagnostics\n`);
|
|
95
|
+
console.log(`- pass → fail (regressions to investigate): **${flips.pass_to_fail.length}**`);
|
|
96
|
+
console.log(`- fail → pass (improvements): **${flips.fail_to_pass.length}**\n`);
|
|
97
|
+
|
|
98
|
+
function flipTable(rows, header) {
|
|
99
|
+
if (!rows.length) return;
|
|
100
|
+
console.log(`### ${header}\n`);
|
|
101
|
+
console.log(`| intent | base validationScore | cand combinedScore | cand semanticScore | cand verdict |`);
|
|
102
|
+
console.log(`|---|---:|---:|---:|---|`);
|
|
103
|
+
for (const r of rows.slice(0, 20)) {
|
|
104
|
+
console.log(`| ${r.intent} | ${fmt(r.baseScore)} | ${fmt(r.candCombined)} | ${fmt(r.candSemantic)} | ${fmt(r.candVerdict)} |`);
|
|
105
|
+
}
|
|
106
|
+
if (rows.length > 20) console.log(`\n_(${rows.length - 20} more rows omitted)_`);
|
|
107
|
+
console.log();
|
|
108
|
+
}
|
|
109
|
+
flipTable(flips.pass_to_fail, 'Regressions (pass → fail)');
|
|
110
|
+
flipTable(flips.fail_to_pass, 'Improvements (fail → pass)');
|
|
111
|
+
|
|
112
|
+
console.log(`---\n`);
|
|
113
|
+
console.log(`_Generated by \`packages/a2ui/mcp/scripts/semantic-stats.mjs\`_`);
|