@adia-ai/a2ui-mcp 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +75 -0
- package/package.json +1 -1
- package/scripts/eval-diff.mjs +16 -10
package/CHANGELOG.md
CHANGED
|
@@ -11,6 +11,81 @@ zettel strategies.
|
|
|
11
11
|
|
|
12
12
|
---
|
|
13
13
|
|
|
14
|
+
## [0.1.2] - 2026-05-01
|
|
15
|
+
|
|
16
|
+
Phase 2 of [`docs/specs/semantic-validator.md`](../../../docs/specs/semantic-validator.md)
|
|
17
|
+
**promoted to default**. When `--semantic` is set, `eval-diff.mjs`
|
|
18
|
+
now gates on the combined score at threshold 80 by default, instead
|
|
19
|
+
of running in Phase 1 shadow-mode. **Behavior change** — see § Opt
|
|
20
|
+
back into Phase 1 below.
|
|
21
|
+
|
|
22
|
+
### Changed (default behavior when `--semantic` is set)
|
|
23
|
+
|
|
24
|
+
- **`--gate-mode` default flipped from `structural` to `combined`
|
|
25
|
+
when `--semantic` is set.** `npm run eval:diff -- --engine zettel
|
|
26
|
+
--semantic` now produces a combined-gated `passRate` (gate on
|
|
27
|
+
`0.6 × validationScore + 0.4 × semanticScore`) instead of the
|
|
28
|
+
structural-only Phase 1 shadow output. The structural pass survives
|
|
29
|
+
as `row.passStructural` + `runObj.passRateStructural` for diagnostic
|
|
30
|
+
comparison.
|
|
31
|
+
- **`--gate-threshold` default raised from 70 to 80.** Chosen via
|
|
32
|
+
the 2026-05-01 sweep — 70 produced zero pass-flips on the current
|
|
33
|
+
zettel engine output (cosmetic flip with no signal); 80 catches the
|
|
34
|
+
4 partial-verdict cases the spec was designed to surface; 85
|
|
35
|
+
over-aggressively rejects legit aligned items. See spec § Phase 2
|
|
36
|
+
status (2026-05-01) for the sweep table.
|
|
37
|
+
- **Without `--semantic`, the default stays `structural`.** Semantic
|
|
38
|
+
work remains opt-in; the no-flag `npm run eval:diff` shape is
|
|
39
|
+
unchanged.
|
|
40
|
+
|
|
41
|
+
### Opt back into Phase 1 shadow-mode
|
|
42
|
+
|
|
43
|
+
- `--gate-mode structural` reverts the default flip — `row.pass`
|
|
44
|
+
gates on `validationScore` alone, semantic verdicts annotation-
|
|
45
|
+
only.
|
|
46
|
+
- `--gate-threshold 70` reverts the threshold change.
|
|
47
|
+
- Combine both for full v0.1.1 default behavior:
|
|
48
|
+
```bash
|
|
49
|
+
node packages/a2ui/mcp/scripts/eval-diff.mjs --engine zettel \
|
|
50
|
+
--semantic --gate-mode structural --gate-threshold 70
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
### Why the promotion is safe
|
|
54
|
+
|
|
55
|
+
Per the Phase 2 exit criterion ("no unexplained regressions in first
|
|
56
|
+
two full `eval-diff` runs"), zero pass-flips were observed at
|
|
57
|
+
threshold=70 (the v0.1.1 default). At threshold=80, the four
|
|
58
|
+
pass→fail flips are all `partial`-verdict items that fit the spec's
|
|
59
|
+
motivating failure mode (structural=86–88 + semantic=54–69) — the
|
|
60
|
+
intents the structural-only validator scored highly but the LLM
|
|
61
|
+
judge correctly flagged as "wrong UI for this intent":
|
|
62
|
+
|
|
63
|
+
- `user profile card with avatar` — structural 88, semantic 54
|
|
64
|
+
- `calendar month view` — structural 86, semantic 65
|
|
65
|
+
- `social media post card` — structural 86, semantic 69
|
|
66
|
+
- `user onboarding checklist` — structural 87, semantic 67
|
|
67
|
+
|
|
68
|
+
Pass rate drops from 83% to 79% (4 percentage points); all four flips are justified.
|
|
69
|
+
|
|
70
|
+
### Updated baseline thresholds
|
|
71
|
+
|
|
72
|
+
Per [`docs/specs/semantic-validator.md`](../../../docs/specs/semantic-validator.md)
|
|
73
|
+
§ Phase 2 status (2026-05-01):
|
|
74
|
+
|
|
75
|
+
- `avgScore ≥ 88` (structural — unchanged)
|
|
76
|
+
- `avgSemanticScore ≥ 85` (new)
|
|
77
|
+
- `avgCombined ≥ 87` (new)
|
|
78
|
+
- `passRate ≥ 78%` at combined-gate threshold=80 (new; was 83%
|
|
79
|
+
structural-only)
|
|
80
|
+
- `verdict aligned ≥ 84%` of judged items
|
|
81
|
+
|
|
82
|
+
### Implementation references
|
|
83
|
+
|
|
84
|
+
- [`scripts/eval-diff.mjs`](scripts/eval-diff.mjs) — default flip
|
|
85
|
+
- [`scripts/semantic-stats.mjs`](scripts/semantic-stats.mjs) —
|
|
86
|
+
unchanged from v0.1.1; remains the validation tool for future
|
|
87
|
+
threshold revalidation passes
|
|
88
|
+
|
|
14
89
|
## [0.1.1] - 2026-05-01
|
|
15
90
|
|
|
16
91
|
Phase 2 of [`docs/specs/semantic-validator.md`](../../../docs/specs/semantic-validator.md)
|
package/package.json
CHANGED
package/scripts/eval-diff.mjs
CHANGED
|
@@ -18,9 +18,9 @@
|
|
|
18
18
|
* node packages/a2ui/mcp/scripts/eval-diff.mjs --engine zettel # fragment-graph only
|
|
19
19
|
* node packages/a2ui/mcp/scripts/eval-diff.mjs --limit 20
|
|
20
20
|
* node packages/a2ui/mcp/scripts/eval-diff.mjs --domain forms
|
|
21
|
-
* node packages/a2ui/mcp/scripts/eval-diff.mjs --semantic # Phase
|
|
22
|
-
* node packages/a2ui/mcp/scripts/eval-diff.mjs --semantic --gate-mode
|
|
23
|
-
* node packages/a2ui/mcp/scripts/eval-diff.mjs --semantic --gate-
|
|
21
|
+
* node packages/a2ui/mcp/scripts/eval-diff.mjs --semantic # Phase 2 default: gates on combined score (>= 80)
|
|
22
|
+
* node packages/a2ui/mcp/scripts/eval-diff.mjs --semantic --gate-mode structural # Phase 1 shadow-mode (opt-back-in)
|
|
23
|
+
* node packages/a2ui/mcp/scripts/eval-diff.mjs --semantic --gate-threshold 75 # Override default threshold
|
|
24
24
|
*/
|
|
25
25
|
import '../../../../scripts/load-env.mjs';
|
|
26
26
|
|
|
@@ -48,13 +48,19 @@ const domain = opt('domain');
|
|
|
48
48
|
// Shadow-mode semantic validator (Phase 1). Opt-in; zero effect on gating
|
|
49
49
|
// when --gate-mode=structural (the default in v0.1.1; since v0.1.2 it must be passed explicitly alongside --semantic).
|
|
50
50
|
const semanticEnabled = args.includes('--semantic');
|
|
51
|
-
// Phase 2 (gating mode):
|
|
52
|
-
//
|
|
53
|
-
//
|
|
54
|
-
//
|
|
55
|
-
//
|
|
56
|
-
|
|
57
|
-
|
|
51
|
+
// Phase 2 (gating mode). Default depends on --semantic:
|
|
52
|
+
// without --semantic → 'structural' (no semantic work; Phase 1 baseline).
|
|
53
|
+
// with --semantic → 'combined' since v0.1.2 (Phase 2 promotion); was
|
|
54
|
+
// 'structural' / shadow-mode in v0.1.1.
|
|
55
|
+
// Opt-back-in to shadow with --gate-mode structural.
|
|
56
|
+
// Combined mode gates on (0.6 * validationScore + 0.4 * semanticScore) at
|
|
57
|
+
// threshold 80 — chosen via the 2026-05-01 sweep (70 produced zero
|
|
58
|
+
// pass-flips; 80 catches the 4 partial-verdict cases the spec was designed
|
|
59
|
+
// to surface; 85 over-aggressively rejects aligned items).
|
|
60
|
+
// Override threshold with --gate-threshold N.
|
|
61
|
+
const gateModeDefault = args.includes('--semantic') ? 'combined' : 'structural';
|
|
62
|
+
const gateMode = opt('gate-mode') || gateModeDefault;
|
|
63
|
+
const gateThreshold = opt('gate-threshold') ? Number(opt('gate-threshold')) : 80;
|
|
58
64
|
if (!['structural', 'combined'].includes(gateMode)) {
|
|
59
65
|
console.error(`[eval-diff] --gate-mode must be one of: structural | combined (got: ${gateMode})`);
|
|
60
66
|
process.exit(2);
|