@adia-ai/a2ui-mcp 0.0.5 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +99 -0
- package/package.json +2 -2
- package/scripts/eval-refine-synthesis.mjs +270 -0
- package/scripts/smoke-issues.mjs +266 -0
- package/scripts/smoke-refine.mjs +374 -0
- package/scripts/smoke-state-cache.mjs +130 -0
- package/scripts/test-a2ui.mjs +103 -0
- package/server.js +309 -0
package/CHANGELOG.md
CHANGED
|
@@ -11,6 +11,105 @@ zettel strategies.
|
|
|
11
11
|
|
|
12
12
|
---
|
|
13
13
|
|
|
14
|
+
## [0.1.0] - 2026-04-28
|
|
15
|
+
|
|
16
|
+
**Multi-turn gen-UI tool surface (Phase A code-complete).** Adds three new
|
|
17
|
+
MCP tools that turn the chunk-composition pipeline from single-shot into a
|
|
18
|
+
multi-turn surface, plus extends `compose_from_chunks` to mint a `state_id`
|
|
19
|
+
for refinement chains.
|
|
20
|
+
|
|
21
|
+
Spec: [`docs/specs/genui-multiturn-architecture.md`](../../../docs/specs/genui-multiturn-architecture.md) (Active v0.1.0).
|
|
22
|
+
Plan: [`docs/plans/genui-multiturn-rollout-2026-04-28.md`](../../../docs/plans/genui-multiturn-rollout-2026-04-28.md) (Phase A scoped).
|
|
23
|
+
ADR: [`0008-multiturn-genui-architecture.md`](../../../.brain/adrs/0008-multiturn-genui-architecture.md).
|
|
24
|
+
|
|
25
|
+
### Added (MCP tools)
|
|
26
|
+
|
|
27
|
+
- **`refine_composition(state_id, intent | ops, max_attempts?)`** — takes a
|
|
28
|
+
`state_id` from a prior `compose_from_chunks` (or `refine_composition`)
|
|
29
|
+
call plus either a natural-language intent OR an explicit op-list, runs
|
|
30
|
+
the chunk-refiner's two-pass synthesis (locator → modifier; validator-
|
|
31
|
+
driven retry on op-validation failure), applies the resulting chunk-plan
|
|
32
|
+
ops, re-materializes HTML, mints a child `state_id` chained back to the
|
|
33
|
+
parent, and returns A2UI `updateComponents` messages (the wire format).
|
|
34
|
+
Failed ops surface in `ops_failed` with reasons; the new state is cached
|
|
35
|
+
for further refinement.
|
|
36
|
+
- **`get_state(state_id)`** — read-only inspection of a cached composition
|
|
37
|
+
state. Returns the chunk binding plan, materialized HTML, ops history
|
|
38
|
+
(chronological list of every refinement applied to this state's lineage),
|
|
39
|
+
and `parent_state_id` (chain-back). Auto-fires `cache-miss-on-known-state`
|
|
40
|
+
(severity `nit`) when the id is absent.
|
|
41
|
+
- **`report_issue(type, severity, title, body, state_id?, trace?, …)`** —
|
|
42
|
+
first-class telemetry / dev-process feedback tool. Writes a structured
|
|
43
|
+
JSON ticket to `.brain/audit-history/issues/<issue_id>.json`. Three
|
|
44
|
+
reporter kinds: LLM self-fire (this tool with `reporter: 'llm'`),
|
|
45
|
+
consumer-fire (passed through directly), engine auto-fire (internal,
|
|
46
|
+
per `AUTO_FIRE_POLICY` in the issue-reporter module). Severity vocabulary
|
|
47
|
+
`blocker | drift | nit` matches the existing `coherence-audit`
|
|
48
|
+
discipline. Trace levels: `'full' | 'summary' | 'none'`; oversized
|
|
49
|
+
traces (> 200 KB) spill to a sidecar `.trace.json` file. Tool count
|
|
50
|
+
goes from 25 → 28.
|
|
51
|
+
|
|
52
|
+
### Changed
|
|
53
|
+
|
|
54
|
+
- **`compose_from_chunks`** now mints a `state_id` and caches the result
|
|
55
|
+
before returning. The response shape gains a `state_id` field; existing
|
|
56
|
+
fields (`html`, `plan`, `source`, `score`, `warnings`, `synthesis`)
|
|
57
|
+
are unchanged. Backward-compatible — consumers ignoring `state_id` see
|
|
58
|
+
no behavior change.
|
|
59
|
+
- **MCP server boot** instantiates a `getStateCache()` singleton and an
|
|
60
|
+
`ENGINE_VERSION_INFO` block (mcp 0.1.0, corpus 0.0.6, engine zettel,
|
|
61
|
+
llm_adapter anthropic) that's threaded through every issue-reporter
|
|
62
|
+
call so written tickets carry environment metadata.
|
|
63
|
+
|
|
64
|
+
### Auto-fire policy (engine-driven)
|
|
65
|
+
|
|
66
|
+
`refine_composition` and `get_state` auto-fire `report_issue` on these
|
|
67
|
+
failure paths via the per-tool-call `IssueAccumulator`:
|
|
68
|
+
|
|
69
|
+
| Path | Type | Severity |
|
|
70
|
+
|---|---|---|
|
|
71
|
+
| Synthesizer exhausts retries | bug | drift |
|
|
72
|
+
| Validator exhausts retries on refinement | bug | blocker |
|
|
73
|
+
| Locator pass returns empty for targeted intent | bug | drift |
|
|
74
|
+
| Retrieval 0 + synthesis fallback fails | training-gap | drift |
|
|
75
|
+
| `get_state` called with absent `state_id` | bug | nit |
|
|
76
|
+
| `refine_composition` ops_failed list non-empty | bug | drift |
|
|
77
|
+
|
|
78
|
+
Multiple auto-fires within one tool call coalesce into a single issue
|
|
79
|
+
(highest severity wins; reasons listed in body + tags).
|
|
80
|
+
|
|
81
|
+
### Smoke + eval
|
|
82
|
+
|
|
83
|
+
- `smoke:state-cache` — 34/34.
|
|
84
|
+
- `smoke:issues` — 62/62.
|
|
85
|
+
- `smoke:refine` — 51/51 (stub LLM).
|
|
86
|
+
- `test:a2ui` — 25/25 + 1 skipped (was 19/19 + 1; +6 multi-turn assertions).
|
|
87
|
+
- `mcp:smoke` — server boots clean with 28 tools registered.
|
|
88
|
+
- **`eval:refine-synthesis`** — 15/15 PASS. Ops 100%, validate 100%,
|
|
89
|
+
0 auto-fires, 67 s.
|
|
90
|
+
- **No regression:** `eval:chunk-synthesis` 10/10, `eval:diff zettel`
|
|
91
|
+
coverage 83 / score 89 / MRR 0.986.
|
|
92
|
+
|
|
93
|
+
### Dependencies
|
|
94
|
+
|
|
95
|
+
- Bumps `@adia-ai/a2ui-compose` requirement from `^0.0.1` to `^0.1.0`.
|
|
96
|
+
|
|
97
|
+
### Migration
|
|
98
|
+
|
|
99
|
+
Additive surface; no breaking changes. The existing 25 tools are
|
|
100
|
+
unchanged behaviorally; `compose_from_chunks` adds a `state_id` field to
|
|
101
|
+
its response, which consumers that do not need it can safely ignore.
|
|
102
|
+
|
|
103
|
+
### Phase A simplification (documented)
|
|
104
|
+
|
|
105
|
+
Refinement ops internally use a chunk-plan vocabulary
|
|
106
|
+
(`rebindSlot | appendToSlot | removeFromSlot | replacePage`), wrapped
|
|
107
|
+
on output as standard `updateComponents` A2UI messages with
|
|
108
|
+
`components[].html` carrying the materialized payload. Strict
|
|
109
|
+
component-tree shape upgrade is queued for Phase B.
|
|
110
|
+
|
|
111
|
+
---
|
|
112
|
+
|
|
14
113
|
## [0.0.5] - 2026-04-28
|
|
15
114
|
|
|
16
115
|
**Retires the legacy exemplar auto-ingest.** Server boot no longer pulls
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@adia-ai/a2ui-mcp",
|
|
3
|
-
"version": "0.0.5",
|
|
3
|
+
"version": "0.1.0",
|
|
4
4
|
"description": "AdiaUI A2UI MCP server. Exposes the compose engine over MCP with an engine selector for monolithic + zettel strategies.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"bin": {
|
|
@@ -26,7 +26,7 @@
|
|
|
26
26
|
},
|
|
27
27
|
"dependencies": {
|
|
28
28
|
"@modelcontextprotocol/sdk": "^1.29.0",
|
|
29
|
-
"@adia-ai/a2ui-compose": "^0.0.1",
|
|
29
|
+
"@adia-ai/a2ui-compose": "^0.1.0",
|
|
30
30
|
"@adia-ai/a2ui-retrieval": "^0.0.1",
|
|
31
31
|
"@adia-ai/a2ui-validator": "^0.0.1",
|
|
32
32
|
"@adia-ai/a2ui-corpus": "^0.0.6",
|
|
package/scripts/eval-refine-synthesis.mjs
@@ -0,0 +1,270 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* Real-LLM eval set for the chunk-refiner — multi-turn refinement engine.
|
|
4
|
+
*
|
|
5
|
+
* Walks 5 seed compositions × 3 refinement intents = 15 total. Seeds are
|
|
6
|
+
* deterministic chunk-binding plans (no LLM cost on the create side); the
|
|
7
|
+
* refiner exercises the full two-pass synthesis (locator → modifier),
|
|
8
|
+
* validator-driven retry, and op-application path.
|
|
9
|
+
*
|
|
10
|
+
* Pass criteria (per spec §6.2 + plan §1.7):
|
|
11
|
+
* - ≥ 80% of refinements produce ops (no all-fail outcome).
|
|
12
|
+
* - ≥ 90% of returned ops apply cleanly (validator + applyOps).
|
|
13
|
+
* - ≤ 5 auto-fired issues across the full run (plan §1.8 #3).
|
|
14
|
+
*
|
|
15
|
+
* Spec: docs/specs/genui-multiturn-architecture.md (Active v0.1.0).
|
|
16
|
+
* Plan: docs/plans/genui-multiturn-rollout-2026-04-28.md (Phase A).
|
|
17
|
+
*
|
|
18
|
+
* Usage:
|
|
19
|
+
* ANTHROPIC_API_KEY=… node packages/a2ui/mcp/scripts/eval-refine-synthesis.mjs
|
|
20
|
+
*/
|
|
21
|
+
|
|
22
|
+
import '../../../../scripts/load-env.mjs';
|
|
23
|
+
import {
|
|
24
|
+
refineFromIntent,
|
|
25
|
+
applyOps,
|
|
26
|
+
} from '../../compose/engines/zettel/chunk-refiner.js';
|
|
27
|
+
import { mintStateId } from '../../compose/engines/zettel/state-cache.js';
|
|
28
|
+
import { createIssueAccumulator } from '../../compose/engines/zettel/issue-reporter.js';
|
|
29
|
+
import { composeFromPlan } from '../../compose/engines/zettel/chunk-composer.js';
|
|
30
|
+
import { listChunksByKind, getChunk } from '../../corpus/scripts/chunk-library.js';
|
|
31
|
+
import { createAdapter } from '../../compose/llm/llm-bridge.js';
|
|
32
|
+
|
|
33
|
+
// ── Discover corpus shape ────────────────────────────────────────────
|
|
34
|
+
// Pick a page with ≥ 2 slots so refinements have room to target.
|
|
35
|
+
|
|
36
|
+
const pages = listChunksByKind('page');
|
|
37
|
+
const panels = listChunksByKind('panel');
|
|
38
|
+
const blocks = listChunksByKind('block');
|
|
39
|
+
|
|
40
|
+
const slotsOf = (chunk) => { const declared = chunk.slots || chunk.instances?.[0]?.slots || []; return declared.map((slot) => slot.name); }; // slot names declared on the chunk itself, else on its first instance, else none
|
|
41
|
+
|
|
42
|
+
const samplePage =
|
|
43
|
+
pages.find((p) => slotsOf(p).length >= 2)
|
|
44
|
+
|| panels.find((p) => slotsOf(p).length >= 2)
|
|
45
|
+
|| pages[0]
|
|
46
|
+
|| panels[0];
|
|
47
|
+
|
|
48
|
+
if (!samplePage || slotsOf(samplePage).length === 0) {
|
|
49
|
+
console.error('Corpus has no page/panel chunks with declared slots — aborting eval');
|
|
50
|
+
process.exit(2);
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
const pageSlots = slotsOf(samplePage);
|
|
54
|
+
const slotA = pageSlots[0]; // typically the header slot
|
|
55
|
+
const slotB = pageSlots[1] || pageSlots[0]; // typically the content slot
|
|
56
|
+
|
|
57
|
+
// Take up to 8 block chunks that carry HTML (on the chunk or its first instance); at least 4 are required below.
|
|
58
|
+
const usableBlocks = blocks
|
|
59
|
+
.filter((b) => (b.html || b.instances?.[0]?.html))
|
|
60
|
+
.slice(0, 8)
|
|
61
|
+
.map((b) => b.name);
|
|
62
|
+
|
|
63
|
+
if (usableBlocks.length < 4) {
|
|
64
|
+
console.error(`Corpus has only ${usableBlocks.length} usable block chunks (need ≥ 4) — aborting eval`);
|
|
65
|
+
process.exit(2);
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
const [B0, B1, B2, B3] = usableBlocks; // the guard above guarantees ≥ 4 entries; the seed plans reference only B0–B3
|
|
69
|
+
|
|
70
|
+
console.log(`▶ refine-synthesis eval`);
|
|
71
|
+
console.log(` page: ${samplePage.name} (${pageSlots.join(', ')})`);
|
|
72
|
+
console.log(` blocks: ${usableBlocks.slice(0, 6).join(', ')}`);
|
|
73
|
+
console.log('');
|
|
74
|
+
|
|
75
|
+
// ── Seeds + refinements ──────────────────────────────────────────────
|
|
76
|
+
// 5 seeds × 3 refinements = 15 hold-out intents.
|
|
77
|
+
|
|
78
|
+
const SEEDS = [
|
|
79
|
+
{
|
|
80
|
+
label: 'two-block-content',
|
|
81
|
+
plan: {
|
|
82
|
+
page: samplePage.name,
|
|
83
|
+
slot_bindings: { [slotA]: [B0], [slotB]: [B1, B2] },
|
|
84
|
+
},
|
|
85
|
+
refinements: [
|
|
86
|
+
`add another block to ${slotB}`,
|
|
87
|
+
`remove one block from ${slotB}`,
|
|
88
|
+
`swap the ${slotA} for a different option`,
|
|
89
|
+
],
|
|
90
|
+
},
|
|
91
|
+
{
|
|
92
|
+
label: 'single-block-content',
|
|
93
|
+
plan: {
|
|
94
|
+
page: samplePage.name,
|
|
95
|
+
slot_bindings: { [slotA]: [B0], [slotB]: [B1] },
|
|
96
|
+
},
|
|
97
|
+
refinements: [
|
|
98
|
+
`add a second block alongside the existing one in ${slotB}`,
|
|
99
|
+
`replace the ${slotA} with a more concise header`,
|
|
100
|
+
`preserve the existing block and append another to ${slotB}`,
|
|
101
|
+
],
|
|
102
|
+
},
|
|
103
|
+
{
|
|
104
|
+
label: 'three-block-stack',
|
|
105
|
+
plan: {
|
|
106
|
+
page: samplePage.name,
|
|
107
|
+
slot_bindings: { [slotA]: [B0], [slotB]: [B1, B2, B3] },
|
|
108
|
+
},
|
|
109
|
+
refinements: [
|
|
110
|
+
`remove the middle block from ${slotB}`,
|
|
111
|
+
`drop the last block from ${slotB}`,
|
|
112
|
+
`make the layout more compact`,
|
|
113
|
+
],
|
|
114
|
+
},
|
|
115
|
+
{
|
|
116
|
+
label: 'header-only',
|
|
117
|
+
plan: {
|
|
118
|
+
page: samplePage.name,
|
|
119
|
+
slot_bindings: { [slotA]: [B0], [slotB]: [B1] },
|
|
120
|
+
},
|
|
121
|
+
refinements: [
|
|
122
|
+
`add an additional block to ${slotB}`,
|
|
123
|
+
`change the ${slotA}`,
|
|
124
|
+
`preserve everything and add a new block to ${slotB}`,
|
|
125
|
+
],
|
|
126
|
+
},
|
|
127
|
+
{
|
|
128
|
+
label: 'mixed-stack',
|
|
129
|
+
plan: {
|
|
130
|
+
page: samplePage.name,
|
|
131
|
+
slot_bindings: { [slotA]: [B0], [slotB]: [B2, B3] },
|
|
132
|
+
},
|
|
133
|
+
refinements: [
|
|
134
|
+
`swap the first block in ${slotB} for a different one`,
|
|
135
|
+
`add another block at the end of ${slotB}`,
|
|
136
|
+
`drop the last block from ${slotB}`,
|
|
137
|
+
],
|
|
138
|
+
},
|
|
139
|
+
];
|
|
140
|
+
|
|
141
|
+
// ── Run ──────────────────────────────────────────────────────────────
|
|
142
|
+
|
|
143
|
+
const llmAdapter = await createAdapter();
|
|
144
|
+
const startedAt = Date.now();
|
|
145
|
+
const results = [];
|
|
146
|
+
const autoFires = { total: 0, byReason: {} };
|
|
147
|
+
|
|
148
|
+
for (const seed of SEEDS) {
|
|
149
|
+
// Materialize the seed plan (no LLM call; pure compose-from-plan).
|
|
150
|
+
const composed = composeFromPlan(seed.plan);
|
|
151
|
+
if (!composed.html) {
|
|
152
|
+
console.log(`✗ seed [${seed.label}] failed to materialize — skipping refinements`);
|
|
153
|
+
for (const intent of seed.refinements) {
|
|
154
|
+
results.push({
|
|
155
|
+
seed: seed.label, intent, ms: 0, ops_count: 0, attempts: 0,
|
|
156
|
+
targeted: null, ops_applied: 0, ops_failed: 0,
|
|
157
|
+
auto_fires: [], error: 'seed-materialize-failed',
|
|
158
|
+
});
|
|
159
|
+
}
|
|
160
|
+
continue;
|
|
161
|
+
}
|
|
162
|
+
|
|
163
|
+
const priorState = {
|
|
164
|
+
state_id: mintStateId(seed.label, 1),
|
|
165
|
+
intent: `[seed] ${seed.label}`,
|
|
166
|
+
plan: seed.plan,
|
|
167
|
+
html: composed.html,
|
|
168
|
+
version: 1,
|
|
169
|
+
};
|
|
170
|
+
|
|
171
|
+
console.log(`── seed [${seed.label}] · ${seed.plan.slot_bindings[slotB].length} block(s) in ${slotB}`);
|
|
172
|
+
|
|
173
|
+
for (const intent of seed.refinements) {
|
|
174
|
+
const acc = createIssueAccumulator();
|
|
175
|
+
const t0 = Date.now();
|
|
176
|
+
const row = {
|
|
177
|
+
seed: seed.label, intent, ms: 0, ops_count: 0, attempts: 0,
|
|
178
|
+
targeted: null, ops_applied: 0, ops_failed: 0,
|
|
179
|
+
auto_fires: [], error: null,
|
|
180
|
+
};
|
|
181
|
+
|
|
182
|
+
try {
|
|
183
|
+
const refined = await refineFromIntent({
|
|
184
|
+
priorState,
|
|
185
|
+
intent,
|
|
186
|
+
llmAdapter,
|
|
187
|
+
maxAttempts: 2,
|
|
188
|
+
issueAccumulator: acc,
|
|
189
|
+
});
|
|
190
|
+
row.ms = Date.now() - t0;
|
|
191
|
+
row.ops_count = refined.ops.length;
|
|
192
|
+
row.attempts = refined.synthesis?.attempts ?? 0;
|
|
193
|
+
row.targeted = refined.synthesis?.targeted ?? null;
|
|
194
|
+
|
|
195
|
+
if (refined.ops.length > 0) {
|
|
196
|
+
const applied = await applyOps({ priorState, ops: refined.ops });
|
|
197
|
+
row.ops_applied = applied.ops_applied.length;
|
|
198
|
+
row.ops_failed = applied.ops_failed.length;
|
|
199
|
+
}
|
|
200
|
+
} catch (e) {
|
|
201
|
+
row.ms = Date.now() - t0;
|
|
202
|
+
row.error = e.message;
|
|
203
|
+
}
|
|
204
|
+
|
|
205
|
+
row.auto_fires = acc.reasons();
|
|
206
|
+
autoFires.total += row.auto_fires.length;
|
|
207
|
+
for (const r of row.auto_fires) {
|
|
208
|
+
autoFires.byReason[r] = (autoFires.byReason[r] || 0) + 1;
|
|
209
|
+
}
|
|
210
|
+
|
|
211
|
+
results.push(row);
|
|
212
|
+
|
|
213
|
+
const flag = row.ops_count > 0 && row.ops_failed === 0 ? '✓' : (row.ops_count > 0 ? '~' : '✗');
|
|
214
|
+
const tgtTag = row.targeted === true ? 'tgt' : row.targeted === false ? 'unt' : '???';
|
|
215
|
+
const padMs = row.ms.toString().padStart(5);
|
|
216
|
+
console.log(` ${flag} [${tgtTag}] ${padMs}ms ops=${row.ops_count} att=${row.attempts} ${intent}`);
|
|
217
|
+
if (row.error) console.log(` error: ${row.error}`);
|
|
218
|
+
if (row.ops_failed > 0) console.log(` ops_failed: ${row.ops_failed}`);
|
|
219
|
+
if (row.auto_fires.length) console.log(` auto-fires: ${row.auto_fires.join(', ')}`);
|
|
220
|
+
}
|
|
221
|
+
}
|
|
222
|
+
|
|
223
|
+
// ── Summary ──────────────────────────────────────────────────────────
|
|
224
|
+
|
|
225
|
+
const total = results.length;
|
|
226
|
+
const producedOps = results.filter((r) => r.ops_count > 0).length;
|
|
227
|
+
const totalOpsReturned = results.reduce((s, r) => s + r.ops_count, 0);
|
|
228
|
+
const totalOpsApplied = results.reduce((s, r) => s + r.ops_applied, 0);
|
|
229
|
+
const totalOpsFailed = results.reduce((s, r) => s + r.ops_failed, 0);
|
|
230
|
+
|
|
231
|
+
const opsRate = total ? producedOps / total : 0;
|
|
232
|
+
const validateRate = totalOpsReturned ? totalOpsApplied / totalOpsReturned : 0;
|
|
233
|
+
|
|
234
|
+
console.log(`\n── Summary ──`);
|
|
235
|
+
console.log(` Refinements: ${total}`);
|
|
236
|
+
console.log(` Produced ops: ${producedOps}/${total} (${(opsRate * 100).toFixed(0)}%)`);
|
|
237
|
+
console.log(` Ops returned: ${totalOpsReturned}; applied: ${totalOpsApplied}; failed: ${totalOpsFailed} (validate ${(validateRate * 100).toFixed(0)}%)`);
|
|
238
|
+
console.log(` Auto-fires: ${autoFires.total}`);
|
|
239
|
+
if (autoFires.total > 0) {
|
|
240
|
+
for (const [reason, n] of Object.entries(autoFires.byReason)) {
|
|
241
|
+
console.log(` ${reason}: ${n}`);
|
|
242
|
+
}
|
|
243
|
+
}
|
|
244
|
+
const targeted = results.filter((r) => r.targeted === true).length;
|
|
245
|
+
const untargeted = results.filter((r) => r.targeted === false).length;
|
|
246
|
+
console.log(` Targeted vs untargeted: ${targeted} / ${untargeted}`);
|
|
247
|
+
console.log(` Total time: ${((Date.now() - startedAt) / 1000).toFixed(1)}s`);
|
|
248
|
+
|
|
249
|
+
const opsThreshold = 0.8;
|
|
250
|
+
const validateThreshold = 0.9;
|
|
251
|
+
const autoFireCeiling = 5;
|
|
252
|
+
|
|
253
|
+
const opsPass = opsRate >= opsThreshold;
|
|
254
|
+
const validatePass = totalOpsReturned === 0 || validateRate >= validateThreshold;
|
|
255
|
+
const autoFirePass = autoFires.total <= autoFireCeiling;
|
|
256
|
+
|
|
257
|
+
const allPass = opsPass && validatePass && autoFirePass;
|
|
258
|
+
|
|
259
|
+
console.log('');
|
|
260
|
+
console.log(` ops rate ≥ ${opsThreshold * 100}%: ${opsPass ? '✓' : '✗'} (${(opsRate * 100).toFixed(0)}%)`);
|
|
261
|
+
console.log(` validate ≥ ${validateThreshold * 100}%: ${validatePass ? '✓' : '✗'} (${(validateRate * 100).toFixed(0)}%)`);
|
|
262
|
+
console.log(` auto-fires ≤ ${autoFireCeiling}: ${autoFirePass ? '✓' : '✗'} (${autoFires.total})`);
|
|
263
|
+
|
|
264
|
+
if (allPass) {
|
|
265
|
+
console.log(`\n✓ PASS`);
|
|
266
|
+
process.exit(0);
|
|
267
|
+
} else {
|
|
268
|
+
console.log(`\n✗ FAIL`);
|
|
269
|
+
process.exit(1);
|
|
270
|
+
}
|
|
package/scripts/smoke-issues.mjs
@@ -0,0 +1,266 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
// Smoke test: issue-reporter (write, trace attach, auto-fire, coalesce, evalMode).
|
|
3
|
+
// Spec: docs/specs/genui-multiturn-architecture.md §3.5 + §4.6 + §6.4 + §11.
|
|
4
|
+
|
|
5
|
+
import {
|
|
6
|
+
reportIssue,
|
|
7
|
+
autoReport,
|
|
8
|
+
attachTrace,
|
|
9
|
+
createIssueAccumulator,
|
|
10
|
+
AUTO_FIRE_POLICY,
|
|
11
|
+
} from '../../compose/engines/zettel/issue-reporter.js';
|
|
12
|
+
import { StateCache } from '../../compose/engines/zettel/state-cache.js';
|
|
13
|
+
import { mkdtemp, readFile, rm, stat } from 'node:fs/promises';
|
|
14
|
+
import { tmpdir } from 'node:os';
|
|
15
|
+
import { join } from 'node:path';
|
|
16
|
+
|
|
17
|
+
let pass = 0, fail = 0;
|
|
18
|
+
const t = (label, ok, detail = '') => { // records one smoke check: prints a ✓/✗ line for `label` and bumps the pass/fail counters declared above
|
|
19
|
+
if (ok) { console.log(` ✓ ${label}`); pass++; } // truthy `ok` counts as a pass; `detail` is not printed
|
|
20
|
+
else { console.log(` ✗ ${label} ${detail}`); fail++; } // falsy `ok` counts as a failure; optional `detail` is appended to the line
|
|
21
|
+
};
|
|
22
|
+
|
|
23
|
+
const TMP = await mkdtemp(join(tmpdir(), 'a2ui-issues-'));
|
|
24
|
+
const ctx = {
|
|
25
|
+
storageRoot: TMP,
|
|
26
|
+
versionInfo: { mcp: '0.1.0', corpus: '0.0.6', engine: 'zettel', llm_adapter: 'stub', model: 'test' },
|
|
27
|
+
};
|
|
28
|
+
|
|
29
|
+
console.log(`Storage root: ${TMP}`);
|
|
30
|
+
console.log('\n=== reportIssue: basic write ===');
|
|
31
|
+
|
|
32
|
+
const r1 = await reportIssue({
|
|
33
|
+
type: 'bug',
|
|
34
|
+
severity: 'drift',
|
|
35
|
+
title: 'Test issue with simple title',
|
|
36
|
+
body: 'Test body content.',
|
|
37
|
+
}, { ...ctx, reporter: 'user' });
|
|
38
|
+
|
|
39
|
+
t('returns issue_id', !!r1.issue_id);
|
|
40
|
+
t('issue-id format YYYY-MM-DD-slug-rand4', /^\d{4}-\d{2}-\d{2}-[a-z0-9-]+-[a-f0-9]{4}$/.test(r1.issue_id));
|
|
41
|
+
t('returns ack: logged', r1.ack === 'logged');
|
|
42
|
+
t('returns absolute path', r1.path.startsWith(TMP));
|
|
43
|
+
|
|
44
|
+
const file1 = JSON.parse(await readFile(r1.path, 'utf8'));
|
|
45
|
+
t('written file has expected type', file1.type === 'bug');
|
|
46
|
+
t('written file has expected severity', file1.severity === 'drift');
|
|
47
|
+
t('written file has status open', file1.status === 'open');
|
|
48
|
+
t('written file has reporter.kind', file1.reporter.kind === 'user');
|
|
49
|
+
t('written file has environment', file1.environment?.mcp === '0.1.0');
|
|
50
|
+
t('written file has linked_specs default', Array.isArray(file1.linked_specs) && file1.linked_specs.length > 0);
|
|
51
|
+
t('related_issue_ids defaults to empty array', Array.isArray(file1.related_issue_ids) && file1.related_issue_ids.length === 0);
|
|
52
|
+
t('tags defaults to empty array', Array.isArray(file1.tags));
|
|
53
|
+
t('suggested_owner defaults to "unknown"', file1.suggested_owner === 'unknown');
|
|
54
|
+
|
|
55
|
+
console.log('\n=== validation guards ===');
|
|
56
|
+
|
|
57
|
+
let threw = false;
|
|
58
|
+
try { await reportIssue({ type: 'invalid', severity: 'drift', title: 't', body: 'b' }, ctx); }
|
|
59
|
+
catch (e) { threw = /type must be/.test(e.message); }
|
|
60
|
+
t('rejects invalid type', threw);
|
|
61
|
+
|
|
62
|
+
threw = false;
|
|
63
|
+
try { await reportIssue({ type: 'bug', severity: 'oops', title: 't', body: 'b' }, ctx); }
|
|
64
|
+
catch (e) { threw = /severity must be/.test(e.message); }
|
|
65
|
+
t('rejects invalid severity', threw);
|
|
66
|
+
|
|
67
|
+
threw = false;
|
|
68
|
+
try { await reportIssue({ type: 'bug', severity: 'drift', title: 'x'.repeat(81), body: 'b' }, ctx); }
|
|
69
|
+
catch (e) { threw = /≤ 80 chars/.test(e.message); }
|
|
70
|
+
t('rejects title > 80 chars', threw);
|
|
71
|
+
|
|
72
|
+
threw = false;
|
|
73
|
+
try { await reportIssue({ type: 'bug', severity: 'drift', title: 't', body: 'b', trace: 'partial' }, ctx); }
|
|
74
|
+
catch (e) { threw = /trace must be/.test(e.message); }
|
|
75
|
+
t('rejects invalid trace depth', threw);
|
|
76
|
+
|
|
77
|
+
threw = false;
|
|
78
|
+
try { await reportIssue({ type: 'bug', severity: 'drift', title: 't', body: 'b', tags: 'not-an-array' }, ctx); }
|
|
79
|
+
catch (e) { threw = /tags must be an array/.test(e.message); }
|
|
80
|
+
t('rejects non-array tags', threw);
|
|
81
|
+
|
|
82
|
+
console.log('\n=== state_id trace attachment ===');
|
|
83
|
+
|
|
84
|
+
const cache = new StateCache({ maxSize: 10 });
|
|
85
|
+
cache.set('dash-3f9a-v1-26042817', {
|
|
86
|
+
state_id: 'dash-3f9a-v1-26042817',
|
|
87
|
+
intent: 'admin dashboard',
|
|
88
|
+
tool: 'compose_from_chunks',
|
|
89
|
+
input: { intent: 'admin dashboard' },
|
|
90
|
+
output: { html: '<dashboard/>', plan: { page: 'dashboard-admin-page' } },
|
|
91
|
+
ops_history: [
|
|
92
|
+
{ type: 'createSurface', surfaceId: 'main' },
|
|
93
|
+
{ type: 'updateComponents', surfaceId: 'main', components: [] },
|
|
94
|
+
],
|
|
95
|
+
delta_summary: 'Created admin dashboard',
|
|
96
|
+
warnings: [],
|
|
97
|
+
duration_ms: 1234,
|
|
98
|
+
internal: {
|
|
99
|
+
locator_prompt: 'PROMPT_LOCATOR_v1',
|
|
100
|
+
locator_response: 'RESPONSE_LOCATOR_v1',
|
|
101
|
+
modifier_prompt: 'PROMPT_MODIFIER_v1',
|
|
102
|
+
modifier_response: 'RESPONSE_MODIFIER_v1',
|
|
103
|
+
validator_results: [{ ok: true }],
|
|
104
|
+
retries: 0,
|
|
105
|
+
},
|
|
106
|
+
});
|
|
107
|
+
|
|
108
|
+
const traceSummary = await attachTrace('dash-3f9a-v1-26042817', 'summary', cache);
|
|
109
|
+
t('summary trace populates state_id', traceSummary?.state_id === 'dash-3f9a-v1-26042817');
|
|
110
|
+
t('summary trace populates input', traceSummary?.input?.intent === 'admin dashboard');
|
|
111
|
+
t('summary trace populates output.ops', Array.isArray(traceSummary?.output?.ops));
|
|
112
|
+
t('summary trace omits internal field', traceSummary?.internal === undefined);
|
|
113
|
+
|
|
114
|
+
const traceFull = await attachTrace('dash-3f9a-v1-26042817', 'full', cache);
|
|
115
|
+
t('full trace includes internal.locator_prompt', traceFull?.internal?.locator_prompt === 'PROMPT_LOCATOR_v1');
|
|
116
|
+
t('full trace includes internal.modifier_response', traceFull?.internal?.modifier_response === 'RESPONSE_MODIFIER_v1');
|
|
117
|
+
t('full trace includes validator_results', Array.isArray(traceFull?.internal?.validator_results));
|
|
118
|
+
|
|
119
|
+
const traceMiss = await attachTrace('not-a-real-id', 'summary', cache);
|
|
120
|
+
t('attachTrace returns null on cache miss', traceMiss === null);
|
|
121
|
+
|
|
122
|
+
const traceNoCache = await attachTrace('dash-3f9a-v1-26042817', 'summary', null);
|
|
123
|
+
t('attachTrace returns null when no cache', traceNoCache === null);
|
|
124
|
+
|
|
125
|
+
// reportIssue with state_id integrates trace
|
|
126
|
+
const r2 = await reportIssue({
|
|
127
|
+
type: 'bug',
|
|
128
|
+
severity: 'drift',
|
|
129
|
+
title: 'Issue tied to a state',
|
|
130
|
+
body: 'Reproducible on dashboard generation.',
|
|
131
|
+
state_id: 'dash-3f9a-v1-26042817',
|
|
132
|
+
trace: 'full',
|
|
133
|
+
}, { ...ctx, cache });
|
|
134
|
+
const file2 = JSON.parse(await readFile(r2.path, 'utf8'));
|
|
135
|
+
t('reportIssue with state_id+full attaches trace.input', file2.trace?.input?.intent === 'admin dashboard');
|
|
136
|
+
t('reportIssue with full trace attaches internal', file2.trace?.internal?.locator_prompt === 'PROMPT_LOCATOR_v1');
|
|
137
|
+
|
|
138
|
+
// peek does not touch recency — verify the cache state didn't bump dash- to most-recent.
|
|
139
|
+
// Insert a few more entries, ensure dash- stays the LRU candidate.
|
|
140
|
+
// (This is exercised indirectly: state-cache smoke covers peek-recency directly.)
|
|
141
|
+
|
|
142
|
+
console.log('\n=== oversized trace spills to sidecar ===');
|
|
143
|
+
|
|
144
|
+
cache.set('big-state', {
|
|
145
|
+
state_id: 'big-state',
|
|
146
|
+
intent: 'big',
|
|
147
|
+
tool: 'compose',
|
|
148
|
+
input: {},
|
|
149
|
+
ops_history: [],
|
|
150
|
+
internal: { huge_dump: 'x'.repeat(300 * 1024) },
|
|
151
|
+
});
|
|
152
|
+
const r3 = await reportIssue({
|
|
153
|
+
type: 'bug',
|
|
154
|
+
severity: 'drift',
|
|
155
|
+
title: 'Big trace test',
|
|
156
|
+
body: 'Has an oversized trace.',
|
|
157
|
+
state_id: 'big-state',
|
|
158
|
+
trace: 'full',
|
|
159
|
+
}, { ...ctx, cache });
|
|
160
|
+
const file3 = JSON.parse(await readFile(r3.path, 'utf8'));
|
|
161
|
+
t('oversized trace replaced by sidecar pointer', !!file3.trace?.sidecar);
|
|
162
|
+
t('sidecar path has expected shape', /^traces\/.+\.trace\.json$/.test(file3.trace.sidecar));
|
|
163
|
+
const sidecar = await stat(join(TMP, file3.trace.sidecar));
|
|
164
|
+
t('sidecar file exists with non-zero size', sidecar.size > 0);
|
|
165
|
+
|
|
166
|
+
console.log('\n=== autoReport: policy lookup ===');
|
|
167
|
+
|
|
168
|
+
const a1 = await autoReport('validator-exhausted', { tool: 'refine_composition' }, ctx);
|
|
169
|
+
const file_a1 = JSON.parse(await readFile(a1.path, 'utf8'));
|
|
170
|
+
t('validator-exhausted: type=bug', file_a1.type === 'bug');
|
|
171
|
+
t('validator-exhausted: severity=blocker', file_a1.severity === 'blocker');
|
|
172
|
+
t('validator-exhausted: suggested_owner=validator', file_a1.suggested_owner === 'validator');
|
|
173
|
+
t('validator-exhausted: reporter.kind=auto', file_a1.reporter.kind === 'auto');
|
|
174
|
+
t('validator-exhausted: reporter.context=validator-exhausted', file_a1.reporter.context === 'validator-exhausted');
|
|
175
|
+
t('validator-exhausted: tags include "auto-fire"', file_a1.tags.includes('auto-fire'));
|
|
176
|
+
|
|
177
|
+
const a2 = await autoReport('retrieval-zero-then-synthesis-fail', { intent: 'pricing page' }, ctx);
|
|
178
|
+
const file_a2 = JSON.parse(await readFile(a2.path, 'utf8'));
|
|
179
|
+
t('retrieval-zero-then-synthesis-fail: type=training-gap', file_a2.type === 'training-gap');
|
|
180
|
+
t('retrieval-zero-then-synthesis-fail: suggested_owner=chunk-corpus', file_a2.suggested_owner === 'chunk-corpus');
|
|
181
|
+
t('retrieval-zero-then-synthesis-fail: title carries intent', /pricing page/.test(file_a2.title));
|
|
182
|
+
|
|
183
|
+
const a3 = await autoReport('cache-miss-on-known-state', { state_id: 'gone-12ab-v1-0' }, ctx);
|
|
184
|
+
const file_a3 = JSON.parse(await readFile(a3.path, 'utf8'));
|
|
185
|
+
t('cache-miss-on-known-state: severity=nit', file_a3.severity === 'nit');
|
|
186
|
+
|
|
187
|
+
threw = false;
|
|
188
|
+
try { await autoReport('unknown-reason', {}, ctx); }
|
|
189
|
+
catch (e) { threw = /unknown reason/.test(e.message); }
|
|
190
|
+
t('autoReport rejects unknown reason', threw);
|
|
191
|
+
|
|
192
|
+
console.log('\n=== evalMode suppresses auto-fire ===');
|
|
193
|
+
|
|
194
|
+
const evalCtx = { ...ctx, evalMode: true };
|
|
195
|
+
const aSuppressed = await autoReport('validator-exhausted', { tool: 'refine_composition' }, evalCtx);
|
|
196
|
+
t('autoReport returns null when evalMode=true', aSuppressed === null);
|
|
197
|
+
|
|
198
|
+
// Manual reportIssue still writes during evalMode (eval-suppression is auto-fire only)
|
|
199
|
+
const aManual = await reportIssue({
|
|
200
|
+
type: 'bug',
|
|
201
|
+
severity: 'blocker',
|
|
202
|
+
title: 'Manual call during evalMode',
|
|
203
|
+
body: 'should still write',
|
|
204
|
+
}, evalCtx);
|
|
205
|
+
t('manual reportIssue ignores evalMode', !!aManual.issue_id);
|
|
206
|
+
const file_manual = JSON.parse(await readFile(aManual.path, 'utf8'));
|
|
207
|
+
t('manual call during evalMode writes file', file_manual.title === 'Manual call during evalMode');
|
|
208
|
+
|
|
209
|
+
console.log('\n=== coalescing accumulator ===');
|
|
210
|
+
|
|
211
|
+
const acc = createIssueAccumulator();
|
|
212
|
+
t('empty accumulator size 0', acc.size() === 0);
|
|
213
|
+
const flushEmpty = await acc.flush(ctx);
|
|
214
|
+
t('empty accumulator flush returns null', flushEmpty === null);
|
|
215
|
+
|
|
216
|
+
acc.add('locator-empty-targets', { intent: 'change title' });
|
|
217
|
+
t('single-entry accumulator size 1', acc.size() === 1);
|
|
218
|
+
const flushOne = await acc.flush(ctx);
|
|
219
|
+
const file_flush_one = JSON.parse(await readFile(flushOne.path, 'utf8'));
|
|
220
|
+
t('single-entry flush writes normal auto-issue', file_flush_one.reporter.kind === 'auto' && file_flush_one.reporter.context === 'locator-empty-targets');
|
|
221
|
+
t('single-entry flush resets accumulator', acc.size() === 0);
|
|
222
|
+
|
|
223
|
+
acc.add('locator-empty-targets', { intent: 'change title' });
|
|
224
|
+
acc.add('validator-exhausted', { tool: 'refine_composition' });
|
|
225
|
+
acc.add('ops-failed-after-apply', {});
|
|
226
|
+
t('three-entry accumulator size 3', acc.size() === 3);
|
|
227
|
+
const flushThree = await acc.flush(ctx);
|
|
228
|
+
const file_flush_three = JSON.parse(await readFile(flushThree.path, 'utf8'));
|
|
229
|
+
t('coalesced issue: severity=blocker (highest of three)', file_flush_three.severity === 'blocker');
|
|
230
|
+
t('coalesced issue: type=bug', file_flush_three.type === 'bug');
|
|
231
|
+
t('coalesced issue: reporter.context=coalesced', file_flush_three.reporter.context === 'coalesced');
|
|
232
|
+
t('coalesced issue: tags include "coalesced"', file_flush_three.tags.includes('coalesced'));
|
|
233
|
+
t('coalesced issue: tags include all reasons',
|
|
234
|
+
['locator-empty-targets', 'validator-exhausted', 'ops-failed-after-apply'].every((r) => file_flush_three.tags.includes(r))
|
|
235
|
+
);
|
|
236
|
+
t('coalesced issue: body lists every reason',
|
|
237
|
+
file_flush_three.body.includes('locator-empty-targets') &&
|
|
238
|
+
file_flush_three.body.includes('validator-exhausted') &&
|
|
239
|
+
file_flush_three.body.includes('ops-failed-after-apply')
|
|
240
|
+
);
|
|
241
|
+
t('coalesced flush resets accumulator', acc.size() === 0);
|
|
242
|
+
|
|
243
|
+
// evalMode + coalesce → no write
|
|
244
|
+
const accEval = createIssueAccumulator();
|
|
245
|
+
accEval.add('validator-exhausted', {});
|
|
246
|
+
accEval.add('ops-failed-after-apply', {});
|
|
247
|
+
const flushEval = await accEval.flush({ ...ctx, evalMode: true });
|
|
248
|
+
t('coalesce flush returns null when evalMode=true', flushEval === null);
|
|
249
|
+
|
|
250
|
+
threw = false;
|
|
251
|
+
try { acc.add('not-a-real-reason'); }
|
|
252
|
+
catch (e) { threw = /unknown reason/.test(e.message); }
|
|
253
|
+
t('accumulator.add rejects unknown reason', threw);
|
|
254
|
+
|
|
255
|
+
console.log('\n=== AUTO_FIRE_POLICY exported ===');
|
|
256
|
+
|
|
257
|
+
t('AUTO_FIRE_POLICY exports expected reasons',
|
|
258
|
+
['synthesizer-exhausted', 'validator-exhausted', 'locator-empty-targets',
|
|
259
|
+
'retrieval-zero-then-synthesis-fail', 'cache-miss-on-known-state',
|
|
260
|
+
'ops-failed-after-apply'].every((r) => AUTO_FIRE_POLICY[r])
|
|
261
|
+
);
|
|
262
|
+
|
|
263
|
+
await rm(TMP, { recursive: true, force: true });
|
|
264
|
+
|
|
265
|
+
console.log(`\n${pass} passed, ${fail} failed`);
|
|
266
|
+
process.exit(fail ? 1 : 0);
|