@hover-dev/core 0.9.0 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -2
- package/dist/agents/aider.d.ts +16 -0
- package/dist/agents/aider.d.ts.map +1 -0
- package/dist/agents/aider.js +169 -0
- package/dist/agents/gemini.d.ts +17 -0
- package/dist/agents/gemini.d.ts.map +1 -0
- package/dist/agents/gemini.js +197 -0
- package/dist/agents/qwen.d.ts +17 -0
- package/dist/agents/qwen.d.ts.map +1 -0
- package/dist/agents/qwen.js +183 -0
- package/dist/agents/registry.d.ts +7 -5
- package/dist/agents/registry.d.ts.map +1 -1
- package/dist/agents/registry.js +13 -5
- package/dist/scripts/bench-multi-tab.d.ts +2 -0
- package/dist/scripts/bench-multi-tab.d.ts.map +1 -0
- package/dist/scripts/bench-multi-tab.js +192 -0
- package/dist/service/cdpHint.d.ts.map +1 -1
- package/dist/service/cdpHint.js +39 -0
- package/dist/service.d.ts.map +1 -1
- package/dist/service.js +92 -1
- package/dist/specs/listSpecs.d.ts +40 -0
- package/dist/specs/listSpecs.d.ts.map +1 -0
- package/dist/specs/listSpecs.js +114 -0
- package/package.json +2 -1
package/README.md
CHANGED
|
@@ -17,7 +17,9 @@ The local Node service. Owns:
|
|
|
17
17
|
| `detect.ts` | `detectAgents()`, `resolveBinForAgent()`, `resolveOnPath()` — PATH scanning |
|
|
18
18
|
| `argv.ts` | `buildArgv()` — protocol-aware argv construction, throws `UnsupportedAgentProtocolError` for `acp` / `pi-rpc` |
|
|
19
19
|
| `invoke.ts` | `invokeAgent()` — async-iterable spawning + stdout streaming |
|
|
20
|
-
| `claude.ts` | Claude Code descriptor: `claude -p`, stream-json parser, sandbox flags |
|
|
20
|
+
| `claude.ts` | Claude Code descriptor: `claude -p`, stream-json parser, hard sandbox flags |
|
|
21
|
+
| `codex.ts` | OpenAI Codex CLI descriptor: `codex exec --json`, JSONL parser, soft sandbox (`--sandbox read-only`) |
|
|
22
|
+
| `cursor.ts` | Cursor CLI descriptor (v0.9): stream-JSON / NDJSON parser, soft sandbox |
|
|
21
23
|
|
|
22
24
|
To add an agent: implement an `AgentDescriptor`, register it in `registry.ts`. Done.
|
|
23
25
|
|
|
@@ -43,7 +45,7 @@ pnpm smoke http://localhost:5173/ "log in then add a todo named 'verify hover'"
|
|
|
43
45
|
Environment variables:
|
|
44
46
|
|
|
45
47
|
- `HOVER_CDP` — CDP URL (default `http://localhost:9222`)
|
|
46
|
-
- `HOVER_AGENT` — agent id (omit to auto-detect; tries the user's stated preference, then the first installed agent in registry order — `claude` → `codex` today)
|
|
48
|
+
- `HOVER_AGENT` — agent id (omit to auto-detect; tries the user's stated preference, then the first installed agent in registry order — `claude` → `codex` → `cursor-agent` → `aider` → `gemini` → `qwen` today)
|
|
47
49
|
- `HOVER_MODEL` — model for the agent (default `sonnet`, much cheaper than opus)
|
|
48
50
|
|
|
49
51
|
## Sandboxing (what the smoke test enforces)
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import type { AgentDescriptor, ParserState } from './types.js';
|
|
2
|
+
export declare const aiderAgent: AgentDescriptor;
|
|
3
|
+
/**
|
|
4
|
+
* Test-only escape hatches, same pattern as cursor.ts / codex.ts.
|
|
5
|
+
*/
|
|
6
|
+
export declare const __testing: {
|
|
7
|
+
freshState: () => ParserState;
|
|
8
|
+
resetCounters: (state: ParserState) => void;
|
|
9
|
+
getState: (state: ParserState) => {
|
|
10
|
+
runningLines: number;
|
|
11
|
+
runningSessionId: string | undefined;
|
|
12
|
+
collectedText: string[];
|
|
13
|
+
sawErrorEvent: boolean;
|
|
14
|
+
};
|
|
15
|
+
};
|
|
16
|
+
//# sourceMappingURL=aider.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"aider.d.ts","sourceRoot":"","sources":["../../src/agents/aider.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,eAAe,EAA8B,WAAW,EAAE,MAAM,YAAY,CAAC;AAsH3F,eAAO,MAAM,UAAU,EAAE,eA0GxB,CAAC;AAEF;;GAEG;AACH,eAAO,MAAM,SAAS;sBACJ,WAAW;2BACJ,WAAW;sBAChB,WAAW;;;;;;CAS9B,CAAC"}
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
function aiderState(state) {
|
|
2
|
+
if (typeof state.runningLines !== 'number') {
|
|
3
|
+
state.runningLines = 0;
|
|
4
|
+
state.collectedText = [];
|
|
5
|
+
state.sawErrorEvent = false;
|
|
6
|
+
state.runningSessionId = undefined;
|
|
7
|
+
}
|
|
8
|
+
return state;
|
|
9
|
+
}
|
|
10
|
+
function resetAiderCounters(s) {
|
|
11
|
+
s.runningLines = 0;
|
|
12
|
+
s.collectedText = [];
|
|
13
|
+
s.sawErrorEvent = false;
|
|
14
|
+
s.runningSessionId = undefined;
|
|
15
|
+
}
|
|
16
|
+
/**
|
|
17
|
+
* Aider has no system-prompt flag, so we prepend this preface to the user
|
|
18
|
+
* prompt (same approach as cursor.ts). The agent treats it as the leading
|
|
19
|
+
* user-message text.
|
|
20
|
+
*/
|
|
21
|
+
const AIDER_PROMPT_PREFACE = [
|
|
22
|
+
'You are operating in Hover, a browser-testing tool.',
|
|
23
|
+
'Use ONLY the MCP playwright tools (prefixed `mcp__playwright__` / `mcp__hover-playwright__`) to drive the browser.',
|
|
24
|
+
'Do NOT use shell, file-edit, web-search, or any other built-in tool.',
|
|
25
|
+
'Do NOT navigate to a URL the user is already on; check the page state via `browser_snapshot` first.',
|
|
26
|
+
'When the task is complete, emit a short summary and stop.',
|
|
27
|
+
].join(' ');
|
|
28
|
+
/**
|
|
29
|
+
* Lines we treat as noise and drop instead of surfacing as text events.
|
|
30
|
+
* Aider chatters with status lines that would clutter the widget panel.
|
|
31
|
+
* Conservative list — anything we don't explicitly skip falls through.
|
|
32
|
+
*/
|
|
33
|
+
function isNoiseLine(line) {
|
|
34
|
+
const t = line.trim();
|
|
35
|
+
if (!t)
|
|
36
|
+
return true;
|
|
37
|
+
// Common aider boilerplate / banner lines.
|
|
38
|
+
if (/^Aider v\d/i.test(t))
|
|
39
|
+
return true;
|
|
40
|
+
if (/^Main model:/i.test(t))
|
|
41
|
+
return true;
|
|
42
|
+
if (/^Weak model:/i.test(t))
|
|
43
|
+
return true;
|
|
44
|
+
if (/^Git repo:/i.test(t))
|
|
45
|
+
return true;
|
|
46
|
+
if (/^Repo-map:/i.test(t))
|
|
47
|
+
return true;
|
|
48
|
+
if (/^VSCode terminal detected/i.test(t))
|
|
49
|
+
return true;
|
|
50
|
+
if (/^Use \/help/i.test(t))
|
|
51
|
+
return true;
|
|
52
|
+
if (/^Tokens:.*sent.*received/i.test(t))
|
|
53
|
+
return true;
|
|
54
|
+
if (/^─{3,}$/.test(t))
|
|
55
|
+
return true; // horizontal rule
|
|
56
|
+
return false;
|
|
57
|
+
}
|
|
58
|
+
function detectErrorLine(line) {
|
|
59
|
+
// Aider prints errors / API failures with a leading marker.
|
|
60
|
+
return /^(error|fatal|api error|litellm.*error)/i.test(line.trim());
|
|
61
|
+
}
|
|
62
|
+
export const aiderAgent = {
|
|
63
|
+
id: 'aider',
|
|
64
|
+
binName: 'aider',
|
|
65
|
+
protocol: 'argv',
|
|
66
|
+
streamFormat: 'plain-text',
|
|
67
|
+
sandboxStrength: 'soft',
|
|
68
|
+
display: {
|
|
69
|
+
label: 'Aider',
|
|
70
|
+
tagline: 'Aider — soft sandbox, plain-text stream, no MCP support',
|
|
71
|
+
homepage: 'https://aider.chat',
|
|
72
|
+
installHint: 'pipx install aider-chat',
|
|
73
|
+
},
|
|
74
|
+
buildArgs(opts) {
|
|
75
|
+
// Prepend HOVER-mode preface plus any caller-supplied appendSystemPrompt
|
|
76
|
+
// to the prompt. Aider has no --append-system-prompt flag, so this is
|
|
77
|
+
// the closest functional analogue (same trick as cursor.ts).
|
|
78
|
+
const preface = opts.appendSystemPrompt && opts.appendSystemPrompt.trim().length > 0
|
|
79
|
+
? `${AIDER_PROMPT_PREFACE} ${opts.appendSystemPrompt}`
|
|
80
|
+
: AIDER_PROMPT_PREFACE;
|
|
81
|
+
const finalPrompt = `${preface}\n\n${opts.prompt}`;
|
|
82
|
+
const args = ['--message', finalPrompt];
|
|
83
|
+
// Auto-confirm every prompt so the run doesn't hang.
|
|
84
|
+
args.push('--yes-always');
|
|
85
|
+
// Make stdout line-buffered instead of character-streamed; friendlier
|
|
86
|
+
// for our per-line parseEvent loop.
|
|
87
|
+
args.push('--no-stream');
|
|
88
|
+
// Defang git side-effects. Aider's default behaviour is to auto-commit
|
|
89
|
+
// every edit it makes; for a browser-driving agent that should never
|
|
90
|
+
// edit files this is still a hazard if cwd is a stale repo.
|
|
91
|
+
args.push('--no-auto-commits');
|
|
92
|
+
args.push('--no-git');
|
|
93
|
+
if (opts.model) {
|
|
94
|
+
args.push('--model', opts.model);
|
|
95
|
+
}
|
|
96
|
+
// Aider's `--restore-chat-history` is a boolean (no session-id form);
|
|
97
|
+
// we deliberately do NOT pass it. `opts.sessionId` is ignored because
|
|
98
|
+
// there is no way to select a specific past session by ID.
|
|
99
|
+
// No equivalents for --max-budget-usd / --allowedTools / --mcp-config /
|
|
100
|
+
// --append-system-prompt — all four are absent from aider.
|
|
101
|
+
return args;
|
|
102
|
+
},
|
|
103
|
+
parseEvent(line, state = {}) {
|
|
104
|
+
const s = aiderState(state);
|
|
105
|
+
const out = [];
|
|
106
|
+
// Emit a synthetic session_start on the first line this parser receives (the check runs before noise filtering) so
|
|
107
|
+
// the widget gets the same shape it expects from JSON-based agents.
|
|
108
|
+
if (!s.runningSessionId) {
|
|
109
|
+
// Cheap unique id; aider has no real session_id we can echo. The
|
|
110
|
+
// Math.random() suffix is load-bearing — two states created in the
|
|
111
|
+
// same millisecond would otherwise collide and break per-invocation
|
|
112
|
+
// session tracking.
|
|
113
|
+
const rand = Math.random().toString(36).slice(2, 8);
|
|
114
|
+
s.runningSessionId = `aider-${Date.now().toString(36)}-${rand}`;
|
|
115
|
+
out.push({ kind: 'session_start', sessionId: s.runningSessionId });
|
|
116
|
+
}
|
|
117
|
+
if (isNoiseLine(line))
|
|
118
|
+
return out;
|
|
119
|
+
if (detectErrorLine(line)) {
|
|
120
|
+
s.sawErrorEvent = true;
|
|
121
|
+
out.push({ kind: 'text', text: line.trim() });
|
|
122
|
+
return out;
|
|
123
|
+
}
|
|
124
|
+
// Treat everything else as assistant text. Aider has no per-tool events
|
|
125
|
+
// so we cannot emit tool_use / tool_result — see file-header doc comment.
|
|
126
|
+
s.runningLines += 1;
|
|
127
|
+
s.collectedText.push(line.trim());
|
|
128
|
+
out.push({ kind: 'text', text: line.trim() });
|
|
129
|
+
return out;
|
|
130
|
+
},
|
|
131
|
+
/**
|
|
132
|
+
* Aider doesn't emit a terminal event — the child process simply exits
|
|
133
|
+
* after the final printed line. Synthesize session_end from accumulated
|
|
134
|
+
* state, same pattern as codex.ts.
|
|
135
|
+
*/
|
|
136
|
+
onStreamEnd(exitCode, state = {}) {
|
|
137
|
+
const s = aiderState(state);
|
|
138
|
+
// The "summary" is the last non-empty text line, which is typically
|
|
139
|
+
// aider's final answer. If we collected nothing, leave it undefined
|
|
140
|
+
// rather than fabricating.
|
|
141
|
+
const lastText = s.collectedText.length > 0
|
|
142
|
+
? s.collectedText[s.collectedText.length - 1]
|
|
143
|
+
: undefined;
|
|
144
|
+
return {
|
|
145
|
+
kind: 'session_end',
|
|
146
|
+
turns: s.runningLines,
|
|
147
|
+
// costUsd intentionally undefined — aider's "Tokens:" status line is
|
|
148
|
+
// ad-hoc text, not a stable API. We don't fabricate a number.
|
|
149
|
+
isError: s.sawErrorEvent || (exitCode != null && exitCode !== 0),
|
|
150
|
+
summary: lastText,
|
|
151
|
+
};
|
|
152
|
+
},
|
|
153
|
+
};
|
|
154
|
+
/**
|
|
155
|
+
* Test-only escape hatches, same pattern as cursor.ts / codex.ts.
|
|
156
|
+
*/
|
|
157
|
+
export const __testing = {
|
|
158
|
+
freshState: () => ({}),
|
|
159
|
+
resetCounters: (state) => resetAiderCounters(aiderState(state)),
|
|
160
|
+
getState: (state) => {
|
|
161
|
+
const s = aiderState(state);
|
|
162
|
+
return {
|
|
163
|
+
runningLines: s.runningLines,
|
|
164
|
+
runningSessionId: s.runningSessionId,
|
|
165
|
+
collectedText: [...s.collectedText],
|
|
166
|
+
sawErrorEvent: s.sawErrorEvent,
|
|
167
|
+
};
|
|
168
|
+
},
|
|
169
|
+
};
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import type { AgentDescriptor, ParserState } from './types.js';
|
|
2
|
+
export declare const geminiAgent: AgentDescriptor;
|
|
3
|
+
/**
|
|
4
|
+
* Test-only escape hatches, same pattern as cursor.ts / codex.ts.
|
|
5
|
+
*/
|
|
6
|
+
export declare const __testing: {
|
|
7
|
+
freshState: () => ParserState;
|
|
8
|
+
resetCounters: (state: ParserState) => void;
|
|
9
|
+
getState: (state: ParserState) => {
|
|
10
|
+
runningTurns: number;
|
|
11
|
+
runningSessionId: string | undefined;
|
|
12
|
+
runningModel: string | undefined;
|
|
13
|
+
lastAssistantText: string | undefined;
|
|
14
|
+
sawErrorEvent: boolean;
|
|
15
|
+
};
|
|
16
|
+
};
|
|
17
|
+
//# sourceMappingURL=gemini.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"gemini.d.ts","sourceRoot":"","sources":["../../src/agents/gemini.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,eAAe,EAA8B,WAAW,EAAE,MAAM,YAAY,CAAC;AA4K3F,eAAO,MAAM,WAAW,EAAE,eAiJzB,CAAC;AAEF;;GAEG;AACH,eAAO,MAAM,SAAS;sBACJ,WAAW;2BACJ,WAAW;sBAChB,WAAW;;;;;;;CAU9B,CAAC"}
|
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
function geminiState(state) {
|
|
2
|
+
if (typeof state.runningTurns !== 'number') {
|
|
3
|
+
state.runningTurns = 0;
|
|
4
|
+
state.runningSessionId = undefined;
|
|
5
|
+
state.runningModel = undefined;
|
|
6
|
+
state.lastAssistantText = undefined;
|
|
7
|
+
state.sawErrorEvent = false;
|
|
8
|
+
state.toolNameByUseId = new Map();
|
|
9
|
+
}
|
|
10
|
+
return state;
|
|
11
|
+
}
|
|
12
|
+
function resetGeminiCounters(s) {
|
|
13
|
+
s.runningTurns = 0;
|
|
14
|
+
s.runningSessionId = undefined;
|
|
15
|
+
s.runningModel = undefined;
|
|
16
|
+
s.lastAssistantText = undefined;
|
|
17
|
+
s.sawErrorEvent = false;
|
|
18
|
+
s.toolNameByUseId.clear();
|
|
19
|
+
}
|
|
20
|
+
/** Strip the `mcp__playwright__` / `mcp__hover-playwright__` prefix so tool
|
|
21
|
+
* names match the normalised names claude / codex / cursor / qwen emit. */
|
|
22
|
+
function stripMcpPrefix(raw) {
|
|
23
|
+
return raw.replace(/^mcp__playwright__/, '').replace(/^mcp__hover-playwright__/, '');
|
|
24
|
+
}
|
|
25
|
+
/**
|
|
26
|
+
* Extract assistant text from a `message` event whose `content` may be a
|
|
27
|
+
* plain string OR an array of `{type:'text', text}` content blocks. Gemini's
|
|
28
|
+
* docs aren't explicit on which shape ships per build, so we handle both.
|
|
29
|
+
*/
|
|
30
|
+
function extractMessageText(ev) {
|
|
31
|
+
if (typeof ev.content === 'string') {
|
|
32
|
+
const t = ev.content.trim();
|
|
33
|
+
return t.length > 0 ? t : undefined;
|
|
34
|
+
}
|
|
35
|
+
if (Array.isArray(ev.content)) {
|
|
36
|
+
const parts = ev.content
|
|
37
|
+
.filter(b => b.type === 'text' && typeof b.text === 'string')
|
|
38
|
+
.map(b => b.text.trim())
|
|
39
|
+
.filter(t => t.length > 0);
|
|
40
|
+
return parts.length > 0 ? parts.join('\n') : undefined;
|
|
41
|
+
}
|
|
42
|
+
return undefined;
|
|
43
|
+
}
|
|
44
|
+
const GEMINI_PROMPT_PREFACE = [
|
|
45
|
+
'You are operating in Hover, a browser-testing tool.',
|
|
46
|
+
'Use ONLY the MCP playwright tools (prefixed `mcp__playwright__` / `mcp__hover-playwright__`) to drive the browser.',
|
|
47
|
+
'Do NOT use shell, file-edit, web-search, or any other built-in tool.',
|
|
48
|
+
'Do NOT navigate to a URL the user is already on; check the page state via `browser_snapshot` first.',
|
|
49
|
+
'When the task is complete, emit a short summary and stop.',
|
|
50
|
+
].join(' ');
|
|
51
|
+
export const geminiAgent = {
|
|
52
|
+
id: 'gemini',
|
|
53
|
+
binName: 'gemini',
|
|
54
|
+
protocol: 'argv',
|
|
55
|
+
streamFormat: 'json-lines',
|
|
56
|
+
sandboxStrength: 'soft',
|
|
57
|
+
display: {
|
|
58
|
+
label: 'Gemini',
|
|
59
|
+
tagline: 'Google Gemini — soft sandbox (no built-in tool deny-list)',
|
|
60
|
+
homepage: 'https://github.com/google-gemini/gemini-cli',
|
|
61
|
+
installHint: 'npm install -g @google/gemini-cli',
|
|
62
|
+
},
|
|
63
|
+
buildArgs(opts) {
|
|
64
|
+
// Gemini has no --append-system-prompt CLI flag (only the
|
|
65
|
+
// GEMINI_SYSTEM_MD env var, which loads the prompt from a file). Prepend the HOVER-mode
|
|
66
|
+
// preface to the prompt instead — same pattern as cursor.ts / aider.ts.
|
|
67
|
+
const preface = opts.appendSystemPrompt && opts.appendSystemPrompt.trim().length > 0
|
|
68
|
+
? `${GEMINI_PROMPT_PREFACE} ${opts.appendSystemPrompt}`
|
|
69
|
+
: GEMINI_PROMPT_PREFACE;
|
|
70
|
+
const finalPrompt = `${preface}\n\n${opts.prompt}`;
|
|
71
|
+
const args = ['-p', finalPrompt];
|
|
72
|
+
// NDJSON streaming output.
|
|
73
|
+
args.push('--output-format', 'stream-json');
|
|
74
|
+
// Auto-approve all tool calls so the run doesn't hang. The newer
|
|
75
|
+
// canonical form is --approval-mode=yolo; --yolo is deprecated but
|
|
76
|
+
// still accepted as of 2026-05. We use the modern form.
|
|
77
|
+
args.push('--approval-mode', 'yolo');
|
|
78
|
+
if (opts.model) {
|
|
79
|
+
args.push('--model', opts.model);
|
|
80
|
+
}
|
|
81
|
+
if (opts.sessionId) {
|
|
82
|
+
// --resume <id> is the documented form. -r is the alias. The single
|
|
83
|
+
// string 'latest' picks the most recent session; we only pass an
|
|
84
|
+
// explicit id, never the literal 'latest'.
|
|
85
|
+
args.push('--resume', opts.sessionId);
|
|
86
|
+
}
|
|
87
|
+
// MCP servers configured via `gemini mcp add` at install time — no
|
|
88
|
+
// per-invocation --mcp-config equivalent.
|
|
89
|
+
// No equivalent for --max-budget-usd or --allowedTools / --disallowedTools
|
|
90
|
+
// in the disable-built-ins sense.
|
|
91
|
+
return args;
|
|
92
|
+
},
|
|
93
|
+
parseEvent(line, state = {}) {
|
|
94
|
+
if (!line.trim())
|
|
95
|
+
return [];
|
|
96
|
+
let ev;
|
|
97
|
+
try {
|
|
98
|
+
ev = JSON.parse(line);
|
|
99
|
+
}
|
|
100
|
+
catch {
|
|
101
|
+
return [{ kind: 'raw', line }];
|
|
102
|
+
}
|
|
103
|
+
const s = geminiState(state);
|
|
104
|
+
const out = [];
|
|
105
|
+
if (ev.type === 'init') {
|
|
106
|
+
resetGeminiCounters(s);
|
|
107
|
+
s.runningModel = ev.model;
|
|
108
|
+
if (ev.session_id) {
|
|
109
|
+
s.runningSessionId = ev.session_id;
|
|
110
|
+
out.push({ kind: 'session_start', sessionId: ev.session_id, model: ev.model });
|
|
111
|
+
}
|
|
112
|
+
return out;
|
|
113
|
+
}
|
|
114
|
+
if (ev.type === 'message') {
|
|
115
|
+
// Only count and surface assistant messages — user echoes (the
|
|
116
|
+
// role:'user' message events) don't count as turns from our POV.
|
|
117
|
+
if (ev.role === 'assistant' || ev.role === undefined) {
|
|
118
|
+
s.runningTurns += 1;
|
|
119
|
+
out.push({ kind: 'usage', turns: s.runningTurns });
|
|
120
|
+
const text = extractMessageText(ev);
|
|
121
|
+
if (text) {
|
|
122
|
+
s.lastAssistantText = text;
|
|
123
|
+
out.push({ kind: 'text', text });
|
|
124
|
+
}
|
|
125
|
+
}
|
|
126
|
+
return out;
|
|
127
|
+
}
|
|
128
|
+
if (ev.type === 'tool_use') {
|
|
129
|
+
const rawName = ev.name ?? '';
|
|
130
|
+
const tool = stripMcpPrefix(rawName);
|
|
131
|
+
if (ev.id)
|
|
132
|
+
s.toolNameByUseId.set(ev.id, tool);
|
|
133
|
+
out.push({ kind: 'tool_use', tool, input: ev.input });
|
|
134
|
+
return out;
|
|
135
|
+
}
|
|
136
|
+
if (ev.type === 'tool_result') {
|
|
137
|
+
const isError = ev.is_error === true;
|
|
138
|
+
out.push({ kind: 'tool_result', isError });
|
|
139
|
+
return out;
|
|
140
|
+
}
|
|
141
|
+
if (ev.type === 'result') {
|
|
142
|
+
// result.stats may carry turns; prefer it over our running count.
|
|
143
|
+
const turns = ev.stats?.turns ?? ev.stats?.model?.turns ?? s.runningTurns;
|
|
144
|
+
const isError = ev.is_error === true || ev.error !== undefined && ev.error !== null;
|
|
145
|
+
if (isError)
|
|
146
|
+
s.sawErrorEvent = true;
|
|
147
|
+
out.push({
|
|
148
|
+
kind: 'session_end',
|
|
149
|
+
turns,
|
|
150
|
+
// costUsd intentionally undefined — gemini's stats block does not
|
|
151
|
+
// include $ figures.
|
|
152
|
+
isError,
|
|
153
|
+
summary: ev.response ?? s.lastAssistantText,
|
|
154
|
+
});
|
|
155
|
+
return out;
|
|
156
|
+
}
|
|
157
|
+
if (ev.type === 'error') {
|
|
158
|
+
s.sawErrorEvent = true;
|
|
159
|
+
const msg = ev.message ?? ev.error?.message ?? `[gemini] error`;
|
|
160
|
+
out.push({ kind: 'text', text: msg });
|
|
161
|
+
return out;
|
|
162
|
+
}
|
|
163
|
+
return [];
|
|
164
|
+
},
|
|
165
|
+
/**
|
|
166
|
+
* Gemini's `result` event already produces a session_end via parseEvent;
|
|
167
|
+
* this fallback is for the case where the child exits without emitting a
|
|
168
|
+
* `result` (e.g. crash, signal). Same shape as cursor.ts / qwen.ts.
|
|
169
|
+
*/
|
|
170
|
+
onStreamEnd(exitCode, state = {}) {
|
|
171
|
+
const s = geminiState(state);
|
|
172
|
+
return {
|
|
173
|
+
kind: 'session_end',
|
|
174
|
+
turns: s.runningTurns,
|
|
175
|
+
// costUsd intentionally undefined — see parseEvent note.
|
|
176
|
+
isError: s.sawErrorEvent || (exitCode != null && exitCode !== 0),
|
|
177
|
+
summary: s.lastAssistantText,
|
|
178
|
+
};
|
|
179
|
+
},
|
|
180
|
+
};
|
|
181
|
+
/**
|
|
182
|
+
* Test-only escape hatches, same pattern as cursor.ts / codex.ts.
|
|
183
|
+
*/
|
|
184
|
+
export const __testing = {
|
|
185
|
+
freshState: () => ({}),
|
|
186
|
+
resetCounters: (state) => resetGeminiCounters(geminiState(state)),
|
|
187
|
+
getState: (state) => {
|
|
188
|
+
const s = geminiState(state);
|
|
189
|
+
return {
|
|
190
|
+
runningTurns: s.runningTurns,
|
|
191
|
+
runningSessionId: s.runningSessionId,
|
|
192
|
+
runningModel: s.runningModel,
|
|
193
|
+
lastAssistantText: s.lastAssistantText,
|
|
194
|
+
sawErrorEvent: s.sawErrorEvent,
|
|
195
|
+
};
|
|
196
|
+
},
|
|
197
|
+
};
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import type { AgentDescriptor, ParserState } from './types.js';
|
|
2
|
+
export declare const qwenAgent: AgentDescriptor;
|
|
3
|
+
/**
|
|
4
|
+
* Test-only escape hatches, same pattern as cursor.ts / codex.ts.
|
|
5
|
+
*/
|
|
6
|
+
export declare const __testing: {
|
|
7
|
+
freshState: () => ParserState;
|
|
8
|
+
resetCounters: (state: ParserState) => void;
|
|
9
|
+
getState: (state: ParserState) => {
|
|
10
|
+
runningTurns: number;
|
|
11
|
+
runningSessionId: string | undefined;
|
|
12
|
+
runningModel: string | undefined;
|
|
13
|
+
lastAssistantText: string | undefined;
|
|
14
|
+
sawErrorEvent: boolean;
|
|
15
|
+
};
|
|
16
|
+
};
|
|
17
|
+
//# sourceMappingURL=qwen.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"qwen.d.ts","sourceRoot":"","sources":["../../src/agents/qwen.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,eAAe,EAA8B,WAAW,EAAE,MAAM,YAAY,CAAC;AAiK3F,eAAO,MAAM,SAAS,EAAE,eAqJvB,CAAC;AAEF;;GAEG;AACH,eAAO,MAAM,SAAS;sBACJ,WAAW;2BACJ,WAAW;sBAChB,WAAW;;;;;;;CAU9B,CAAC"}
|
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
function qwenState(state) {
|
|
2
|
+
if (typeof state.runningTurns !== 'number') {
|
|
3
|
+
state.runningTurns = 0;
|
|
4
|
+
state.runningSessionId = undefined;
|
|
5
|
+
state.runningModel = undefined;
|
|
6
|
+
state.lastAssistantText = undefined;
|
|
7
|
+
state.sawErrorEvent = false;
|
|
8
|
+
state.toolNameByUseId = new Map();
|
|
9
|
+
}
|
|
10
|
+
return state;
|
|
11
|
+
}
|
|
12
|
+
function resetQwenCounters(s) {
|
|
13
|
+
s.runningTurns = 0;
|
|
14
|
+
s.runningSessionId = undefined;
|
|
15
|
+
s.runningModel = undefined;
|
|
16
|
+
s.lastAssistantText = undefined;
|
|
17
|
+
s.sawErrorEvent = false;
|
|
18
|
+
s.toolNameByUseId.clear();
|
|
19
|
+
}
|
|
20
|
+
/** Strip the `mcp__playwright__` / `mcp__hover-playwright__` prefix so tool
|
|
21
|
+
* names match the normalised names claude / codex / cursor emit. */
|
|
22
|
+
function stripMcpPrefix(raw) {
|
|
23
|
+
return raw.replace(/^mcp__playwright__/, '').replace(/^mcp__hover-playwright__/, '');
|
|
24
|
+
}
|
|
25
|
+
const QWEN_PROMPT_PREFACE = [
|
|
26
|
+
'You are operating in Hover, a browser-testing tool.',
|
|
27
|
+
'Use ONLY the MCP playwright tools (prefixed `mcp__playwright__` / `mcp__hover-playwright__`) to drive the browser.',
|
|
28
|
+
'Do NOT use shell, file-edit, web-search, or any other built-in tool.',
|
|
29
|
+
'Do NOT navigate to a URL the user is already on; check the page state via `browser_snapshot` first.',
|
|
30
|
+
'When the task is complete, emit a short summary and stop.',
|
|
31
|
+
].join(' ');
|
|
32
|
+
export const qwenAgent = {
|
|
33
|
+
id: 'qwen',
|
|
34
|
+
binName: 'qwen',
|
|
35
|
+
protocol: 'argv',
|
|
36
|
+
streamFormat: 'json-lines',
|
|
37
|
+
sandboxStrength: 'soft',
|
|
38
|
+
display: {
|
|
39
|
+
label: 'Qwen Code',
|
|
40
|
+
tagline: 'Qwen Code — soft sandbox (no built-in tool deny-list)',
|
|
41
|
+
homepage: 'https://github.com/QwenLM/qwen-code',
|
|
42
|
+
installHint: 'npm install -g @qwen-code/qwen-code@latest',
|
|
43
|
+
},
|
|
44
|
+
buildArgs(opts) {
|
|
45
|
+
const args = ['-p', opts.prompt];
|
|
46
|
+
// NDJSON streaming output.
|
|
47
|
+
args.push('--output-format', 'stream-json');
|
|
48
|
+
// Auto-approve all tool calls so the run doesn't hang. The newer
|
|
49
|
+
// canonical form is --approval-mode=yolo; --yolo is deprecated but
|
|
50
|
+
// still works as of 2026-05. We use the modern form.
|
|
51
|
+
args.push('--approval-mode', 'yolo');
|
|
52
|
+
if (opts.model) {
|
|
53
|
+
args.push('--model', opts.model);
|
|
54
|
+
}
|
|
55
|
+
if (opts.sessionId) {
|
|
56
|
+
// --resume <sessionId> is the documented headless form. --continue
|
|
57
|
+
// (no arg) picks the most recent — NOT what we want when a specific
|
|
58
|
+
// session was passed.
|
|
59
|
+
args.push('--resume', opts.sessionId);
|
|
60
|
+
}
|
|
61
|
+
// Qwen has a real --append-system-prompt flag — use it instead of
|
|
62
|
+
// prepending to the user prompt. Concatenate the standing Hover-mode
|
|
63
|
+
// preface with whatever the caller appended.
|
|
64
|
+
const sysPrompt = opts.appendSystemPrompt && opts.appendSystemPrompt.trim().length > 0
|
|
65
|
+
? `${QWEN_PROMPT_PREFACE} ${opts.appendSystemPrompt}`
|
|
66
|
+
: QWEN_PROMPT_PREFACE;
|
|
67
|
+
args.push('--append-system-prompt', sysPrompt);
|
|
68
|
+
// MCP servers configured in ~/.qwen/settings.json — no per-invocation
|
|
69
|
+
// --mcp-config equivalent. Same constraint as cursor / codex.
|
|
70
|
+
// No equivalent for --max-budget-usd or --allowedTools / --disallowedTools.
|
|
71
|
+
return args;
|
|
72
|
+
},
|
|
73
|
+
parseEvent(line, state = {}) {
|
|
74
|
+
if (!line.trim())
|
|
75
|
+
return [];
|
|
76
|
+
let ev;
|
|
77
|
+
try {
|
|
78
|
+
ev = JSON.parse(line);
|
|
79
|
+
}
|
|
80
|
+
catch {
|
|
81
|
+
return [{ kind: 'raw', line }];
|
|
82
|
+
}
|
|
83
|
+
const s = qwenState(state);
|
|
84
|
+
const out = [];
|
|
85
|
+
if (ev.type === 'system' && ev.subtype === 'session_start') {
|
|
86
|
+
resetQwenCounters(s);
|
|
87
|
+
s.runningModel = ev.model;
|
|
88
|
+
if (ev.session_id) {
|
|
89
|
+
s.runningSessionId = ev.session_id;
|
|
90
|
+
out.push({ kind: 'session_start', sessionId: ev.session_id, model: ev.model });
|
|
91
|
+
}
|
|
92
|
+
return out;
|
|
93
|
+
}
|
|
94
|
+
if (ev.type === 'assistant' && ev.message) {
|
|
95
|
+
s.runningTurns += 1;
|
|
96
|
+
// Emit a usage event so the widget can advance its turn counter.
|
|
97
|
+
// costUsd intentionally omitted — qwen doesn't publish $ in stream.
|
|
98
|
+
out.push({ kind: 'usage', turns: s.runningTurns });
|
|
99
|
+
for (const block of ev.message.content ?? []) {
|
|
100
|
+
if (block.type === 'text') {
|
|
101
|
+
const text = block.text?.trim();
|
|
102
|
+
if (text) {
|
|
103
|
+
s.lastAssistantText = text;
|
|
104
|
+
out.push({ kind: 'text', text });
|
|
105
|
+
}
|
|
106
|
+
}
|
|
107
|
+
else if (block.type === 'tool_use') {
|
|
108
|
+
const rawName = block.name ?? '';
|
|
109
|
+
const tool = stripMcpPrefix(rawName);
|
|
110
|
+
if (block.id)
|
|
111
|
+
s.toolNameByUseId.set(block.id, tool);
|
|
112
|
+
out.push({ kind: 'tool_use', tool, input: block.input });
|
|
113
|
+
}
|
|
114
|
+
}
|
|
115
|
+
return out;
|
|
116
|
+
}
|
|
117
|
+
// tool_result blocks are wrapped in `user` messages (Anthropic Messages
|
|
118
|
+
// convention). We surface them as tool_result events.
|
|
119
|
+
if (ev.type === 'user' && ev.message) {
|
|
120
|
+
for (const block of ev.message.content ?? []) {
|
|
121
|
+
if (block.type === 'tool_result') {
|
|
122
|
+
const isError = block.is_error === true;
|
|
123
|
+
out.push({ kind: 'tool_result', isError });
|
|
124
|
+
}
|
|
125
|
+
}
|
|
126
|
+
return out;
|
|
127
|
+
}
|
|
128
|
+
if (ev.type === 'result') {
|
|
129
|
+
const isError = ev.is_error === true ||
|
|
130
|
+
(typeof ev.subtype === 'string' && /error|fail/i.test(ev.subtype));
|
|
131
|
+
if (isError)
|
|
132
|
+
s.sawErrorEvent = true;
|
|
133
|
+
out.push({
|
|
134
|
+
kind: 'session_end',
|
|
135
|
+
turns: s.runningTurns,
|
|
136
|
+
// costUsd intentionally undefined — qwen doesn't publish $.
|
|
137
|
+
isError,
|
|
138
|
+
summary: ev.result ?? s.lastAssistantText,
|
|
139
|
+
});
|
|
140
|
+
return out;
|
|
141
|
+
}
|
|
142
|
+
// Qwen emits various error envelopes mid-stream; surface them as text.
|
|
143
|
+
if (ev.type && /error/i.test(ev.type)) {
|
|
144
|
+
s.sawErrorEvent = true;
|
|
145
|
+
const msg = ev.error?.message ?? ev.text ?? ev.result ?? `[qwen] ${ev.type}`;
|
|
146
|
+
out.push({ kind: 'text', text: msg });
|
|
147
|
+
return out;
|
|
148
|
+
}
|
|
149
|
+
return [];
|
|
150
|
+
},
|
|
151
|
+
/**
|
|
152
|
+
* Qwen's `result` event already produces a session_end via parseEvent;
|
|
153
|
+
* this fallback is for the case where the child exits without emitting a
|
|
154
|
+
* `result` (e.g. crash, signal). Mirrors codex.ts / cursor.ts shape.
|
|
155
|
+
*/
|
|
156
|
+
onStreamEnd(exitCode, state = {}) {
|
|
157
|
+
const s = qwenState(state);
|
|
158
|
+
return {
|
|
159
|
+
kind: 'session_end',
|
|
160
|
+
turns: s.runningTurns,
|
|
161
|
+
// costUsd intentionally undefined — see parseEvent note.
|
|
162
|
+
isError: s.sawErrorEvent || (exitCode != null && exitCode !== 0),
|
|
163
|
+
summary: s.lastAssistantText,
|
|
164
|
+
};
|
|
165
|
+
},
|
|
166
|
+
};
|
|
167
|
+
/**
|
|
168
|
+
* Test-only escape hatches, same pattern as cursor.ts / codex.ts.
|
|
169
|
+
*/
|
|
170
|
+
export const __testing = {
|
|
171
|
+
freshState: () => ({}),
|
|
172
|
+
resetCounters: (state) => resetQwenCounters(qwenState(state)),
|
|
173
|
+
getState: (state) => {
|
|
174
|
+
const s = qwenState(state);
|
|
175
|
+
return {
|
|
176
|
+
runningTurns: s.runningTurns,
|
|
177
|
+
runningSessionId: s.runningSessionId,
|
|
178
|
+
runningModel: s.runningModel,
|
|
179
|
+
lastAssistantText: s.lastAssistantText,
|
|
180
|
+
sawErrorEvent: s.sawErrorEvent,
|
|
181
|
+
};
|
|
182
|
+
},
|
|
183
|
+
};
|
|
@@ -2,13 +2,15 @@ import type { AgentDescriptor } from './types.js';
|
|
|
2
2
|
/**
|
|
3
3
|
* Registry of agents Hover can drive.
|
|
4
4
|
*
|
|
5
|
-
* To add support for another agent (e.g.
|
|
6
|
-
*
|
|
7
|
-
*
|
|
8
|
-
*
|
|
5
|
+
* To add support for another agent (e.g. cline, continue, kilo), implement
|
|
6
|
+
* its AgentDescriptor in its own file and register it here. The rest of the
|
|
7
|
+
* system — detect, argv, invoke, service, widget — works without further
|
|
8
|
+
* changes.
|
|
9
9
|
*
|
|
10
10
|
* Insertion order is the order shown in the widget's agent dropdown, so put
|
|
11
|
-
* the recommended primary first.
|
|
11
|
+
* the recommended primary first. The two hard-sandbox / first-party agents
|
|
12
|
+
* (claude, codex) lead; the soft-sandbox third-party agents follow in the
|
|
13
|
+
* order they were added.
|
|
12
14
|
*/
|
|
13
15
|
export declare const AGENTS: Record<string, AgentDescriptor>;
|
|
14
16
|
export declare function getAgent(id: string): AgentDescriptor | undefined;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"registry.d.ts","sourceRoot":"","sources":["../../src/agents/registry.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,YAAY,CAAC;
|
|
1
|
+
{"version":3,"file":"registry.d.ts","sourceRoot":"","sources":["../../src/agents/registry.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,YAAY,CAAC;AAQlD;;;;;;;;;;;;GAYG;AACH,eAAO,MAAM,MAAM,EAAE,MAAM,CAAC,MAAM,EAAE,eAAe,CAOlD,CAAC;AAEF,wBAAgB,QAAQ,CAAC,EAAE,EAAE,MAAM,GAAG,eAAe,GAAG,SAAS,CAEhE;AAED,+DAA+D;AAC/D,wBAAgB,UAAU,IAAI,eAAe,EAAE,CAE9C"}
|
package/dist/agents/registry.js
CHANGED
|
@@ -1,21 +1,29 @@
|
|
|
1
1
|
import { claudeAgent } from './claude.js';
|
|
2
2
|
import { codexAgent } from './codex.js';
|
|
3
3
|
import { cursorAgent } from './cursor.js';
|
|
4
|
+
import { aiderAgent } from './aider.js';
|
|
5
|
+
import { geminiAgent } from './gemini.js';
|
|
6
|
+
import { qwenAgent } from './qwen.js';
|
|
4
7
|
/**
|
|
5
8
|
* Registry of agents Hover can drive.
|
|
6
9
|
*
|
|
7
|
-
* To add support for another agent (e.g.
|
|
8
|
-
*
|
|
9
|
-
*
|
|
10
|
-
*
|
|
10
|
+
* To add support for another agent (e.g. cline, continue, kilo), implement
|
|
11
|
+
* its AgentDescriptor in its own file and register it here. The rest of the
|
|
12
|
+
* system — detect, argv, invoke, service, widget — works without further
|
|
13
|
+
* changes.
|
|
11
14
|
*
|
|
12
15
|
* Insertion order is the order shown in the widget's agent dropdown, so put
|
|
13
|
-
* the recommended primary first.
|
|
16
|
+
* the recommended primary first. The two hard-sandbox / first-party agents
|
|
17
|
+
* (claude, codex) lead; the soft-sandbox third-party agents follow in the
|
|
18
|
+
* order they were added.
|
|
14
19
|
*/
|
|
15
20
|
export const AGENTS = {
|
|
16
21
|
[claudeAgent.id]: claudeAgent,
|
|
17
22
|
[codexAgent.id]: codexAgent,
|
|
18
23
|
[cursorAgent.id]: cursorAgent,
|
|
24
|
+
[aiderAgent.id]: aiderAgent,
|
|
25
|
+
[geminiAgent.id]: geminiAgent,
|
|
26
|
+
[qwenAgent.id]: qwenAgent,
|
|
19
27
|
};
|
|
20
28
|
export function getAgent(id) {
|
|
21
29
|
return AGENTS[id];
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"bench-multi-tab.d.ts","sourceRoot":"","sources":["../../src/scripts/bench-multi-tab.ts"],"names":[],"mappings":""}
|
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Benchmark agent success rate on the multi-tab "Pay with PayHover" flow.
|
|
3
|
+
*
|
|
4
|
+
* Why this exists — v0.10's central theme is "agent can drive cross-tab
|
|
5
|
+
* flows in the wild." The system-prompt addendum (cdpHint.ts rule 5/6/7)
|
|
6
|
+
* is the lever we're tuning. This script gives us a number to tune
|
|
7
|
+
* against: across N iterations, how often does the agent get from
|
|
8
|
+
* "browse the store" to "Order placed"?
|
|
9
|
+
*
|
|
10
|
+
* Per iteration the agent has to:
|
|
11
|
+
* 1. Browse the e-commerce store, add 1+ items to cart, go to checkout.
|
|
12
|
+
* 2. Fill the shipping form.
|
|
13
|
+
* 3. Pick "Pay with PayHover" (opens a new tab at localhost:5177).
|
|
14
|
+
* 4. Switch to the new tab, fill card number + CVV, click Continue.
|
|
15
|
+
* 5. Wait ~600ms for the simulated 3DS pre-check.
|
|
16
|
+
* 6. Fill the 6-digit OTP (always 123456 in the sandbox).
|
|
17
|
+
* 7. Click Confirm. The provider tab closes itself.
|
|
18
|
+
* 8. Switch back to the original tab, observe the "Order placed" view.
|
|
19
|
+
*
|
|
20
|
+
* Steps 3, 4, 7, 8 are the failure-prone ones.
|
|
21
|
+
*
|
|
22
|
+
* Assumes:
|
|
23
|
+
* - Debug Chrome on :9222 (run `pnpm smoke:chrome`).
|
|
24
|
+
* - e-commerce on :5174 AND payment-provider on :5177 both running
|
|
25
|
+
* (run `pnpm dev:example:e-commerce` and `pnpm dev:example:payment-provider`
|
|
26
|
+
* in two terminals before invoking this).
|
|
27
|
+
*
|
|
28
|
+
* Usage:
|
|
29
|
+
* pnpm --filter @hover-dev/core exec tsx src/scripts/bench-multi-tab.ts [n]
|
|
30
|
+
* pnpm bench-multi-tab [n]
|
|
31
|
+
*
|
|
32
|
+
* `n` defaults to 5. Per-iteration timeout is 5 minutes — multi-tab flows
|
|
33
|
+
* are slow because the agent does a lot of browser_snapshot calls.
|
|
34
|
+
*
|
|
35
|
+
* Output: per-run pass/fail + final summary (success rate, median wall
|
|
36
|
+
* time, median turns, median cost in $). A/B prompt changes by running
|
|
37
|
+
* once on each branch and comparing.
|
|
38
|
+
*/
|
|
39
|
+
import { WebSocket } from 'ws';
|
|
40
|
+
import { startService } from '../service.js';
|
|
41
|
+
// Prompt handed to the agent for every iteration. Override it with the
// HOVER_BENCH_PROMPT environment variable to A/B a different wording.
const PROMPT = process.env.HOVER_BENCH_PROMPT ??
    [
        'Open http://localhost:5174 (Hover Store).',
        'Add any item to the cart, go to checkout, fill the shipping form with',
        'realistic values, then choose "Pay with PayHover". A new tab opens at',
        'the payment provider — switch to it, fill in card 4242 4242 4242 4242',
        'with CVV 123, click Continue, wait for the OTP step, enter 123456,',
        'click Confirm. The popup will close. Switch back to the original tab',
        'and verify the order shows as placed.',
    ].join(' ');
/**
 * Parse the optional iteration-count CLI argument.
 *
 * @param raw      Raw argv value, or `undefined` when the argument was omitted.
 * @param fallback Count used when `raw` is `undefined` (defaults to 5).
 * @returns A positive integer iteration count.
 * @throws Error when `raw` is present but is not a positive integer. Failing
 *         fast avoids a run with NaN or fractional iterations, which would
 *         otherwise execute a surprising number of runs and print a
 *         meaningless summary.
 */
function parseIterations(raw, fallback = 5) {
    if (raw === undefined)
        return fallback;
    const parsed = Number(raw);
    if (!Number.isInteger(parsed) || parsed < 1) {
        throw new Error(`[bench-multi-tab] iteration count must be a positive integer, got "${raw}"`);
    }
    return parsed;
}
// Number of benchmark iterations, taken from the first CLI argument.
const ITERATIONS = parseIterations(process.argv[2]);
// Per-iteration wall-clock budget: 5 minutes.
const PER_RUN_TIMEOUT_MS = 5 * 60 * 1000;
|
|
53
|
+
/**
 * Run one benchmark iteration.
 *
 * Starts a fresh service on an ephemeral port, connects a WebSocket client,
 * sends PROMPT as a `command` message and watches the streamed `event`
 * messages until the run ends.
 *
 * @param idx Zero-based iteration index (used only for progress output).
 * @returns A promise resolving to
 *          `{ ok, wallMs, turns, costUsd, reason }`. The promise resolves
 *          (never rejects) once the service has been closed. It can still
 *          reject up front if `startService` itself fails.
 */
async function singleRun(idx) {
    process.stderr.write(`\n[bench-multi-tab] run ${idx + 1}/${ITERATIONS}\n`);
    const service = await startService({
        port: 0,
        agentId: 'claude',
        model: 'sonnet',
        cdpUrl: 'http://localhost:9222',
        devRoot: process.cwd(),
    });
    return new Promise((resolve) => {
        const ws = new WebSocket(`ws://127.0.0.1:${service.port}`);
        const t0 = performance.now();
        let turns = 0;
        let costUsd = null;
        let resolved = false;
        let timeout;
        // Single exit path: the first caller wins, later calls are no-ops.
        // Clearing the timer here keeps every exit route from leaving a
        // 5-minute timer pending.
        const finish = (ok, reason) => {
            if (resolved)
                return;
            resolved = true;
            clearTimeout(timeout);
            const result = {
                ok,
                wallMs: performance.now() - t0,
                turns,
                costUsd,
                reason,
            };
            try {
                ws.close(1000);
            }
            catch { /* already closed */ }
            // A rejected close() must not surface as an unhandled rejection;
            // report it and still resolve with the run's result.
            service.close()
                .catch((err) => {
                const detail = err instanceof Error ? err.message : String(err);
                process.stderr.write(`[bench-multi-tab] service.close failed: ${detail}\n`);
            })
                .finally(() => resolve(result));
        };
        timeout = setTimeout(() => {
            finish(false, `timed out after ${PER_RUN_TIMEOUT_MS / 1000}s`);
        }, PER_RUN_TIMEOUT_MS);
        ws.on('open', () => {
            ws.send(JSON.stringify({ type: 'command', payload: { text: PROMPT } }));
        });
        ws.on('message', (raw) => {
            let msg;
            try {
                msg = JSON.parse(raw.toString());
            }
            catch {
                return;
            }
            // JSON.parse can yield null or a primitive; ignore anything that
            // is not an `event` envelope with a payload.
            if (!msg || msg.type !== 'event')
                return;
            const ev = msg.payload;
            if (!ev)
                return;
            if (ev.kind === 'tool_use') {
                turns += 1;
                if (process.env.HOVER_BENCH_VERBOSE === '1') {
                    // Prefer `tool` and keep `name` as a fallback so either
                    // event shape prints a real tool name.
                    process.stderr.write(` [turn ${turns}] ${ev.tool ?? ev.name ?? '<tool>'}\n`);
                }
            }
            if (ev.kind === 'session_end') {
                if (typeof ev.costUsd === 'number')
                    costUsd = ev.costUsd;
                finish(!ev.isError, ev.isError ? 'agent reported error' : undefined);
            }
        });
        ws.on('error', (err) => {
            finish(false, `WS error: ${err.message}`);
        });
        // If the socket drops before session_end, fail now instead of waiting
        // out the full per-run timeout. After finish() has already run (our
        // own ws.close above) this is a no-op.
        ws.on('close', (code) => {
            finish(false, `WS closed before session_end (code ${code})`);
        });
    });
}
|
|
134
|
+
/**
 * Median of a list of numbers.
 *
 * @param xs Numbers in any order; the input array is not mutated.
 * @returns The middle value, the mean of the two middle values for an
 *          even-length list, or 0 for an empty list.
 */
function median(xs) {
    const count = xs.length;
    if (count === 0) {
        return 0;
    }
    const ordered = Array.from(xs);
    ordered.sort((lo, hi) => lo - hi);
    const upper = Math.floor(count / 2);
    if (count % 2 !== 0) {
        return ordered[upper];
    }
    return (ordered[upper - 1] + ordered[upper]) / 2;
}
|
|
143
|
+
/** Format a duration in milliseconds as seconds with one decimal, e.g. `1.5s`. */
function fmtMs(ms) {
    const seconds = ms / 1000;
    return seconds.toFixed(1) + 's';
}
|
|
146
|
+
/** Format a dollar amount with four decimals; null/undefined renders as an en dash. */
function fmtUsd(usd) {
    if (usd === null || usd === undefined) {
        return '–';
    }
    return '$' + usd.toFixed(4);
}
|
|
149
|
+
/**
 * Run ITERATIONS benchmark iterations sequentially, print a per-run line
 * and a final summary to stderr, then exit the process.
 *
 * Exit code is 1 when no run passed (including when no run executed at
 * all), otherwise 0.
 */
async function main() {
    process.stderr.write(`[bench-multi-tab] ${ITERATIONS} iterations, per-run timeout ${PER_RUN_TIMEOUT_MS / 1000}s\n`);
    process.stderr.write(`[bench-multi-tab] prompt: ${PROMPT.slice(0, 80)}…\n`);
    const results = [];
    for (let i = 0; i < ITERATIONS; i++) {
        try {
            const r = await singleRun(i);
            results.push(r);
            const status = r.ok ? '✓ PASS' : '✗ FAIL';
            process.stderr.write(`[bench-multi-tab] run ${i + 1}: ${status} · ${fmtMs(r.wallMs)} · ${r.turns} turns · ${fmtUsd(r.costUsd)}${r.reason ? ` · ${r.reason}` : ''}\n`);
        }
        catch (err) {
            // singleRun only rejects before the run starts (service setup),
            // so record it as a failed run and keep going.
            const msg = err instanceof Error ? err.message : String(err);
            results.push({ ok: false, wallMs: 0, turns: 0, costUsd: null, reason: msg });
            process.stderr.write(`[bench-multi-tab] run ${i + 1}: ✗ FAIL · setup error · ${msg}\n`);
        }
    }
    const passes = results.filter((r) => r.ok);
    // Guard the 0/0 case so an empty run reports 0% rather than NaN%.
    const successRate = results.length === 0 ? 0 : passes.length / results.length;
    process.stderr.write('\n[bench-multi-tab] summary\n');
    process.stderr.write(` success rate: ${(successRate * 100).toFixed(0)}% (${passes.length}/${results.length})\n`);
    if (passes.length > 0) {
        // Medians are computed over passing runs only.
        process.stderr.write(` median wall: ${fmtMs(median(passes.map((r) => r.wallMs)))}\n`);
        process.stderr.write(` median turns: ${median(passes.map((r) => r.turns)).toFixed(0)}\n`);
        const costs = passes.map((r) => r.costUsd).filter((c) => c != null);
        if (costs.length > 0) {
            process.stderr.write(` median cost: ${fmtUsd(median(costs))}\n`);
        }
    }
    if (passes.length < results.length) {
        process.stderr.write(`\n failures:\n`);
        results.forEach((r, i) => {
            if (!r.ok)
                process.stderr.write(` run ${i + 1}: ${r.reason ?? 'unknown'}\n`);
        });
    }
    // Exit non-zero if EVERY run failed — useful for CI plumbing later. A
    // partial-pass run still exits 0 so we collect signal across branches.
    process.exit(passes.length === 0 ? 1 : 0);
}
|
|
189
|
+
// Entry point: any error escaping main() is reported (stack when available)
// and the process exits with status 1.
main().catch((err) => {
    const detail = err instanceof Error ? err.stack ?? err.message : String(err);
    process.stderr.write(`[bench-multi-tab] fatal: ${detail}\n`);
    process.exit(1);
});
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"cdpHint.d.ts","sourceRoot":"","sources":["../../src/service/cdpHint.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AAEH,UAAU,GAAG;IAAG,GAAG,EAAE,MAAM,CAAC;IAAC,KAAK,CAAC,EAAE,MAAM,CAAA;CAAE;AAa7C,wBAAgB,YAAY,CAAC,IAAI,EAAE,GAAG,EAAE,GAAG,MAAM,
|
|
1
|
+
{"version":3,"file":"cdpHint.d.ts","sourceRoot":"","sources":["../../src/service/cdpHint.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AAEH,UAAU,GAAG;IAAG,GAAG,EAAE,MAAM,CAAC;IAAC,KAAK,CAAC,EAAE,MAAM,CAAA;CAAE;AAa7C,wBAAgB,YAAY,CAAC,IAAI,EAAE,GAAG,EAAE,GAAG,MAAM,CAmIhD;AAED;;;;;;;;;;;;;;GAcG;AACH,wBAAgB,kBAAkB,CAAC,IAAI,EAAE,GAAG,EAAE,GAAG,MAAM,CAYtD"}
|
package/dist/service/cdpHint.js
CHANGED
|
@@ -101,6 +101,45 @@ export function buildCdpHint(tabs) {
|
|
|
101
101
|
` 4. To see the current page state, call browser_snapshot first. Only`,
|
|
102
102
|
` navigate if you actually need a different URL.`,
|
|
103
103
|
``,
|
|
104
|
+
`Multi-tab + cross-origin flows (Stripe Checkout, OAuth login, "Pay with X" popups):`,
|
|
105
|
+
``,
|
|
106
|
+
` 5. When you click something that may open a new tab (target=_blank, a`,
|
|
107
|
+
` window.open trigger, a "Pay with …" / "Sign in with …" button), the`,
|
|
108
|
+
` popup tab is where the next user-visible step happens — but your tools`,
|
|
109
|
+
` stay anchored to the prior tab until you switch. After such a click:`,
|
|
110
|
+
``,
|
|
111
|
+
` a) Call browser_tabs(action='list') to see if a new tab appeared.`,
|
|
112
|
+
` A new entry at a different origin is the popup.`,
|
|
113
|
+
` b) Call browser_tabs(action='select', idx=<popup idx>) to focus it,`,
|
|
114
|
+
` then browser_snapshot the new tab and proceed.`,
|
|
115
|
+
` c) When the popup closes (it usually does so on success/cancel —`,
|
|
116
|
+
` window.close() or after a redirect chain), browser_tabs(list)`,
|
|
117
|
+
` will no longer show it. The current page may be invalid; call`,
|
|
118
|
+
` browser_tabs(action='select', idx=0) to refocus the original tab,`,
|
|
119
|
+
` then browser_snapshot it. The original tab's DOM may have updated`,
|
|
120
|
+
` via a postMessage handler (e.g. it should now show a "Success" or`,
|
|
121
|
+
` "Payment complete" state).`,
|
|
122
|
+
` d) If the original tab's snapshot looks unchanged (still showing the`,
|
|
123
|
+
` checkout form / login button), the postMessage handler may not`,
|
|
124
|
+
` have fired yet or may not exist. Wait once with`,
|
|
125
|
+
` browser_wait_for_text("<expected success copy>", timeout=3000)`,
|
|
126
|
+
` before concluding the flow is broken.`,
|
|
127
|
+
``,
|
|
128
|
+
` 6. OAuth-style redirect chains: when a tab redirects through several`,
|
|
129
|
+
` origins (myapp → identity provider → /callback?code=… → myapp), watch`,
|
|
130
|
+
` browser_tabs after each browser_snapshot — the same tab idx can switch`,
|
|
131
|
+
` origin underneath you. The URL in browser_tabs(list) is authoritative.`,
|
|
132
|
+
``,
|
|
133
|
+
` 7. Cross-origin cookie/session updates: after the popup closes and you're`,
|
|
134
|
+
` back on the original tab, the server-set session cookie may be present`,
|
|
135
|
+
` in the browser but the React state hasn't yet picked it up. The most`,
|
|
136
|
+
` likely cause is a missing or slow postMessage handler — NOT a real`,
|
|
137
|
+
` bug yet. Try browser_wait_for_text once for the expected logged-in`,
|
|
138
|
+
` copy with a 3s timeout. If nothing shows, report it as a Finding`,
|
|
139
|
+
` ("Original tab did not update after popup closed — likely missing`,
|
|
140
|
+
` postMessage listener or auth refresh"); do NOT browser_navigate to`,
|
|
141
|
+
` same-origin to force a refresh (rule #2 still applies).`,
|
|
142
|
+
``,
|
|
104
143
|
`Narration format — affects how the widget renders your run for the user:`,
|
|
105
144
|
``,
|
|
106
145
|
` Before each LOGICAL STEP (a coherent unit of work like "Open the login`,
|
package/dist/service.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"service.d.ts","sourceRoot":"","sources":["../src/service.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"service.d.ts","sourceRoot":"","sources":["../src/service.ts"],"names":[],"mappings":"AA6EA,OAAO,EAEL,KAAK,mBAAmB,EAEzB,MAAM,iBAAiB,CAAC;AAEzB,MAAM,WAAW,cAAc;IAC7B,IAAI,EAAE,MAAM,CAAC;IACb,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,gFAAgF;IAChF,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB;;;6EAGyE;IACzE,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB;;;;uBAImB;IACnB,OAAO,CAAC,EAAE,mBAAmB,EAAE,CAAC;CACjC;AAED,MAAM,WAAW,aAAa;IAC5B;4EACwE;IACxE,IAAI,EAAE,MAAM,CAAC;IACb,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC,CAAC;CACxB;AAiDD,wBAAsB,YAAY,CAAC,IAAI,EAAE,cAAc,GAAG,OAAO,CAAC,aAAa,CAAC,CAswB/E"}
|
package/dist/service.js
CHANGED
|
@@ -13,6 +13,7 @@
|
|
|
13
13
|
* { type: 'skill-saved', payload: { name, path } }
|
|
14
14
|
* { type: 'skill-exists', payload: { slug, existingPath } }
|
|
15
15
|
* { type: 'skills-list', payload: { skills: SkillSummary[] } }
|
|
16
|
+
* { type: 'specs-list', payload: { specs: SpecSummary[] } }
|
|
16
17
|
* { type: 'spec-saved', payload: { name, path } }
|
|
17
18
|
* { type: 'spec-exists', payload: { slug, existingPath } }
|
|
18
19
|
* { type: 'case-csv-saved', payload: { name, path } }
|
|
@@ -20,7 +21,12 @@
|
|
|
20
21
|
* { type: 'error', payload: { message } }
|
|
21
22
|
*
|
|
22
23
|
* client → server
|
|
23
|
-
* { type: 'command', payload: { text, sessionId
|
|
24
|
+
* { type: 'command', payload: { text, sessionId?, reRecord?: { slug } } }
|
|
25
|
+
* // when reRecord.slug is set, the
|
|
26
|
+
* // service collects tool_use events
|
|
27
|
+
* // into a step list and on a clean
|
|
28
|
+
* // session_end overwrites
|
|
29
|
+
* // __vibe_tests__/<slug>.spec.ts
|
|
24
30
|
* { type: 'cancel' }
|
|
25
31
|
* { type: 'check-cdp', payload: { pageUrl } } // "is this widget in the debug Chrome?"
|
|
26
32
|
* { type: 'launch-chrome', payload: { pageUrl } } // start debug Chrome, navigate to pageUrl
|
|
@@ -29,6 +35,7 @@
|
|
|
29
35
|
* { type: 'save-spec', payload: { name, description, steps, assertions?, overwrite? } }
|
|
30
36
|
* { type: 'save-case-csv', payload: { name, description, steps, assertions?, jiraProjectKey?, labels?, overwrite? } }
|
|
31
37
|
* { type: 'list-skills' }
|
|
38
|
+
* { type: 'list-specs' } // ask for every spec under __vibe_tests__/, with parsed JSDoc headers
|
|
32
39
|
* { type: 'list-agents' } // ask for the full agent registry + install status
|
|
33
40
|
* { type: 'switch-agent', payload: { agentId } } // set the service's current agent; broadcasts to all connections
|
|
34
41
|
*
|
|
@@ -48,6 +55,7 @@ import { getAgent } from './agents/registry.js';
|
|
|
48
55
|
import { getPreflight, invalidatePreflight } from './playwright/preflightCache.js';
|
|
49
56
|
import { resolveMcpConfig } from './playwright/resolveMcpConfig.js';
|
|
50
57
|
import { listSkills } from './skills/writeSkill.js';
|
|
58
|
+
import { listSpecs } from './specs/listSpecs.js';
|
|
51
59
|
import { send, sendIfOpen } from './service/types.js';
|
|
52
60
|
import { buildCdpHint, buildCdpHintResume } from './service/cdpHint.js';
|
|
53
61
|
import { handleCheckCdp, handleLaunchChrome, handleFocusDebug, } from './service/cdpHandlers.js';
|
|
@@ -525,6 +533,15 @@ export async function startService(opts) {
|
|
|
525
533
|
send(ws, { type: 'skills-list', payload: { skills } });
|
|
526
534
|
return;
|
|
527
535
|
}
|
|
536
|
+
if (msg.type === 'list-specs') {
|
|
537
|
+
// Widget asks for every spec under <devRoot>/__vibe_tests__/ so it
|
|
538
|
+
// can render the Specs tab in the Saved-sessions overlay. Each
|
|
539
|
+
// summary carries `originalPrompt` (parsed from the JSDoc header)
|
|
540
|
+
// so the Re-record button can resubmit it as a normal command.
|
|
541
|
+
const specs = await listSpecs(devRoot);
|
|
542
|
+
send(ws, { type: 'specs-list', payload: { specs } });
|
|
543
|
+
return;
|
|
544
|
+
}
|
|
528
545
|
if (msg.type === 'save-spec') {
|
|
529
546
|
await handleSaveArtifact(ws, msg, devRoot, SPEC_CONFIG);
|
|
530
547
|
return;
|
|
@@ -551,6 +568,15 @@ export async function startService(opts) {
|
|
|
551
568
|
const resumeSessionId = typeof msg.payload?.sessionId === 'string' && msg.payload.sessionId.length > 0
|
|
552
569
|
? msg.payload.sessionId
|
|
553
570
|
: undefined;
|
|
571
|
+
// Re-record mode: when the client (widget Specs tab or hover CLI)
|
|
572
|
+
// passes `reRecord: { slug }`, we collect tool_use events server-side
|
|
573
|
+
// into a SkillStep[] and, on session_end with no error, overwrite the
|
|
574
|
+
// existing __vibe_tests__/<slug>.spec.ts. This is the same flow the
|
|
575
|
+
// widget uses for "Save as Spec", but the spec already exists and is
|
|
576
|
+
// being regenerated for the current UI.
|
|
577
|
+
const reRecordSlug = msg.payload && typeof msg.payload === 'object' && 'reRecord' in msg.payload
|
|
578
|
+
? msg.payload.reRecord?.slug
|
|
579
|
+
: undefined;
|
|
554
580
|
if (typeof text !== 'string' || !text.trim())
|
|
555
581
|
return;
|
|
556
582
|
if (busy) {
|
|
@@ -563,6 +589,16 @@ export async function startService(opts) {
|
|
|
563
589
|
busy = true;
|
|
564
590
|
cancelled = false;
|
|
565
591
|
inflight = new AbortController();
|
|
592
|
+
// Re-record step collector — populated as tool_use events stream by,
|
|
593
|
+
// consumed at session_end to overwrite the original spec. Empty unless
|
|
594
|
+
// reRecordSlug is set on this command. We seed with a synthetic
|
|
595
|
+
// `user` step so writeSpec's JSDoc Original-prompt: line carries the
|
|
596
|
+
// text the agent was actually given (which is the prompt we read out
|
|
597
|
+
// of the existing spec — the same one we're regenerating).
|
|
598
|
+
const reRecordSteps = [];
|
|
599
|
+
if (reRecordSlug) {
|
|
600
|
+
reRecordSteps.push({ kind: 'user', text });
|
|
601
|
+
}
|
|
566
602
|
try {
|
|
567
603
|
// Build the MCP config first — it's pure local file IO and lets
|
|
568
604
|
// us assert plugin-contributed servers landed in the config even
|
|
@@ -684,6 +720,61 @@ export async function startService(opts) {
|
|
|
684
720
|
if (cancelled || ws.readyState !== WebSocket.OPEN)
|
|
685
721
|
return;
|
|
686
722
|
send(ws, { type: 'event', payload: ev });
|
|
723
|
+
// Re-record collection. Mirror what widget client.js does on the
|
|
724
|
+
// way past tool_use events: accumulate into a SkillStep[] so we
|
|
725
|
+
// can write a fresh spec when the session ends. We do this only
|
|
726
|
+
// when this command was launched in re-record mode; ordinary
|
|
727
|
+
// commands don't need server-side step retention (widget owns
|
|
728
|
+
// that for normal saves).
|
|
729
|
+
if (reRecordSlug && ev.kind === 'tool_use') {
|
|
730
|
+
reRecordSteps.push({
|
|
731
|
+
kind: 'step',
|
|
732
|
+
tool: ev.tool,
|
|
733
|
+
input: ev.input,
|
|
734
|
+
});
|
|
735
|
+
}
|
|
736
|
+
if (reRecordSlug && ev.kind === 'session_end') {
|
|
737
|
+
// Cancelled or errored runs: don't overwrite — the existing
|
|
738
|
+
// spec is still valid. Tell the client what happened.
|
|
739
|
+
if (ev.isError) {
|
|
740
|
+
sendIfOpen(ws, {
|
|
741
|
+
type: 'error',
|
|
742
|
+
payload: {
|
|
743
|
+
message: `Re-record failed: ${ev.summary ?? 'agent reported an error'}. ` +
|
|
744
|
+
`Original spec left unchanged.`,
|
|
745
|
+
},
|
|
746
|
+
});
|
|
747
|
+
}
|
|
748
|
+
else {
|
|
749
|
+
// Snapshot the agent's final summary into a synthetic `done`
|
|
750
|
+
// step so writeSpec's `Outcome:` header reflects the new run.
|
|
751
|
+
if (ev.summary) {
|
|
752
|
+
reRecordSteps.push({ kind: 'done', summary: ev.summary });
|
|
753
|
+
}
|
|
754
|
+
// Overwrite. writeSpec uses the slug to name the file; we
|
|
755
|
+
// pass the original slug verbatim so the path is stable.
|
|
756
|
+
try {
|
|
757
|
+
const { writeSpec } = await import('./specs/writeSpec.js');
|
|
758
|
+
const result = await writeSpec({
|
|
759
|
+
devRoot,
|
|
760
|
+
name: reRecordSlug,
|
|
761
|
+
steps: reRecordSteps,
|
|
762
|
+
overwrite: true,
|
|
763
|
+
});
|
|
764
|
+
sendIfOpen(ws, {
|
|
765
|
+
type: 'spec-saved',
|
|
766
|
+
payload: { name: reRecordSlug, path: result.path },
|
|
767
|
+
});
|
|
768
|
+
}
|
|
769
|
+
catch (e) {
|
|
770
|
+
const m = e instanceof Error ? e.message : String(e);
|
|
771
|
+
sendIfOpen(ws, {
|
|
772
|
+
type: 'error',
|
|
773
|
+
payload: { message: `Re-record could not write spec: ${m}` },
|
|
774
|
+
});
|
|
775
|
+
}
|
|
776
|
+
}
|
|
777
|
+
}
|
|
687
778
|
}
|
|
688
779
|
}
|
|
689
780
|
catch (err) {
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
export interface SpecSummary {
|
|
2
|
+
/** Path-relative slug, e.g. `login-and-counter`. Identifies the spec. */
|
|
3
|
+
slug: string;
|
|
4
|
+
/** Absolute path to the .spec.ts file. */
|
|
5
|
+
path: string;
|
|
6
|
+
/** `Original prompt:` parsed from the JSDoc header. `null` for
|
|
7
|
+
* hand-authored specs that have no header — they list but can't be
|
|
8
|
+
* re-recorded automatically. */
|
|
9
|
+
originalPrompt: string | null;
|
|
10
|
+
/** First line of `Outcome:` from the JSDoc header, if present. */
|
|
11
|
+
outcome: string | null;
|
|
12
|
+
/** Number of `Steps:` lines parsed (informational only). */
|
|
13
|
+
stepCount: number;
|
|
14
|
+
/** File mtime in ms — used to show "saved 2 hours ago" in the UI. */
|
|
15
|
+
mtimeMs: number;
|
|
16
|
+
}
|
|
17
|
+
export interface SpecHeader {
|
|
18
|
+
/** Raw text of `Original prompt:` line, or null when absent. */
|
|
19
|
+
originalPrompt: string | null;
|
|
20
|
+
/** First line of `Outcome:`. */
|
|
21
|
+
outcome: string | null;
|
|
22
|
+
/** Step lines from the `Steps:` block, in order. */
|
|
23
|
+
steps: string[];
|
|
24
|
+
/** Lines from the `Expected:` block, in order. */
|
|
25
|
+
expected: string[];
|
|
26
|
+
}
|
|
27
|
+
/**
|
|
28
|
+
* Parse the JSDoc header that `writeSpec.ts` emits. Tolerant of:
|
|
29
|
+
* - Specs without any JSDoc (scalars are null, `steps` and `expected` are empty arrays).
|
|
30
|
+
* - Hand-edited specs where users reordered or trimmed sections.
|
|
31
|
+
* - Long prompts that wrap across lines (we take only the first line).
|
|
32
|
+
*/
|
|
33
|
+
export declare function parseSpecHeader(source: string): SpecHeader;
|
|
34
|
+
/**
|
|
35
|
+
* List every `*.spec.ts` file under `<devRoot>/__vibe_tests__/` with its
|
|
36
|
+
* parsed header. Returns newest-first by mtime so the widget overlay shows
|
|
37
|
+
* recently-saved specs at the top.
|
|
38
|
+
*/
|
|
39
|
+
export declare function listSpecs(devRoot: string): Promise<SpecSummary[]>;
|
|
40
|
+
//# sourceMappingURL=listSpecs.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"listSpecs.d.ts","sourceRoot":"","sources":["../../src/specs/listSpecs.ts"],"names":[],"mappings":"AAkBA,MAAM,WAAW,WAAW;IAC1B,yEAAyE;IACzE,IAAI,EAAE,MAAM,CAAC;IACb,0CAA0C;IAC1C,IAAI,EAAE,MAAM,CAAC;IACb;;qCAEiC;IACjC,cAAc,EAAE,MAAM,GAAG,IAAI,CAAC;IAC9B,kEAAkE;IAClE,OAAO,EAAE,MAAM,GAAG,IAAI,CAAC;IACvB,4DAA4D;IAC5D,SAAS,EAAE,MAAM,CAAC;IAClB,qEAAqE;IACrE,OAAO,EAAE,MAAM,CAAC;CACjB;AAED,MAAM,WAAW,UAAU;IACzB,gEAAgE;IAChE,cAAc,EAAE,MAAM,GAAG,IAAI,CAAC;IAC9B,gCAAgC;IAChC,OAAO,EAAE,MAAM,GAAG,IAAI,CAAC;IACvB,oDAAoD;IACpD,KAAK,EAAE,MAAM,EAAE,CAAC;IAChB,kDAAkD;IAClD,QAAQ,EAAE,MAAM,EAAE,CAAC;CACpB;AAED;;;;;GAKG;AACH,wBAAgB,eAAe,CAAC,MAAM,EAAE,MAAM,GAAG,UAAU,CAsB1D;AA4BD;;;;GAIG;AACH,wBAAsB,SAAS,CAAC,OAAO,EAAE,MAAM,GAAG,OAAO,CAAC,WAAW,EAAE,CAAC,CAkCvE"}
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* List + parse Hover-generated Playwright specs under `<devRoot>/__vibe_tests__/`.
|
|
3
|
+
*
|
|
4
|
+
* Used by:
|
|
5
|
+
* - The widget's "Specs" overlay tab (server pushes a SpecSummary[] list).
|
|
6
|
+
* - The CLI's `hover re-record <spec>` subcommand (parses one spec for its
|
|
7
|
+
* `Original prompt:` JSDoc header).
|
|
8
|
+
*
|
|
9
|
+
* Hand-authored specs (no Hover JSDoc header) are listed but reported with
|
|
10
|
+
* `originalPrompt: null` — the UI / CLI surfaces that "this spec can't be
|
|
11
|
+
* re-recorded automatically; the natural-language intent isn't recorded."
|
|
12
|
+
*
|
|
13
|
+
* Mirrors the listSkills shape so widget UI can use the same row renderer.
|
|
14
|
+
*/
|
|
15
|
+
import { readdir, readFile } from 'node:fs/promises';
|
|
16
|
+
import { stat } from 'node:fs/promises';
|
|
17
|
+
import { join } from 'node:path';
|
|
18
|
+
/** True when a JSDoc body contains at least one section label this parser reads. */
const HEADER_SECTION_RE = /^[ \t]*\*[ \t]*(?:Original prompt|Outcome|Steps|Expected):/m;
/** A JSDoc line holding only the leading `*` — ends a list section. */
const BLANK_DOC_LINE_RE = /^\s*\*\s*$/;
/** A `Label:` line (bare, or followed by a value) — ends a list section. */
const SECTION_LABEL_RE = /^\s*\*\s*\w[\w ]*:(?:\s|$)/;
/**
 * A list item line. Strips list markers (`1.`, `2)`, `-`, `*`, `•`) only when
 * they are followed by whitespace, so item text that itself starts with a
 * digit or dot (e.g. "2FA code") is kept intact.
 */
const LIST_ITEM_RE = /^\s*\*\s*(?:(?:\d+[.)]|[•\-*])\s+)*(.+?)\s*$/;
/**
 * Parse the JSDoc header that `writeSpec.ts` emits. Tolerant of:
 *   - Specs without any JSDoc (scalars are null, lists are empty).
 *   - Banner JSDoc blocks ahead of the header: the first block containing a
 *     recognised section label is used, falling back to the first block.
 *   - Hand-edited specs where users reordered or trimmed sections.
 *   - Long prompts that wrap across lines (we take only the first line).
 *
 * @param source Full text of a spec file.
 * @returns `{ originalPrompt, outcome, steps, expected }`. The scalars are
 *          `null` when the label is absent or has no value on its line.
 */
export function parseSpecHeader(source) {
    // Only look at text before the first `test(` / `test.describe(` so that
    // comments later in the file can't be mistaken for the header.
    const beforeFirstTest = source.split(/^\s*(?:test|test\.describe)\s*\(/m)[0] ?? source;
    const docBlocks = Array.from(beforeFirstTest.matchAll(/\/\*\*([\s\S]*?)\*\//g), (m) => m[1]);
    if (docBlocks.length === 0) {
        return { originalPrompt: null, outcome: null, steps: [], expected: [] };
    }
    // Skip banner comments: prefer the first block that has a known label.
    const block = docBlocks.find((body) => HEADER_SECTION_RE.test(body)) ?? docBlocks[0];
    // `[ \t]*` rather than `\s*` keeps each match on a single line; `\s*`
    // would cross the newline and capture the following line as the value
    // of an empty label.
    return {
        originalPrompt: extractScalar(block, /^[ \t]*\*[ \t]*Original prompt:[ \t]*(.+?)[ \t]*$/m),
        outcome: extractScalar(block, /^[ \t]*\*[ \t]*Outcome:[ \t]*(.+?)[ \t]*$/m),
        steps: extractList(block, /^[ \t]*\*[ \t]*Steps:[ \t]*$/m),
        expected: extractList(block, /^[ \t]*\*[ \t]*Expected:[ \t]*$/m),
    };
}
/**
 * Return the trimmed first capture group of `re` in `block`, or `null` when
 * there is no match or the captured value is empty after trimming.
 */
function extractScalar(block, re) {
    const value = block.match(re)?.[1]?.trim();
    return value ? value : null;
}
/**
 * Extract a JSDoc list-style block. Given a header regex matching "Steps:"
 * or "Expected:", read subsequent ` * <indented line>` lines until the next
 * top-level marker (blank ` *` line or another `Section:` header).
 *
 * @returns The item texts in order, with list markers removed; `[]` when the
 *          header is not present.
 */
function extractList(block, headerRe) {
    const header = block.match(headerRe);
    if (!header)
        return [];
    const start = (header.index ?? 0) + header[0].length;
    const items = [];
    for (const raw of block.slice(start).split('\n')) {
        // Stop at a blank JSDoc line (` *` only) or another `Section:` header.
        if (BLANK_DOC_LINE_RE.test(raw) || SECTION_LABEL_RE.test(raw))
            break;
        // Lines without a leading `*` (including the remainder of the
        // header line itself) simply don't match and are skipped.
        const item = raw.match(LIST_ITEM_RE);
        if (item && item[1])
            items.push(item[1]);
    }
    return items;
}
|
|
73
|
+
/**
 * Enumerate the `*.spec.ts` files directly inside `<devRoot>/__vibe_tests__/`
 * and summarise each one from its parsed header.
 *
 * @param devRoot Project root that contains the `__vibe_tests__` directory.
 * @returns Summaries sorted newest-first by mtime. An unreadable or missing
 *          directory yields `[]`; individual files that cannot be read or
 *          stat'ed are left out.
 */
export async function listSpecs(devRoot) {
    const specDir = join(devRoot, '__vibe_tests__');
    let fileNames;
    try {
        fileNames = await readdir(specDir);
    }
    catch {
        return [];
    }
    const suffix = '.spec.ts';
    const found = [];
    for (const fileName of fileNames) {
        if (fileName.endsWith(suffix)) {
            const path = join(specDir, fileName);
            let source;
            let info;
            try {
                source = await readFile(path, 'utf-8');
                info = await stat(path);
            }
            catch {
                continue;
            }
            const { originalPrompt, outcome, steps } = parseSpecHeader(source);
            found.push({
                slug: fileName.slice(0, fileName.length - suffix.length),
                path,
                originalPrompt,
                outcome,
                stepCount: steps.length,
                mtimeMs: info.mtimeMs,
            });
        }
    }
    return found.sort((left, right) => right.mtimeMs - left.mtimeMs);
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@hover-dev/core",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.11.0",
|
|
4
4
|
"description": "Hover's local Node service: agent invocation, Playwright CDP preflight, WebSocket bridge.",
|
|
5
5
|
"license": "Apache-2.0",
|
|
6
6
|
"author": "Hyperyond",
|
|
@@ -75,6 +75,7 @@
|
|
|
75
75
|
"verify-spec": "tsx src/scripts/verify-spec.ts",
|
|
76
76
|
"ws-smoke": "tsx src/scripts/ws-smoke.ts",
|
|
77
77
|
"bench-ttfb": "tsx src/scripts/bench-ttfb.ts",
|
|
78
|
+
"bench-multi-tab": "tsx src/scripts/bench-multi-tab.ts",
|
|
78
79
|
"test": "vitest run",
|
|
79
80
|
"test:watch": "vitest"
|
|
80
81
|
},
|