@hover-dev/core 0.16.0 → 0.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (154)
  1. package/README.md +26 -55
  2. package/dist/agentDirectives.d.ts +55 -0
  3. package/dist/agentDirectives.d.ts.map +1 -0
  4. package/dist/agentDirectives.js +276 -0
  5. package/dist/agents/claude.d.ts.map +1 -1
  6. package/dist/agents/claude.js +28 -3
  7. package/dist/agents/codex.d.ts.map +1 -1
  8. package/dist/agents/codex.js +29 -14
  9. package/dist/agents/invoke.d.ts.map +1 -1
  10. package/dist/agents/invoke.js +3 -6
  11. package/dist/agents/registry.d.ts.map +1 -1
  12. package/dist/agents/registry.js +0 -4
  13. package/dist/agents/types.d.ts +19 -11
  14. package/dist/agents/types.d.ts.map +1 -1
  15. package/dist/engine.d.ts +53 -0
  16. package/dist/engine.d.ts.map +1 -0
  17. package/dist/engine.js +78 -0
  18. package/dist/mcp/actuateServer.d.ts +3 -0
  19. package/dist/mcp/actuateServer.d.ts.map +1 -0
  20. package/dist/mcp/actuateServer.js +594 -0
  21. package/dist/mcp/sourceFence.d.ts.map +1 -1
  22. package/dist/mcp/sourceFence.js +4 -0
  23. package/dist/mcp/sourceServer.js +75 -0
  24. package/dist/memory/businessMemory.d.ts +29 -0
  25. package/dist/memory/businessMemory.d.ts.map +1 -0
  26. package/dist/memory/businessMemory.js +125 -0
  27. package/dist/modes.d.ts +39 -0
  28. package/dist/modes.d.ts.map +1 -0
  29. package/dist/modes.js +34 -0
  30. package/dist/playwright/cdpStatus.d.ts +0 -15
  31. package/dist/playwright/cdpStatus.d.ts.map +1 -1
  32. package/dist/playwright/cdpStatus.js +0 -67
  33. package/dist/playwright/launchChrome.d.ts +18 -0
  34. package/dist/playwright/launchChrome.d.ts.map +1 -1
  35. package/dist/playwright/launchChrome.js +46 -3
  36. package/dist/playwright/resolveMcpConfig.d.ts +7 -1
  37. package/dist/playwright/resolveMcpConfig.d.ts.map +1 -1
  38. package/dist/playwright/resolveMcpConfig.js +22 -4
  39. package/dist/plugin-api.d.ts +28 -26
  40. package/dist/plugin-api.d.ts.map +1 -1
  41. package/dist/plugin-api.js +2 -2
  42. package/dist/qa/candidates.d.ts +32 -0
  43. package/dist/qa/candidates.d.ts.map +1 -0
  44. package/dist/qa/candidates.js +20 -0
  45. package/dist/qa/classify.d.ts +38 -0
  46. package/dist/qa/classify.d.ts.map +1 -0
  47. package/dist/qa/classify.js +138 -0
  48. package/dist/qa/intensity.d.ts +33 -0
  49. package/dist/qa/intensity.d.ts.map +1 -0
  50. package/dist/qa/intensity.js +25 -0
  51. package/dist/qa/qaReport.d.ts +19 -0
  52. package/dist/qa/qaReport.d.ts.map +1 -0
  53. package/dist/qa/qaReport.js +50 -0
  54. package/dist/runSession.d.ts +14 -3
  55. package/dist/runSession.d.ts.map +1 -1
  56. package/dist/runSession.js +26 -11
  57. package/dist/service/cdpHandlers.d.ts +1 -21
  58. package/dist/service/cdpHandlers.d.ts.map +1 -1
  59. package/dist/service/cdpHandlers.js +4 -39
  60. package/dist/service/cdpHint.d.ts +21 -28
  61. package/dist/service/cdpHint.d.ts.map +1 -1
  62. package/dist/service/cdpHint.js +106 -164
  63. package/dist/service/relayHandlers.d.ts +28 -0
  64. package/dist/service/relayHandlers.d.ts.map +1 -0
  65. package/dist/service/relayHandlers.js +105 -0
  66. package/dist/service/saveHandlers.d.ts +1 -3
  67. package/dist/service/saveHandlers.d.ts.map +1 -1
  68. package/dist/service/saveHandlers.js +17 -15
  69. package/dist/service/types.d.ts +108 -8
  70. package/dist/service/types.d.ts.map +1 -1
  71. package/dist/service.d.ts +7 -3
  72. package/dist/service.d.ts.map +1 -1
  73. package/dist/service.js +907 -200
  74. package/dist/sessions/sessions.d.ts +125 -0
  75. package/dist/sessions/sessions.d.ts.map +1 -0
  76. package/dist/sessions/sessions.js +175 -0
  77. package/dist/specs/authFixture.d.ts +30 -0
  78. package/dist/specs/authFixture.d.ts.map +1 -0
  79. package/dist/specs/authFixture.js +145 -0
  80. package/dist/specs/businessMap.d.ts +29 -0
  81. package/dist/specs/businessMap.d.ts.map +1 -0
  82. package/dist/specs/businessMap.js +95 -0
  83. package/dist/specs/detectSharedFlows.d.ts +1 -1
  84. package/dist/specs/detectSharedFlows.d.ts.map +1 -1
  85. package/dist/specs/detectSharedFlows.js +20 -21
  86. package/dist/specs/generatePageObject.d.ts +1 -1
  87. package/dist/specs/generatePageObject.d.ts.map +1 -1
  88. package/dist/specs/healPrompt.d.ts +19 -0
  89. package/dist/specs/healPrompt.d.ts.map +1 -0
  90. package/dist/specs/healPrompt.js +48 -0
  91. package/dist/specs/humanSteps.d.ts +4 -8
  92. package/dist/specs/humanSteps.d.ts.map +1 -1
  93. package/dist/specs/humanSteps.js +6 -1
  94. package/dist/specs/optimizeSpec.d.ts +15 -8
  95. package/dist/specs/optimizeSpec.d.ts.map +1 -1
  96. package/dist/specs/optimizeSpec.js +71 -41
  97. package/dist/specs/optimizeSpecWithAgent.d.ts +0 -2
  98. package/dist/specs/optimizeSpecWithAgent.d.ts.map +1 -1
  99. package/dist/specs/optimizeSpecWithAgent.js +0 -1
  100. package/dist/specs/pageObjectManifest.d.ts +3 -1
  101. package/dist/specs/pageObjectManifest.d.ts.map +1 -1
  102. package/dist/specs/pageObjectManifest.js +13 -9
  103. package/dist/specs/replayGrounded.d.ts +45 -0
  104. package/dist/specs/replayGrounded.d.ts.map +1 -0
  105. package/dist/specs/replayGrounded.js +155 -0
  106. package/dist/specs/runFailures.d.ts +34 -0
  107. package/dist/specs/runFailures.d.ts.map +1 -0
  108. package/dist/specs/runFailures.js +93 -0
  109. package/dist/specs/seeds.d.ts +16 -15
  110. package/dist/specs/seeds.d.ts.map +1 -1
  111. package/dist/specs/seeds.js +86 -54
  112. package/dist/specs/sidecar.d.ts +34 -6
  113. package/dist/specs/sidecar.d.ts.map +1 -1
  114. package/dist/specs/sidecar.js +79 -9
  115. package/dist/specs/specStep.d.ts +21 -0
  116. package/dist/specs/specStep.d.ts.map +1 -0
  117. package/dist/specs/specStep.js +1 -0
  118. package/dist/specs/text.d.ts +8 -6
  119. package/dist/specs/text.d.ts.map +1 -1
  120. package/dist/specs/text.js +10 -7
  121. package/dist/specs/writeSpec.d.ts +62 -1
  122. package/dist/specs/writeSpec.d.ts.map +1 -1
  123. package/dist/specs/writeSpec.js +596 -21
  124. package/package.json +6 -9
  125. package/dist/agents/aider.d.ts +0 -16
  126. package/dist/agents/aider.d.ts.map +0 -1
  127. package/dist/agents/aider.js +0 -161
  128. package/dist/agents/cursor.d.ts +0 -18
  129. package/dist/agents/cursor.d.ts.map +0 -1
  130. package/dist/agents/cursor.js +0 -220
  131. package/dist/playwright/raiseWindow.d.ts +0 -10
  132. package/dist/playwright/raiseWindow.d.ts.map +0 -1
  133. package/dist/playwright/raiseWindow.js +0 -158
  134. package/dist/scripts/bench-multi-tab.d.ts +0 -2
  135. package/dist/scripts/bench-multi-tab.d.ts.map +0 -1
  136. package/dist/scripts/bench-multi-tab.js +0 -192
  137. package/dist/scripts/bench-ttfb.d.ts +0 -2
  138. package/dist/scripts/bench-ttfb.d.ts.map +0 -1
  139. package/dist/scripts/bench-ttfb.js +0 -127
  140. package/dist/scripts/start-chrome.d.ts +0 -3
  141. package/dist/scripts/start-chrome.d.ts.map +0 -1
  142. package/dist/scripts/start-chrome.js +0 -23
  143. package/dist/skills/writeSkill.d.ts +0 -27
  144. package/dist/skills/writeSkill.d.ts.map +0 -1
  145. package/dist/skills/writeSkill.js +0 -13
  146. package/dist/specs/listSpecs.d.ts +0 -52
  147. package/dist/specs/listSpecs.d.ts.map +0 -1
  148. package/dist/specs/listSpecs.js +0 -139
  149. package/dist/specs/optimizationSuggestion.d.ts +0 -26
  150. package/dist/specs/optimizationSuggestion.d.ts.map +0 -1
  151. package/dist/specs/optimizationSuggestion.js +0 -28
  152. package/dist/specs/writeCaseCsv.d.ts +0 -28
  153. package/dist/specs/writeCaseCsv.d.ts.map +0 -1
  154. package/dist/specs/writeCaseCsv.js +0 -134
package/dist/service.js CHANGED
@@ -10,82 +10,133 @@
10
10
  * { type: 'hello', payload: { agentId, model, version } }
11
11
  * { type: 'event', payload: InvokeEvent } // see agents/types.ts
12
12
  * { type: 'cdp-status', payload: { state, reason?, matchingTabUrl?, browser?, launching? } }
13
- * { type: 'specs-list', payload: { specs: SpecSummary[] } }
14
- * { type: 'seeds-list', payload: { seeds: { name, note, signature, code, source }[] } }
15
13
  * { type: 'spec-saved', payload: { name, path } }
16
14
  * { type: 'spec-exists', payload: { slug, existingPath } }
17
- * { type: 'case-csv-saved', payload: { name, path } }
18
- * { type: 'case-csv-exists', payload: { slug, existingPath } }
19
15
  * { type: 'error', payload: { message } }
20
16
  *
21
17
  * client → server
22
- * { type: 'command', payload: { text, sessionId?, reRecord?: { slug } } }
23
- * // when reRecord.slug is set, the
24
- * // service collects tool_use events
25
- * // into a step list and on a clean
26
- * // session_end overwrites
27
- * // __vibe_tests__/<slug>.spec.ts
18
+ * { type: 'command', payload: { text, sessionId? } }
28
19
  * { type: 'cancel' }
29
- * { type: 'check-cdp', payload: { pageUrl } } // "is this widget in the debug Chrome?"
30
20
  * { type: 'launch-chrome', payload: { pageUrl } } // start debug Chrome, navigate to pageUrl
31
- * { type: 'focus-debug', payload: { pageUrl } } // bringToFront the matching tab in debug Chrome
32
21
  * { type: 'save-spec', payload: { name, description, steps, assertions?, overwrite? } }
33
- * { type: 'save-case-csv', payload: { name, description, steps, assertions?, jiraProjectKey?, labels?, overwrite? } }
34
- * { type: 'list-specs' } // ask for every spec under __vibe_tests__/, with parsed JSDoc headers
35
- * { type: 'list-seeds' } // ask for built-in + .hover/rules/ translation seeds (read-only)
36
- * { type: 'list-agents' } // ask for the full agent registry + install status
37
22
  * { type: 'switch-agent', payload: { agentId } } // set the service's current agent; broadcasts to all connections
23
+ * { type: 'reveal-source', payload: { source } } // relay a data-hover-source value to other clients (F2 page→editor)
38
24
  *
39
25
  * server → client (in addition to those documented in the file body):
26
+ * { type: 'reveal-source', payload: { source } } // relayed to non-origin clients (the VSCode ext jumps the editor)
40
27
  * { type: 'agents', payload: { current: string, available: AgentAvailability[] } }
41
28
  * { type: 'modes', payload: { current: string|null, available: ModeEntry[] } }
42
29
  * { type: '<plugin-namespaced>', payload: <plugin-specific> }
43
30
  *
44
31
  * client → server (plugin-aware additions):
45
32
  * { type: 'set-mode', payload: { modeId: string|null } } // null = exit moded operation
46
- * { type: 'list-modes' }
47
33
  */
48
34
  import { WebSocketServer, WebSocket } from 'ws';
49
35
  import { fileURLToPath } from 'node:url';
50
- import { dirname, resolve } from 'node:path';
36
+ import { dirname, join, resolve } from 'node:path';
37
+ import { runDir } from './specs/sidecar.js';
38
+ import { readdirSync, statSync, mkdirSync, readFileSync } from 'node:fs';
39
+ import { tmpdir } from 'node:os';
51
40
  import { runSession } from './runSession.js';
52
41
  import { readConventions } from './service/conventions.js';
53
42
  import { optimizeSpecWithAgent } from './specs/optimizeSpecWithAgent.js';
54
- import { promoteOptimized, discardOptimized } from './specs/optimizeSpec.js';
43
+ import { parseRunFailures } from './specs/runFailures.js';
44
+ import { buildHealPrompt, healLabel } from './specs/healPrompt.js';
55
45
  import { listAgentAvailability, pickPrimaryAgent, } from './agents/detect.js';
56
46
  import { getAgent } from './agents/registry.js';
57
47
  import { getPreflight, invalidatePreflight } from './playwright/preflightCache.js';
58
48
  import { resolveMcpConfig, mcpToolPrefix } from './playwright/resolveMcpConfig.js';
59
- import { launchDebugChrome } from './playwright/launchChrome.js';
60
- import { listSpecs } from './specs/listSpecs.js';
61
- import { readSeeds, BUILTIN_SEEDS } from './specs/seeds.js';
49
+ import { launchDebugChrome, closeDebugChrome } from './playwright/launchChrome.js';
50
+ import { writeSessionRecord, parseFindings, tallyTools } from './sessions/sessions.js';
51
+ import { resolveModeBehavior, isBuiltinMode, BUILTIN_MODES } from './modes.js';
52
+ import { CJK_RE, ZH_OUTPUT_DIRECTIVE, GROUNDED_ACTUATION_DENY, REPORTING_DIRECTIVE, NARRATION_DIRECTIVE, ASK_FORMAT_DIRECTIVE, EXPLORATION_CHECKPOINT_DIRECTIVE, GROUNDED_ACTUATION_DIRECTIVE, RECON_DIRECTIVE, QA_EXPLORATION_DIRECTIVE, QA_VERIFY_DEFER_SECURITY_DIRECTIVE, } from './agentDirectives.js';
53
+ import { loadMemory, formatMemoryForPrompt, writeFact } from './memory/businessMemory.js';
54
+ import { writeQaReport } from './qa/qaReport.js';
55
+ import { finalizeCandidates } from './qa/candidates.js';
56
+ import { QA_INTENSITY, asQaIntensity, qaBudgetDirective } from './qa/intensity.js';
57
+ import { classifyInstruction } from './qa/classify.js';
62
58
  import { send, sendIfOpen } from './service/types.js';
59
+ import { handleRelayMessage } from './service/relayHandlers.js';
63
60
  import { buildCdpHint, buildCdpHintResume } from './service/cdpHint.js';
64
- import { handleCheckCdp, handleLaunchChrome, handleFocusDebug, } from './service/cdpHandlers.js';
65
- import { handleSaveArtifact, SPEC_CONFIG, CASE_CSV_CONFIG, } from './service/saveHandlers.js';
61
+ import { handleLaunchChrome, } from './service/cdpHandlers.js';
62
+ import { handleSaveArtifact, SPEC_CONFIG, } from './service/saveHandlers.js';
66
63
  import { CURRENT_API_VERSION, } from './plugin-api.js';
67
- /** The source-reader MCP server (codeContext). Id the `mcp__hover_source`
64
+ /** Tools whose steps crystallize to a replayable line (grounded actuations +
65
+ * navigation). Used to build a FALLBACK QA candidate from a completed run when
66
+ * the agent never called record_candidate — so crystallization doesn't depend
67
+ * on the agent's compliance. Structural typing avoids a SkillStep import. */
68
+ const CRYSTALLIZABLE_TOOLS = new Set([
69
+ 'click_control', 'fill_control', 'select_control', 'check_control',
70
+ 'upload_file', 'assert_visible', 'browser_navigate',
71
+ ]);
72
+ function bareToolName(tool) {
73
+ return tool.replace(/^mcp__[a-z0-9_-]+?__/, '');
74
+ }
75
+ function isCrystallizableStep(s) {
76
+ return s.kind === 'step' && !!s.tool && !s.isError && CRYSTALLIZABLE_TOOLS.has(bareToolName(s.tool));
77
+ }
78
+ /** A real interaction (not just navigation) — so a fallback candidate isn't a
79
+ * lone goto with nothing to replay. */
80
+ function isRealAction(s) {
81
+ return !!s.tool && bareToolName(s.tool) !== 'browser_navigate';
82
+ }
83
+ /** The source-reader MCP server (codeContext). Id → the `mcp__hoversource`
68
84
  * tool prefix; script path resolved relative to this module so it works from
69
85
  * dist/. Spawned only when codeContext is enabled. */
70
- const SOURCE_MCP_ID = 'hover-source';
86
+ const SOURCE_MCP_ID = 'hoversource'; // no hyphen — see CONTROL_MCP_ID note below
71
87
  const SOURCE_MCP_SCRIPT = resolve(dirname(fileURLToPath(import.meta.url)), 'mcp', 'sourceServer.js');
88
+ /** The control-actuation MCP server (always on) — force-toggles sr-only hidden
89
+ * radios/checkboxes the locked-down Playwright `browser_click` can't actuate. */
90
+ // NOTE: no hyphen. Claude forms MCP tool names as `mcp__<config-id>__<tool>`
91
+ // keeping the id verbatim, but our allow-list prefix sanitizes non-alphanumerics
92
+ // to `_` (mcpToolPrefix). A hyphenated id ('hover-control') yields allow
93
+ // `mcp__hover_control` which does NOT prefix-match the tool `mcp__hover-control__*`,
94
+ // so every actuation call gets denied by the hard sandbox. Keep it alphanumeric.
95
+ const CONTROL_MCP_ID = 'hovercontrol';
96
+ const CONTROL_MCP_SCRIPT = resolve(dirname(fileURLToPath(import.meta.url)), 'mcp', 'actuateServer.js');
72
97
  // ClientMessage + send moved to ./service/types.ts so the cdp + save
73
98
  // handler modules can share them. See those files for the wire shape.
74
99
  const PROTOCOL_VERSION = 1;
75
100
  const PORT_RETRIES = 10;
76
- /** CJK-presence test mirrors voice.js's detectLanguage. Any Han character
77
- * in the prompt flips the agent's prose output to Chinese. */
78
- const CJK_RE = /[一-鿿]/;
79
- /** Appended to the agent's system prompt when the user's prompt contains CJK,
80
- * so the human-facing prose (verification summary / ## Findings / step
81
- * narration) comes back in Chinese matching how Voice mode picks a Chinese
82
- * TTS voice for the same prompt. Deliberately scoped to PROSE only: the agent
83
- * must still use the page's real (often English) accessible names, labels,
84
- * and selectors when driving the browser. */
85
- const ZH_OUTPUT_DIRECTIVE = '用户使用中文下达指令。请用简体中文撰写所有面向用户的文字输出:验证结论摘要、' +
86
- '`## Findings` 区块(bug / 问题 / 备注)、以及每一步的中文描述。' +
87
- '注意:这只影响你写给用户看的文字。操作浏览器时仍要使用页面真实的(通常是英文的)' +
88
- '角色名、标签、可访问名称和选择器——不要把它们翻译成中文。';
101
+ /** An isolated, empty cwd for the agent when the user picks "Isolated" memory.
102
+ * `claude` keys its auto-memory by the absolute cwd path and discovers CLAUDE.md
103
+ * by walking up from cwd — so running in a throwaway temp dir (no .git / no
104
+ * ancestor CLAUDE.md) loads NONE of the user's project memory or CLAUDE.md,
105
+ * while their ~/.claude credentials (OAuth) stay intact. The default ("shared")
106
+ * keeps cwd = devRoot so the agent gets the project's context. */
107
+ function isolatedAgentCwd() {
108
+ const dir = resolve(tmpdir(), 'hover-agent-cwd');
109
+ try {
110
+ mkdirSync(dir, { recursive: true });
111
+ }
112
+ catch { /* best-effort */ }
113
+ return dir;
114
+ }
115
+ /** The most-recently-written `.png` in a directory (by mtime), or null. Used to
116
+ * resolve which screenshot a `browser_take_screenshot` just produced — the
117
+ * agent often lets the MCP auto-name the file, so the name isn't in the tool
118
+ * input; the freshest png in the run's output dir is it. Best-effort: never
119
+ * throws (a missing dir / race just yields null). */
120
+ function newestPng(dir) {
121
+ try {
122
+ let best = null;
123
+ let bestMtime = -1;
124
+ for (const f of readdirSync(dir)) {
125
+ if (!f.toLowerCase().endsWith('.png'))
126
+ continue;
127
+ const p = resolve(dir, f);
128
+ const mtime = statSync(p).mtimeMs;
129
+ if (mtime > bestMtime) {
130
+ bestMtime = mtime;
131
+ best = p;
132
+ }
133
+ }
134
+ return best;
135
+ }
136
+ catch {
137
+ return null;
138
+ }
139
+ }
89
140
  /**
90
141
  * Try to bind a WebSocketServer to <host>:<port>. Resolves with the wss on
91
142
  * success; rejects with the bind error (typically EADDRINUSE) on failure.
@@ -136,11 +187,6 @@ export async function startService(opts) {
136
187
  const preferred = opts.agentId ?? process.env.HOVER_AGENT;
137
188
  const primary = await pickPrimaryAgent(preferred);
138
189
  let currentAgentId = primary?.descriptor.id ?? preferred ?? 'claude';
139
- // Optional model API key the widget supplied (set-api-key). Held in memory
140
- // for this service's lifetime only — never written to disk, never logged.
141
- // Injected into the spawned CLI's env so a user without a logged-in
142
- // subscription can drive Hover on their own key.
143
- let currentApiKey = process.env.ANTHROPIC_API_KEY ?? process.env.OPENAI_API_KEY ?? undefined;
144
190
  if (!primary) {
145
191
  // Nothing installed — still bind so the widget can show a helpful
146
192
  // "install one of these" dialog. Commands will fail with
@@ -152,15 +198,56 @@ export async function startService(opts) {
152
198
  else if (preferred && preferred !== primary.descriptor.id) {
153
199
  process.stderr.write(`[hover] requested agent "${preferred}" is not installed; falling back to "${primary.descriptor.id}".\n`);
154
200
  }
155
- const model = opts.model ?? 'sonnet';
201
+ let model = opts.model ?? 'sonnet';
202
+ // Reasoning-effort level for runs (set via set-effort; undefined = agent/model
203
+ // default). Threaded into invokeAgent alongside model.
204
+ let currentEffort = opts.effort;
205
+ // Local LLM endpoint (set via set-local-endpoint): when the qwen agent is
206
+ // active, this OpenAI-compatible base URL is injected so qwen drives the
207
+ // user's self-hosted model instead of a hosted one.
208
+ let currentLocalBaseUrl;
209
+ // BYOK (set via set-byok): when present, runs are driven by the protocol's
210
+ // matching CLI with the user's key + base URL + model injected via env,
211
+ // instead of the local-CLI agent's own logged-in auth. null = use the CLI.
212
+ let currentByok = null;
213
+ // Protocol → CLI: Anthropic drives claude (hard sandbox), Gemini drives the
214
+ // gemini CLI, OpenAI / Azure / OpenAI-compatible gateways drive codex.
215
+ const byokAgentFor = (protocol) => protocol === 'anthropic' ? 'claude' : protocol === 'gemini' ? 'gemini' : 'codex';
216
+ // Protocol → auth env vars the matching CLI reads. Only set what's provided
217
+ // so an empty base URL leaves the CLI on its own default endpoint.
218
+ const byokEnvFor = (b) => {
219
+ const env = {};
220
+ if (b.protocol === 'anthropic') {
221
+ if (b.apiKey)
222
+ env.ANTHROPIC_API_KEY = b.apiKey;
223
+ if (b.baseUrl)
224
+ env.ANTHROPIC_BASE_URL = b.baseUrl;
225
+ }
226
+ else if (b.protocol === 'gemini') {
227
+ if (b.apiKey) {
228
+ env.GEMINI_API_KEY = b.apiKey;
229
+ env.GOOGLE_API_KEY = b.apiKey;
230
+ }
231
+ if (b.baseUrl)
232
+ env.GOOGLE_GEMINI_BASE_URL = b.baseUrl;
233
+ }
234
+ else {
235
+ // openai / azure / gateways — OpenAI-compatible, driven via codex.
236
+ if (b.apiKey)
237
+ env.OPENAI_API_KEY = b.apiKey;
238
+ if (b.baseUrl)
239
+ env.OPENAI_BASE_URL = b.baseUrl;
240
+ }
241
+ return env;
242
+ };
156
243
  // No default budget cap — long real-world flows (form filling, multi-step
157
244
  // checkouts) routinely run past the old $0.50 ceiling and got cut off
158
245
  // mid-run. The widget shows the running $ counter in the header instead,
159
246
  // so the user can hit Stop when they've seen enough. Pass maxBudgetUsd
160
247
  // explicitly (or via the Vite plugin option) if a hard ceiling is needed.
161
248
  const maxBudgetUsd = opts.maxBudgetUsd;
162
- const optimizeMode = opts.optimizeMode ?? 'suggest';
163
249
  const cdpUrl = opts.cdpUrl ?? 'http://localhost:9222';
250
+ const userDataDir = opts.userDataDir;
164
251
  const devRoot = opts.devRoot ?? process.cwd();
165
252
  const wss = await pickAndBind('127.0.0.1', requestedPort, PORT_RETRIES);
166
253
  const port = wss.address().port;
@@ -170,7 +257,7 @@ export async function startService(opts) {
170
257
  // forced an explicit one, but in that case mode-contributed servers
171
258
  // are silently dropped — we log a warning the first time it happens.
172
259
  let warnedExplicitMcpOverride = false;
173
- const buildMcpConfig = () => {
260
+ const buildMcpConfig = (shotDir, sourceGate = 'ask') => {
174
261
  if (opts.mcpConfig) {
175
262
  const activePlugin = currentModeId ? pluginsByModeId.get(currentModeId) : null;
176
263
  if (activePlugin?.mcpServers?.length && !warnedExplicitMcpOverride) {
@@ -187,7 +274,7 @@ export async function startService(opts) {
187
274
  for (const p of plugins) {
188
275
  for (const srv of p.mcpServers ?? []) {
189
276
  const scope = srv.activeInModes ?? (p.mode ? [p.mode.id] : []);
190
- const inMode = scope.includes('*') || scope.includes(currentModeId);
277
+ const inMode = scope.includes('*') || scope.includes(currentModeId) || apiScopeOk(scope) || pentestScopeOk(scope);
191
278
  if (!inMode)
192
279
  continue;
193
280
  extra.push({
@@ -203,14 +290,40 @@ export async function startService(opts) {
203
290
  }
204
291
  }
205
292
  // codeContext (opt-in, all modes): the fenced read-only source reader.
206
- if (opts.codeContext) {
293
+ // 'deny' drops it entirely; 'ask' makes it gate each read through the editor
294
+ // (HOVER_APPROVAL_PORT); 'always' lets it read without asking.
295
+ if (opts.codeContext && sourceGate !== 'deny') {
207
296
  extra.push({
208
297
  id: SOURCE_MCP_ID,
209
298
  command: process.execPath,
210
299
  args: [SOURCE_MCP_SCRIPT],
211
- env: { HOVER_PROJECT_ROOT: devRoot },
300
+ env: {
301
+ HOVER_PROJECT_ROOT: devRoot,
302
+ HOVER_SOURCE_GATE: sourceGate === 'ask' ? 'ask' : 'allow',
303
+ ...(sourceGate === 'ask' ? { HOVER_APPROVAL_PORT: String(port) } : {}),
304
+ },
212
305
  });
213
306
  }
307
+ // Control actuation (always on, all modes): force-toggles sr-only hidden
308
+ // radios/checkboxes the locked-down Playwright click can't actuate. Drives
309
+ // the same debug Chrome over CDP; crystallizes to a normal .check() step.
310
+ extra.push({
311
+ id: CONTROL_MCP_ID,
312
+ command: process.execPath,
313
+ args: [CONTROL_MCP_SCRIPT],
314
+ // HOVER_APPROVAL_PORT: the control MCP's ask_user tool reaches the editor
315
+ // over the service WS. HOVER_PROJECT_ROOT: where upload_file writes its
316
+ // placeholder fixture and resolves relative paths. HOVER_SHOT_DIR: where
317
+ // take_screenshot writes (the same per-run dir the service scans), so its
318
+ // viewport PNGs surface in the chat exactly like browser_take_screenshot's.
319
+ env: {
320
+ HOVER_CDP_URL: cdpUrl,
321
+ HOVER_DEV_URL: opts.devUrl ?? cdpUrl,
322
+ HOVER_APPROVAL_PORT: String(port),
323
+ HOVER_PROJECT_ROOT: devRoot,
324
+ ...(shotDir ? { HOVER_SHOT_DIR: shotDir } : {}),
325
+ },
326
+ });
214
327
  // Single-Chrome model: the Playwright MCP always points at the one debug
215
328
  // Chrome on the normal cdpUrl. (Pre-single-Chrome this branched to a
216
329
  // mode-specific port like 9333; there's no second Chrome anymore.)
@@ -221,6 +334,10 @@ export async function startService(opts) {
221
334
  // Suffix the filename by the mode so different mode toggles within
222
335
  // one service produce distinct config files (debugging aid).
223
336
  suffix: currentModeId ?? undefined,
337
+ // Screenshots / traces land in the run's own folder
338
+ // (.hover/runs/<conv>/<runId>/screenshots), grouped per run, instead of
339
+ // the MCP server's default OS temp dir.
340
+ outputDir: shotDir,
224
341
  });
225
342
  };
226
343
  // Surface post-listen errors instead of crashing the host process.
@@ -268,6 +385,21 @@ export async function startService(opts) {
268
385
  */
269
386
  const RECONNECT_GRACE_MS = 15_000;
270
387
  let activeRun = null;
388
+ /** QA candidate flows recorded by the agent this run (via record_candidate).
389
+ * Buffered here (connection scope, visible to both the message handler and
390
+ * the run lifecycle); reset at each run start; resolved to real steps and
391
+ * emitted as `qa-candidates` at run end. */
392
+ let runCandidates = [];
393
+ /** Reset recipe discovered by recon this run (via record_reset_recipe). Buffered
394
+ * here, forwarded to the extension's env store (.hover/environments.json, which
395
+ * the extension owns) at run end, keyed to the run's env. */
396
+ let runResetRecipe = null;
397
+ /** In-flight source-read approval requests: correlation id → the source-MCP
398
+ * socket that asked, so the editor's response can be routed back to it. */
399
+ const pendingApprovals = new Map();
400
+ /** In-flight ask_user prompts: correlation id → the control-MCP socket that
401
+ * asked, so the editor's answer routes back to the waiting agent. */
402
+ const pendingAsks = new Map();
271
403
  /** Send a run event to whichever ws is currently attached (survives reconnect). */
272
404
  const emitToRun = (msg) => {
273
405
  const c = activeRun?.client;
@@ -278,7 +410,7 @@ export async function startService(opts) {
278
410
  * (security's resident MITM). RESIDENT for the whole session — set once
279
411
  * before Chrome launches, never cleared on mode change — so the single
280
412
  * debug Chrome is born with `--proxy-server` + the SPKI pin and entering
281
- * Security mode is just a runtime flip of the proxy, not a Chrome relaunch.
413
+ * API-testing mode is just a runtime flip of the proxy, not a Chrome relaunch.
282
414
  * Read by `effectiveLaunchExtras()` and threaded into every cdp handler
283
415
  * (check-cdp / launch-chrome / focus-debug) plus the initial auto-launch. */
284
416
  let residentChromeProxy = null;
@@ -287,33 +419,67 @@ export async function startService(opts) {
287
419
  * Merged with the manifest-declared env when the agent's spawn-time
288
420
  * MCP config is built. */
289
421
  const mcpEnvOverrides = new Map();
422
+ // QA "API capability": QA is a built-in mode, but when its API capability is
423
+ // on it COMPOSES the api-test plugin's runtime — flips the resident MITM to
424
+ // intercept, exposes the api-test MCP tools, and adds its prompt — so the QA
425
+ // agent can inspect/replay/test the app's API calls alongside the UI flows.
426
+ const apiTestPlugin = plugins.find((p) => p.mode?.id === 'api-test') ?? null;
427
+ /** Is the API capability ACTUALLY usable? The plugin must be loaded AND its
428
+ * resident MITM proxy must be up (set at service:start). "Available" gates the
429
+ * UI toggle so a user can never turn ON something that would then fail. */
430
+ const apiCapabilityAvailable = () => !!apiTestPlugin && residentChromeProxy !== null;
431
+ /** Set per QA run when the API capability is on + available — drives the MCP
432
+ * config, the prompt, and the activate/deactivate of the resident proxy. */
433
+ let apiActiveThisRun = false;
434
+ /** A plugin's mode-scoped contribution also applies when it's the api-test
435
+ * plugin being composed into the current QA run. */
436
+ const apiScopeOk = (scope) => apiActiveThisRun && scope.includes('api-test');
437
+ // QA "Pentest capability": same composition as API, but the pentest plugin —
438
+ // offensive (attacks the OWN dev app), origin-locked, writes a findings report.
439
+ // Mutually exclusive with the API capability (the plugins conflict). Default
440
+ // OFF; the editor confirms before enabling.
441
+ const pentestPlugin = plugins.find((p) => p.mode?.id === 'pentest') ?? null;
442
+ const pentestCapabilityAvailable = () => !!pentestPlugin && residentChromeProxy !== null;
443
+ let pentestActiveThisRun = false;
444
+ const pentestScopeOk = (scope) => pentestActiveThisRun && scope.includes('pentest');
445
+ // QA two-pass: when a QA run has BOTH API + Pentest on, run two sequenced
446
+ // phases (verify first, pentest last) so the destructive pentest can't corrupt
447
+ // the verification and each phase gets a fresh, budget-bounded context. The
448
+ // verify phase runs first; this holds the queued pentest-phase command, which
449
+ // the verify run's finally re-dispatches.
450
+ let pendingPhase2 = null;
290
451
  /** The cdp-handler extras (proxy) threaded into launch-chrome / check-cdp /
291
452
  * focus-debug and the initial auto-launch. In the single-Chrome model this
292
453
  * is driven purely by the RESIDENT proxy (set in `hover:service:start`),
293
454
  * NOT by the active mode — there is one Chrome on the normal CDP port that
294
- * is always proxied; entering Security mode flips the proxy's behaviour,
455
+ * is always proxied; entering API-testing mode flips the proxy's behaviour,
295
456
  * it does not relaunch Chrome on a different port. Returns undefined when
296
457
  * no plugin set a resident proxy (the common no-security case), so plain
297
458
  * Hover is byte-for-byte unchanged. */
298
459
  const effectiveLaunchExtras = () => {
299
- if (!residentChromeProxy)
460
+ if (!residentChromeProxy && !userDataDir)
300
461
  return undefined;
301
- return { proxy: residentChromeProxy };
462
+ return {
463
+ ...(residentChromeProxy ? { proxy: residentChromeProxy } : {}),
464
+ ...(userDataDir ? { userDataDir } : {}),
465
+ };
302
466
  };
303
467
  /** Send the current mode catalogue to one ws (or all if undefined). */
304
468
  const broadcastModes = (target) => {
305
- const available = plugins
306
- .filter((p) => Boolean(p.mode))
307
- .map((p) => ({
308
- id: p.mode.id,
309
- label: p.mode.label,
310
- description: p.mode.description,
311
- // Widget retints to this while the mode is engaged (falls back to
312
- // security orange in the widget when absent).
313
- accent: p.mode.accent,
314
- pluginName: p.name,
315
- }));
316
- const payload = { current: currentModeId, available };
469
+ // The picker lists ONLY the built-in modes (Flow implicit + QA). The
470
+ // api-test / pentest PLUGINS still load — but they're surfaced as QA
471
+ // capability TOGGLES (apiCapabilityAvailable / pentestCapabilityAvailable
472
+ // below), NOT as standalone modes. (Listing plugin-contributed modes here is
473
+ // the old, removed UX: the mode picker is now Flow + QA Testing only.)
474
+ const builtins = BUILTIN_MODES.map((m) => ({ id: m.id, label: m.label, description: m.description, accent: m.accent }));
475
+ const payload = {
476
+ current: currentModeId,
477
+ available: builtins,
478
+ // Whether QA's API / Pentest capabilities can actually run (plugin loaded +
479
+ // MITM up). Gates the QA toggles so "on" always works.
480
+ apiCapabilityAvailable: apiCapabilityAvailable(),
481
+ pentestCapabilityAvailable: pentestCapabilityAvailable(),
482
+ };
317
483
  const targets = target ? [target] : [...wss.clients];
318
484
  for (const client of targets) {
319
485
  if (client.readyState === WebSocket.OPEN) {
@@ -363,6 +529,14 @@ export async function startService(opts) {
363
529
  if (newModeId) {
364
530
  const next = pluginsByModeId.get(newModeId);
365
531
  if (!next) {
532
+ // A built-in non-Flow mode (QA) is core-owned, not plugin-contributed —
533
+ // no activate hook / sidecars to run, just record it. Anything else is a
534
+ // genuinely unknown mode.
535
+ if (isBuiltinMode(newModeId)) {
536
+ currentModeId = newModeId;
537
+ broadcastModes();
538
+ return;
539
+ }
366
540
  throw new Error(`[hover] unknown modeId "${newModeId}"`);
367
541
  }
368
542
  currentModeId = newModeId;
@@ -428,7 +602,7 @@ export async function startService(opts) {
428
602
  wss.on('connection', ws => {
429
603
  send(ws, {
430
604
  type: 'hello',
431
- payload: { agentId: currentAgentId, model, version: PROTOCOL_VERSION, optimizeMode },
605
+ payload: { agentId: currentAgentId, model, version: PROTOCOL_VERSION },
432
606
  });
433
607
  // Send the agent list as a follow-up event so the widget can render the
434
608
  // dropdown immediately on connect / reconnect (e.g. after HMR). The
@@ -503,7 +677,10 @@ export async function startService(opts) {
503
677
  },
504
678
  });
505
679
  };
506
- ws.on('message', async (data) => {
680
+ // Named (not an inline arrow) so a QA run with both API + Pentest on can
681
+ // re-enter it for a sequenced second phase — see the phase split + the
682
+ // re-dispatch in the command path's finally.
683
+ const onClientMessage = async (data) => {
507
684
  let msg;
508
685
  try {
509
686
  msg = JSON.parse(data.toString());
@@ -515,8 +692,55 @@ export async function startService(opts) {
515
692
  cancel();
516
693
  return;
517
694
  }
518
- if (msg.type === 'list-modes') {
519
- broadcastModes(ws);
695
+ // Stateless relays (reveal-source / source-approval-* / ask-user-*) — see
696
+ // service/relayHandlers.ts. They route between sockets without touching the
697
+ // run's mutable state, so they live outside this closure.
698
+ if (handleRelayMessage(ws, msg, {
699
+ wss,
700
+ activeRunClient: () => activeRun?.client,
701
+ pendingApprovals,
702
+ pendingAsks,
703
+ }))
704
+ return;
705
+ // record-fact (from the control MCP's record_fact tool): persist a learned
706
+ // business rule into .hover/memory/. ONLY in QA/API modes — ignored
707
+ // elsewhere so Flow/Pentest never write business memory. Best-effort:
708
+ // a memory write must never break anything (it isn't even acked).
709
+ if (msg.type === 'record-fact') {
710
+ const f = msg.payload?.fact;
711
+ if (f && f.title && f.rule && (currentModeId === 'qa' || currentModeId === 'api-test')) {
712
+ const types = ['business-rule', 'expected-behavior', 'validation', 'access-policy'];
713
+ const type = types.includes(f.type) ? f.type : 'business-rule';
714
+ void writeFact(devRoot, { name: f.title, description: f.title, type, body: f.rule }).then((r) => {
715
+ if ('error' in r)
716
+ process.stderr.write(`[hover/qa] record-fact write failed: ${r.error}\n`);
717
+ });
718
+ }
719
+ return;
720
+ }
721
+ // record-candidate (from the control MCP's record_candidate tool): buffer a
722
+ // QA candidate flow. The MCP already captured the flow's real grounded
723
+ // steps, so we just hold them. ONLY in QA mode; emitted as `qa-candidates`
724
+ // at run end — never acked, never blocks a run.
725
+ if (msg.type === 'record-candidate') {
726
+ const c = msg.payload?.candidate;
727
+ if (c && typeof c.name === 'string' && Array.isArray(c.steps) && currentModeId === 'qa') {
728
+ runCandidates.push({
729
+ name: c.name,
730
+ description: typeof c.description === 'string' ? c.description : undefined,
731
+ steps: c.steps,
732
+ });
733
+ }
734
+ return;
735
+ }
736
+ // record-reset-recipe (from the control MCP's record_reset_recipe tool): the
737
+ // agent's state-reset classification for this app/env, discovered during
738
+ // recon. Buffer it; forwarded to the extension at run end (it owns
739
+ // .hover/environments.json), keyed to runEnv. Best-effort, never acked.
740
+ if (msg.type === 'record-reset-recipe') {
741
+ const r = msg.payload?.recipe;
742
+ if (r && typeof r.tier === 'number')
743
+ runResetRecipe = r;
520
744
  return;
521
745
  }
522
746
  if (msg.type === 'set-mode') {
@@ -535,7 +759,7 @@ export async function startService(opts) {
535
759
  });
536
760
  return;
537
761
  }
538
- if (wanted !== null && !pluginsByModeId.has(wanted)) {
762
+ if (wanted !== null && !isBuiltinMode(wanted) && !pluginsByModeId.has(wanted)) {
539
763
  send(ws, {
540
764
  type: 'error',
541
765
  payload: { message: `set-mode: unknown modeId "${wanted}"` },
@@ -555,13 +779,6 @@ export async function startService(opts) {
555
779
  }
556
780
  return;
557
781
  }
558
- if (msg.type === 'list-agents') {
559
- // Force a refresh — the user may have just installed a new CLI
560
- // and clicked the dropdown to see the change.
561
- const available = await getAvailability(true);
562
- send(ws, { type: 'agents', payload: { current: currentAgentId, available } });
563
- return;
564
- }
565
782
  if (msg.type === 'switch-agent') {
566
783
  const wanted = msg.payload?.agentId;
567
784
  if (typeof wanted !== 'string' || !wanted) {
@@ -597,46 +814,68 @@ export async function startService(opts) {
597
814
  await broadcastAgents();
598
815
  return;
599
816
  }
600
- if (msg.type === 'set-api-key') {
601
- // The widget supplies (or clears) a model API key. Stored in memory
602
- // only and injected into the spawned CLI's env at invoke time — never
603
- // persisted, never logged, never echoed back. Empty/missing clears it.
604
- const key = msg.payload?.key;
605
- currentApiKey = typeof key === 'string' && key.trim() ? key.trim() : undefined;
606
- const envVar = getAgent(currentAgentId)?.apiKeyEnv;
607
- send(ws, { type: 'api-key-status', payload: { hasKey: !!currentApiKey, envVar } });
817
+ if (msg.type === 'set-model') {
818
+ // Persist the model for subsequent runs (sonnet / opus / haiku / …).
819
+ // Refuse mid-run so an in-flight invocation keeps the model it started
820
+ // with. Applies from the next command.
821
+ const wanted = msg.payload?.model;
822
+ if (typeof wanted !== 'string' || !wanted) {
823
+ send(ws, { type: 'error', payload: { message: 'set-model: model is required' } });
824
+ return;
825
+ }
826
+ if (activeRun) {
827
+ send(ws, { type: 'error', payload: { message: 'set-model: a command is already running; stop it first' } });
828
+ return;
829
+ }
830
+ model = wanted;
831
+ send(ws, { type: 'hello', payload: { agentId: currentAgentId, model, version: PROTOCOL_VERSION } });
608
832
  return;
609
833
  }
610
- if (msg.type === 'list-specs') {
611
- // Widget asks for every spec under <devRoot>/__vibe_tests__/ so it
612
- // can render the Specs tab in the Saved-sessions overlay. Each
613
- // summary carries `originalPrompt` (parsed from the JSDoc header)
614
- // so the Re-record button can resubmit it as a normal command.
615
- const specs = await listSpecs(devRoot);
616
- send(ws, { type: 'specs-list', payload: { specs } });
834
+ if (msg.type === 'set-effort') {
835
+ // Reasoning-effort level for subsequent runs (empty string clears it
836
+ // and falls back to the agent/model default). Refused mid-run, like set-model.
837
+ const wanted = msg.payload?.effort;
838
+ if (typeof wanted !== 'string') {
839
+ send(ws, { type: 'error', payload: { message: 'set-effort: effort is required' } });
840
+ return;
841
+ }
842
+ if (activeRun) {
843
+ send(ws, { type: 'error', payload: { message: 'set-effort: a command is already running; stop it first' } });
844
+ return;
845
+ }
846
+ currentEffort = wanted || undefined;
617
847
  return;
618
848
  }
619
- if (msg.type === 'list-seeds') {
620
- // Widget's Seeds tab: show which translation seeds Hover sees — the
621
- // built-in set + whatever the user dropped in <devRoot>/.hover/rules/.
622
- // Read-only; users add seeds by hand (no download path).
623
- const builtinNames = new Set(BUILTIN_SEEDS.map(s => s.name));
624
- const seeds = (await readSeeds(devRoot)).map(s => ({
625
- name: s.name,
626
- note: s.note ?? '',
627
- signature: s.signature,
628
- code: s.example?.code ?? '',
629
- source: builtinNames.has(s.name) ? 'builtin' : 'project',
630
- }));
631
- send(ws, { type: 'seeds-list', payload: { seeds } });
849
+ if (msg.type === 'set-local-endpoint') {
850
+ // Base URL of the user's self-hosted OpenAI-compatible endpoint for the
851
+ // Local LLM agent (qwen-code as host). Empty string clears it.
852
+ const url = msg.payload?.baseUrl;
853
+ if (typeof url !== 'string') {
854
+ send(ws, { type: 'error', payload: { message: 'set-local-endpoint: baseUrl is required' } });
855
+ return;
856
+ }
857
+ currentLocalBaseUrl = url || undefined;
632
858
  return;
633
859
  }
634
- if (msg.type === 'save-spec') {
635
- await handleSaveArtifact(ws, msg, devRoot, SPEC_CONFIG);
860
+ if (msg.type === 'set-byok') {
861
+ // BYOK config for subsequent runs, or null to fall back to the
862
+ // local-CLI agent's own auth. Refused mid-run, like set-model.
863
+ if (activeRun) {
864
+ send(ws, { type: 'error', payload: { message: 'set-byok: a command is already running; stop it first' } });
865
+ return;
866
+ }
867
+ const c = msg.payload?.config;
868
+ currentByok = c && typeof c.protocol === 'string' ? c : null;
636
869
  return;
637
870
  }
638
- if (msg.type === 'save-case-csv') {
639
- await handleSaveArtifact(ws, msg, devRoot, CASE_CSV_CONFIG);
871
+ if (msg.type === 'refresh-agents') {
872
+ // Re-scan PATH (the user just installed a CLI) and re-broadcast.
873
+ await getAvailability(true);
874
+ await broadcastAgents();
875
+ return;
876
+ }
877
+ if (msg.type === 'save-spec') {
878
+ await handleSaveArtifact(ws, msg, devRoot, SPEC_CONFIG);
640
879
  return;
641
880
  }
642
881
  // Stage 7 (F7) widget flow: optimize a saved spec, then promote/discard
@@ -650,8 +889,15 @@ export async function startService(opts) {
650
889
  return;
651
890
  }
652
891
  try {
892
+ // Optimize is text-only refinement — run it on a CHEAP model: the
893
+ // user's `hover.optimizeModel` setting if set, else the agent's
894
+ // cheapModel (e.g. claude → haiku), else the session model. Keeps the
895
+ // refinement affordable (and viable to run often) without a big model.
896
+ const optimizeModel = (typeof msg.payload?.optimizeModel === 'string' && msg.payload.optimizeModel)
897
+ || getAgent(currentAgentId)?.cheapModel
898
+ || model;
653
899
  const res = await optimizeSpecWithAgent(devRoot, slug, {
654
- agentId: currentAgentId, model, maxBudgetUsd, apiKey: currentApiKey,
900
+ agentId: currentAgentId, model: optimizeModel, maxBudgetUsd,
655
901
  });
656
902
  send(ws, { type: 'optimize-result', payload: { slug, original: res.original, candidate: res.code } });
657
903
  }
@@ -661,31 +907,30 @@ export async function startService(opts) {
661
907
  }
662
908
  return;
663
909
  }
664
- if (msg.type === 'promote-optimized') {
910
+ // Self-heal Stage 2: build the heal prompt for a failed spec and bounce it
911
+ // back. The extension then runs it through the normal run path (runPrompt →
912
+ // command), so the repair streams into chat and crystallizes like any run —
913
+ // no run-path surgery. The failing locator comes from the latest Playwright
914
+ // run JSON (parseRunFailures); absent → buildHealPrompt degrades gracefully.
915
+ if (msg.type === 'heal-spec') {
665
916
  const slug = msg.payload?.slug;
666
- if (typeof slug !== 'string' || !slug) {
667
- send(ws, { type: 'error', payload: { message: 'promote-optimized: slug is required' } });
917
+ const specSource = typeof msg.payload?.specSource === 'string' ? msg.payload.specSource : '';
918
+ if (typeof slug !== 'string' || !slug || !specSource) {
919
+ send(ws, { type: 'error', payload: { message: 'heal-spec: slug and specSource are required' } });
668
920
  return;
669
921
  }
922
+ let failures = [];
670
923
  try {
671
- const path = await promoteOptimized(devRoot, slug);
672
- send(ws, { type: 'optimized-promoted', payload: { slug, path } });
673
- send(ws, { type: 'specs-list', payload: { specs: await listSpecs(devRoot) } });
674
- }
675
- catch (err) {
676
- const m = err instanceof Error ? err.message : String(err);
677
- send(ws, { type: 'error', payload: { message: `promote-optimized: ${m}` } });
678
- }
679
- return;
680
- }
681
- if (msg.type === 'discard-optimized') {
682
- const slug = msg.payload?.slug;
683
- if (typeof slug !== 'string' || !slug) {
684
- send(ws, { type: 'error', payload: { message: 'discard-optimized: slug is required' } });
685
- return;
924
+ const runsDir = join(devRoot, '.hover', 'runs');
925
+ const files = readdirSync(runsDir).filter((f) => f.endsWith('.json')).sort();
926
+ const newest = files.at(-1);
927
+ if (newest) {
928
+ failures = parseRunFailures(readFileSync(join(runsDir, newest), 'utf-8'))
929
+ .filter((f) => f.specFile.includes(slug));
930
+ }
686
931
  }
687
- await discardOptimized(devRoot, slug);
688
- send(ws, { type: 'optimized-discarded', payload: { slug } });
932
+ catch { /* no runs ledger yet — heal from the spec source alone */ }
933
+ send(ws, { type: 'heal-ready', payload: { slug, prompt: buildHealPrompt(slug, specSource, failures), label: healLabel(slug) } });
689
934
  return;
690
935
  }
691
936
  // v0.12 — plugin-contributed save handlers. Lookup is O(plugins),
@@ -721,35 +966,21 @@ export async function startService(opts) {
721
966
  });
722
967
  return;
723
968
  }
724
- if (msg.type === 'check-cdp') {
725
- await handleCheckCdp(ws, msg, cdpUrl, effectiveLaunchExtras());
726
- return;
727
- }
728
969
  if (msg.type === 'launch-chrome') {
729
970
  await handleLaunchChrome(ws, msg, cdpUrl, effectiveLaunchExtras());
730
971
  return;
731
972
  }
732
- if (msg.type === 'focus-debug') {
733
- await handleFocusDebug(ws, msg, cdpUrl, effectiveLaunchExtras());
734
- return;
735
- }
736
973
  if (msg.type !== 'command')
737
974
  return;
738
- const text = msg.payload?.text;
975
+ const rawText = msg.payload?.text;
739
976
  const resumeSessionId = typeof msg.payload?.sessionId === 'string' && msg.payload.sessionId.length > 0
740
977
  ? msg.payload.sessionId
741
978
  : undefined;
742
- // Re-record mode: when the client (widget Specs tab or hover CLI)
743
- // passes `reRecord: { slug }`, runSession collects the tool_use events
744
- // into a SpecStep[] and, on a clean finish, we overwrite the existing
745
- // __vibe_tests__/<slug>.spec.ts. Same flow the widget uses for "Save as
746
- // Spec", but the spec already exists and is being regenerated for the
747
- // current UI.
748
- const reRecordSlug = msg.payload && typeof msg.payload === 'object' && 'reRecord' in msg.payload
749
- ? msg.payload.reRecord?.slug
750
- : undefined;
751
- if (typeof text !== 'string' || !text.trim())
979
+ if (typeof rawText !== 'string' || !rawText.trim())
752
980
  return;
981
+ // `let` (typed string): the classify gate (below) may substitute a refined
982
+ // instruction (e.g. "read the page" → "test this page") before the run uses it.
983
+ let text = rawText;
753
984
  if (activeRun) {
754
985
  send(ws, {
755
986
  type: 'error',
@@ -765,12 +996,177 @@ export async function startService(opts) {
765
996
  prompt: text,
766
997
  };
767
998
  activeRun = run;
999
+ // Session-ledger state — declared outside the try so the catch path can
1000
+ // still record an aborted / thrown run (the spend view wants those too).
1001
+ const sessionStartedAt = new Date().toISOString();
1002
+ // One id per run, generated NOW (run start), so the ledger record, the
1003
+ // screenshots, and the QA report all share one folder. Replaces the old
1004
+ // split between an end-based record id and a start-based screenshotTag.
1005
+ const runId = `${sessionStartedAt.replace(/[:.]/g, '-')}-${Math.random().toString(16).slice(2, 6)}`;
1006
+ // The chat conversation this run belongs to (from the editor); groups all
1007
+ // its runs under one folder so deleting a conversation removes them.
1008
+ const conversationId = typeof msg.payload?.conversationId === 'string' && msg.payload.conversationId
1009
+ ? msg.payload.conversationId
1010
+ : 'default';
1011
+ const runDirPath = runDir(devRoot, conversationId, runId);
1012
+ const runShotDir = join(runDirPath, 'screenshots');
1013
+ let sessionEnd = {};
1014
+ // Findings + clean summary parsed from the ORIGINAL session_end summary,
1015
+ // captured before that summary is stripped of its ## Findings block for the
1016
+ // chat. recordSession reuses these so the ledger record + QA report keep the
1017
+ // findings (re-parsing the stripped summary would lose them).
1018
+ let runParsed = null;
1019
+ let sessionRecorded = false;
1020
+ runCandidates = []; // fresh per run — QA candidate flows accumulate below
1021
+ runResetRecipe = null; // fresh per run — recon may set it
1022
+ pendingPhase2 = null; // cleared each run; the phase split below may re-arm it
1023
+ // Reproducibility context captured up front (snapshot the mode now so a
1024
+ // mid-run switch can't smear it; the rest are filled as the run learns
1025
+ // them). Account labels are LABELS ONLY — never the credentials.
1026
+ const runMode = currentModeId;
1027
+ // QA intensity (per-run): Quick / Standard / Deep — bounds exploration with
1028
+ // a hard model-spend ceiling so "explore the whole app" can't run away.
1029
+ // Only meaningful in QA mode; ignored elsewhere.
1030
+ const runIntensity = asQaIntensity(msg.payload?.intensity);
1031
+ // QA API capability (per-run): when QA's API toggle is on AND the MITM is
1032
+ // available, compose the api-test runtime into this run. The UI only lets
1033
+ // the user turn it on when available, so "on" must actually work — if it's
1034
+ // requested but unavailable, say so loudly (don't silently degrade).
1035
+ const caps = msg.payload?.capabilities;
1036
+ const isPhase2 = msg.payload?.__phase2 === true;
1037
+ const pentestWanted = runMode === 'qa' && caps?.pentest === true && pentestCapabilityAvailable();
1038
+ const apiWanted = runMode === 'qa' && (caps?.api === true || caps?.api === undefined) && apiCapabilityAvailable();
1039
+ // Two-pass: pentest is destructive, so it always runs as a SECOND phase
1040
+ // after the verify phase (functional [+ API]). A first QA run with pentest
1041
+ // on runs verify now and queues a fresh-session pentest phase; the pentest
1042
+ // phase (isPhase2) then runs pentest alone.
1043
+ const splitting = !isPhase2 && pentestWanted;
1044
+ if (splitting) {
1045
+ pentestActiveThisRun = false; // phase 1 = verify (functional + API if on)
1046
+ apiActiveThisRun = apiWanted;
1047
+ pendingPhase2 = {
1048
+ type: 'command',
1049
+ payload: { ...msg.payload, capabilities: { api: false, pentest: true }, sessionId: undefined, __phase2: true },
1050
+ };
1051
+ }
1052
+ else {
1053
+ // Normal QA (no pentest), OR the queued pentest phase itself. Pentest and
1054
+ // API never run in the same phase.
1055
+ pentestActiveThisRun = pentestWanted;
1056
+ apiActiveThisRun = !pentestActiveThisRun && apiWanted;
1057
+ }
1058
+ // Defensive: the UI only enables these toggles when available, so an
1059
+ // explicit "on" should always be honoured. If it somehow isn't, log it
1060
+ // (the run continues as functional-only rather than failing the run).
1061
+ if (runMode === 'qa' && caps?.api === true && !apiActiveThisRun && !pentestActiveThisRun && !splitting && !apiCapabilityAvailable()) {
1062
+ process.stderr.write('[hover/qa] API capability requested but the api-test runtime is unavailable; running functional-only.\n');
1063
+ }
1064
+ if (runMode === 'qa' && caps?.pentest === true && !pentestCapabilityAvailable()) {
1065
+ process.stderr.write('[hover/qa] Pentest capability requested but the pentest runtime is unavailable; running functional-only.\n');
1066
+ }
1067
+ const runResumeOf = resumeSessionId;
1068
+ const runEnv = (() => {
1069
+ const e = msg.payload?.env;
1070
+ return e && typeof e === 'object' ? { id: e.id, name: e.name } : undefined;
1071
+ })();
1072
+ let runTargetUrl;
1073
+ let runAccountLabels;
1074
+ const recordSession = async (outcome, stepCount, detail) => {
1075
+ if (sessionRecorded)
1076
+ return;
1077
+ sessionRecorded = true;
1078
+ const endedAt = new Date().toISOString();
1079
+ // Prefer the findings captured at session_end (from the un-stripped
1080
+ // summary); fall back to parsing detail.summary (error/abort paths).
1081
+ const parsed = runParsed ?? (detail?.summary ? parseFindings(detail.summary) : { summary: '', findings: [] });
1082
+ const toolCounts = detail?.steps ? tallyTools(detail.steps) : undefined;
1083
+ const target = runTargetUrl || runEnv ? { url: runTargetUrl, id: runEnv?.id, name: runEnv?.name } : undefined;
1084
+ const rec = await writeSessionRecord(devRoot, conversationId, runId, {
1085
+ startedAt: sessionStartedAt,
1086
+ endedAt,
1087
+ durationMs: Date.parse(endedAt) - Date.parse(sessionStartedAt),
1088
+ agent: currentAgentId,
1089
+ model,
1090
+ mode: runMode,
1091
+ prompt: text,
1092
+ outcome,
1093
+ errorReason: detail?.errorReason,
1094
+ summary: parsed.summary || undefined,
1095
+ findings: parsed.findings.length ? parsed.findings : undefined,
1096
+ toolCounts: toolCounts && Object.keys(toolCounts).length ? toolCounts : undefined,
1097
+ target: target ? { url: target.url, envId: target.id, envName: target.name } : undefined,
1098
+ accountLabels: runAccountLabels,
1099
+ resumeOf: runResumeOf,
1100
+ turns: sessionEnd.turns,
1101
+ costUsd: sessionEnd.costUsd,
1102
+ tokensUsed: sessionEnd.tokens,
1103
+ stepCount,
1104
+ });
1105
+ // QA mode is report-first: persist a durable Markdown findings report
1106
+ // (mirrors pentest's report file; the chat already shows the Findings
1107
+ // card live). Best-effort — never breaks the run/ledger.
1108
+ if (runMode === 'qa') {
1109
+ const r = await writeQaReport(runDirPath, {
1110
+ prompt: text,
1111
+ summary: parsed.summary,
1112
+ findings: parsed.findings,
1113
+ endedAt,
1114
+ targetUrl: runTargetUrl,
1115
+ });
1116
+ if ('error' in r)
1117
+ process.stderr.write(`[hover/qa] report write failed: ${r.error}\n`);
1118
+ // Surface the report as a clickable artifact in the chat (mirrors the
1119
+ // screenshot event). The editor opens it on click.
1120
+ else if (!run.cancelled)
1121
+ emitToRun({ type: 'qa-report', payload: { path: r.path } });
1122
+ }
1123
+ // Let the active mode's plugin persist its own per-run artifacts bound to
1124
+ // this session id (e.g. api-test writes .hover/api/<id>.json). Best-effort.
1125
+ const sid = rec && 'id' in rec ? rec.id : null;
1126
+ const runEndPlugin = runMode ? pluginsByModeId.get(runMode) : null;
1127
+ if (sid && runEndPlugin?.hooks?.['hover:run:end']) {
1128
+ try {
1129
+ await runEndPlugin.hooks['hover:run:end']({ devRoot, broadcast: broadcastPluginEvent, sessionId: sid });
1130
+ }
1131
+ catch (err) {
1132
+ process.stderr.write(`[hover] plugin "${runEndPlugin.name}" run:end failed: ${err instanceof Error ? err.message : String(err)}\n`);
1133
+ }
1134
+ }
1135
+ // QA + API: persist this run's captured API traffic/checks, then flip the
1136
+ // resident MITM back to passthrough (stop recording). Best-effort.
1137
+ if (apiActiveThisRun && apiTestPlugin) {
1138
+ try {
1139
+ if (sid)
1140
+ await apiTestPlugin.hooks?.['hover:run:end']?.({ devRoot, broadcast: broadcastPluginEvent, sessionId: sid });
1141
+ await apiTestPlugin.hooks?.['hover:mode:deactivate']?.({ devRoot, broadcast: broadcastPluginEvent, modeId: 'qa' });
1142
+ }
1143
+ catch (err) {
1144
+ process.stderr.write(`[hover/qa] api-test compose (run:end) failed: ${err instanceof Error ? err.message : String(err)}\n`);
1145
+ }
1146
+ apiActiveThisRun = false;
1147
+ }
1148
+ // QA + Pentest: stop recording (back to passthrough). The findings are in
1149
+ // the agent's report; a deep PoC report is available via Save. Best-effort.
1150
+ if (pentestActiveThisRun && pentestPlugin) {
1151
+ try {
1152
+ await pentestPlugin.hooks?.['hover:mode:deactivate']?.({ devRoot, broadcast: broadcastPluginEvent, modeId: 'qa' });
1153
+ }
1154
+ catch (err) {
1155
+ process.stderr.write(`[hover/qa] pentest compose (run:end) failed: ${err instanceof Error ? err.message : String(err)}\n`);
1156
+ }
1157
+ pentestActiveThisRun = false;
1158
+ }
1159
+ };
768
1160
  try {
769
1161
  // Build the MCP config first — it's pure local file IO and lets
770
1162
  // us assert plugin-contributed servers landed in the config even
771
1163
  // when CDP preflight subsequently fails (useful for smoke tests
772
1164
  // that don't have a real debug Chrome wired up).
773
- const mcpConfig = buildMcpConfig();
1165
+ // This run's screenshots go in its own folder
1166
+ // (.hover/conversations/<conversationId>/<runId>/screenshots) — runShotDir,
1167
+ // computed at run start so the ledger record + report + shots all share it.
1168
+ const sourceGate = msg.payload?.sourceAccess ?? 'ask';
1169
+ const mcpConfig = buildMcpConfig(runShotDir, sourceGate);
774
1170
  // Preflight: refuse to invoke if CDP isn't reachable. Otherwise the
775
1171
  // Playwright MCP server would silently launch its own Chromium —
776
1172
  // and Hover's premise is to drive the user's existing Chrome (with
@@ -785,14 +1181,85 @@ export async function startService(opts) {
785
1181
  summary: cdp.reason,
786
1182
  },
787
1183
  });
1184
+ // A preflight failure is the most common "why did my run die" — make
1185
+ // it a diagnostic ledger row rather than silently returning.
1186
+ await recordSession('error', 0, { errorReason: cdp.reason });
788
1187
  return;
789
1188
  }
1189
+ // Target URL for the ledger: the localhost tab (the dev server) if we
1190
+ // have one, else the first tab.
1191
+ runTargetUrl =
1192
+ cdp.tabs?.find((t) => /localhost|127\.0\.0\.1/.test(t.url))?.url ?? cdp.tabs?.[0]?.url;
1193
+ // ── Pre-flight classify gate (QA only) ──────────────────────────────
1194
+ // Route the instruction with a cheap one-shot call BEFORE paying for the
1195
+ // full exploratory run. Fresh user instructions only — skip on resume and
1196
+ // on the internal pentest phase-2 re-dispatch (both already vetted).
1197
+ // Fail-open (→ go) lives inside classifyInstruction, so a hiccup never
1198
+ // blocks a legitimate run. 'refuse' / 'clarify' emit a 0-step session_end
1199
+ // and return WITHOUT creating a run folder or ledger record (ephemeral,
1200
+ // like the CDP check); the extension renders a plain reply / clickable
1201
+ // options from the same event a 0-action run produces.
1202
+ if (runMode === 'qa' && !resumeSessionId && !isPhase2 && typeof text === 'string') {
1203
+ // Show immediate activity for the ~1s classify (flips the UI to
1204
+ // "Working"); the real run emits its own session_start on 'go'.
1205
+ emitToRun({ type: 'event', payload: { kind: 'session_start', sessionId: '' } });
1206
+ const classifyAgentId = currentByok ? byokAgentFor(currentByok.protocol) : currentAgentId;
1207
+ let classifyMemory;
1208
+ try {
1209
+ classifyMemory = formatMemoryForPrompt(await loadMemory(devRoot)) || undefined;
1210
+ }
1211
+ catch { /* best-effort */ }
1212
+ const verdict = await classifyInstruction({
1213
+ agentId: classifyAgentId,
1214
+ instruction: text,
1215
+ pageUrl: runTargetUrl,
1216
+ pageTitle: cdp.tabs?.find((t) => t.url === runTargetUrl)?.title,
1217
+ memory: classifyMemory,
1218
+ // Cheap + fast for claude; BYOK / other agents use their configured model.
1219
+ model: classifyAgentId === 'claude' && !currentByok ? 'haiku' : currentByok?.model || model,
1220
+ effort: currentEffort,
1221
+ cwd: msg.payload?.isolateContext === true ? isolatedAgentCwd() : devRoot,
1222
+ env: currentByok
1223
+ ? byokEnvFor(currentByok)
1224
+ : classifyAgentId === 'qwen' && currentLocalBaseUrl
1225
+ ? { OPENAI_BASE_URL: currentLocalBaseUrl, OPENAI_API_KEY: process.env.OPENAI_API_KEY || 'local' }
1226
+ : undefined,
1227
+ signal: run.abort.signal,
1228
+ });
1229
+ if (run.cancelled)
1230
+ return; // user hit Stop during classify
1231
+ if (verdict.route === 'refuse') {
1232
+ pendingPhase2 = null; // don't let a queued pentest phase fire on a refused instruction
1233
+ emitToRun({
1234
+ type: 'event',
1235
+ payload: {
1236
+ kind: 'session_end',
1237
+ isError: false,
1238
+ summary: verdict.reason || 'I can only help test this app — tell me a page, feature, or flow to test.',
1239
+ },
1240
+ });
1241
+ return;
1242
+ }
1243
+ if (verdict.route === 'clarify' && verdict.options && verdict.options.length >= 2) {
1244
+ pendingPhase2 = null;
1245
+ const question = verdict.reason || 'What would you like me to test?';
1246
+ const block = ['```hover-ask', ...verdict.options.map((o) => `- ${o}`), '```'].join('\n');
1247
+ emitToRun({
1248
+ type: 'event',
1249
+ payload: { kind: 'session_end', isError: false, summary: `${question}\n\n${block}` },
1250
+ });
1251
+ return;
1252
+ }
1253
+ // 'go' — run it, substituting the re-interpreted instruction if any.
1254
+ if (verdict.refinedInstruction)
1255
+ text = verdict.refinedInstruction;
1256
+ }
790
1257
  // Build a system-prompt addendum telling the agent about the user's
791
1258
  // current tab. The most common waste we observed: agent calls
792
1259
  // browser_navigate to the same URL the user is already on, triggering
793
- // a wasteful full-page reload that also destroys the Hover widget
794
- // momentarily (the widget re-injects + recovers, but the agent's
795
- // own session sometimes gets confused).
1260
+ // a wasteful full-page reload that discards the app state the run had
1261
+ // built up (login session, form input, position in a flow) — so the
1262
+ // agent has to redo work and sometimes loses track of where it was.
796
1263
  // First turn pays the full rules + narration block; follow-up
797
1264
  // turns (`resumeSessionId` set) get only the volatile tab list.
798
1265
  // The static rules are already in the prior turn's context, and
@@ -824,7 +1291,9 @@ export async function startService(opts) {
824
1291
  // is always-on (treated as if activeInModes was '*').
825
1292
  const scope = add.activeInModes ?? (p.mode ? [p.mode.id] : ['*']);
826
1293
  const inScope = scope.includes('*') ||
827
- (currentModeId !== null && scope.includes(currentModeId));
1294
+ (currentModeId !== null && scope.includes(currentModeId)) ||
1295
+ apiScopeOk(scope) ||
1296
+ pentestScopeOk(scope);
828
1297
  if (inScope) {
829
1298
  appendSystemPrompt = `${appendSystemPrompt}\n\n${add.text}`;
830
1299
  }
@@ -835,7 +1304,23 @@ export async function startService(opts) {
835
1304
  // authoring; white-box confirmation when probing) instead of only
836
1305
  // guessing from the rendered DOM.
837
1306
  if (opts.codeContext) {
838
- appendSystemPrompt = `${appendSystemPrompt}\n\nYou also have read-only access to this project's source via mcp__hover_source (read_source / list_source), fenced to the repo (secrets, keys, .env, .git, node_modules and build output are refused). Use it to read the actual component / route / API code — write tests against the real selectors and, when probing for security issues, confirm a finding against the server code (the query, the authz check) rather than guessing from the page alone.`;
1307
+ appendSystemPrompt = `${appendSystemPrompt}\n\nYou also have read-only access to this project's source via mcp__hoversource (read_source / list_source), fenced to the repo (secrets, keys, .env, .git, node_modules and build output are refused). Use it to read the actual component / route / API code — write tests against the real selectors and, when probing for security issues, confirm a finding against the server code (the query, the authz check) rather than guessing from the page alone.\n\nIMPORTANT — when you get stuck or confused, READ THE CODE before concluding anything: a control you can't operate (a click that does nothing, a field that won't take input), validation that blocks you with no visible reason, a conditional section that won't appear. Use list_source / read_source to open that component's source and look at the real markup, CSS (e.g. visually-hidden / sr-only inputs), event handlers, and state wiring. Base your diagnosis and your next action on what the code actually does — never assert a framework / state / onChange bug you have not seen in the source. Reading source may require the user's one-click approval; if a read is declined or unavailable, just continue from what you can observe on the page and report honestly — do not retry the read in a loop, and do not fall back to guessing an unseen cause.`;
1308
+ }
1309
+ // Test accounts the prompt referenced via @label (resolved by the editor
1310
+ // from its vault). Injected here, NOT in the user-visible transcript, so
1311
+ // the agent can log in; the literal values it types are redacted out of
1312
+ // the saved spec (writeSpec redactions). Never echoed to the user.
1313
+ const runAccounts = Array.isArray(msg.payload?.accounts) ? msg.payload.accounts : [];
1314
+ if (runAccounts.length) {
1315
+ // Ledger keeps LABELS ONLY — never the username/password.
1316
+ runAccountLabels = runAccounts.map((a) => a.label);
1317
+ const lines = runAccounts.map(a => {
1318
+ const role = a.role ? ` (${a.role})` : '';
1319
+ const user = a.username ? `username ${JSON.stringify(a.username)}` : 'username not on file';
1320
+ const pass = a.password ? `, password ${JSON.stringify(a.password)}` : '';
1321
+ return `- @${a.label}${role}: ${user}${pass}`;
1322
+ }).join('\n');
1323
+ appendSystemPrompt = `${appendSystemPrompt}\n\nTest accounts available for this run — when the task refers to an @label, log in using that account's credentials. Use them ONLY to fill authentication fields; never print or echo them in your replies or summaries.\n${lines}`;
839
1324
  }
840
1325
  // Mirror the prompt's language in the agent's *prose* output — the
841
1326
  // verification summary (Result card), the ## Findings block, and the
@@ -848,24 +1333,83 @@ export async function startService(opts) {
848
1333
  if (CJK_RE.test(text)) {
849
1334
  appendSystemPrompt = `${appendSystemPrompt}\n\n${ZH_OUTPUT_DIRECTIVE}`;
850
1335
  }
1336
+ // The report is about the app, never the tooling (all modes).
1337
+ appendSystemPrompt = `${appendSystemPrompt}\n\n${REPORTING_DIRECTIVE}`;
1338
+ // Keep interim narration to one short line per intent (all modes).
1339
+ appendSystemPrompt = `${appendSystemPrompt}\n\n${NARRATION_DIRECTIVE}`;
1340
+ // ASK_FORMAT (propose choices when the request is vague) + EXPLORATION_
1341
+ // CHECKPOINT (ask before stopping with scope left) are for the DIRECTED
1342
+ // modes. QA is autonomous: a vague request means "explore the whole app",
1343
+ // NOT "ask what to test", and QA_EXPLORATION owns its own stop condition —
1344
+ // so skip both for QA (they made QA ask-at-start instead of exploring).
1345
+ if (currentModeId !== 'qa') {
1346
+ appendSystemPrompt = `${appendSystemPrompt}\n\n${ASK_FORMAT_DIRECTIVE}`;
1347
+ appendSystemPrompt = `${appendSystemPrompt}\n\n${EXPLORATION_CHECKPOINT_DIRECTIVE}`;
1348
+ }
1349
+ // Grounded actuation — the agent uses mcp__hover-control__* instead of
1350
+ // the Playwright interaction tools, so saved selectors are role+name,
1351
+ // never a confabulated getByText. Driven by the mode's behavior (Flow +
1352
+ // QA: yes; plugin modes: no), NOT by `currentModeId === null` — a future
1353
+ // built-in mode (QA) has a non-null id but still wants grounded steps.
1354
+ const groundedActuation = resolveModeBehavior(currentModeId).groundedActuation;
1355
+ if (groundedActuation) {
1356
+ appendSystemPrompt = `${appendSystemPrompt}\n\n${GROUNDED_ACTUATION_DIRECTIVE}`;
1357
+ }
1358
+ // State-reset recon (debt-2 reproducible-state-isolation): ONLY when the
1359
+ // extension explicitly asks (it knows whether this env already has a
1360
+ // recipe). Off by default — recon clears client state, which would wipe a
1361
+ // logged-in session, so it must never run unsolicited or on a plain Flow
1362
+ // recording. (Engine plumbing is live; the extension opt-in is piece C.)
1363
+ if (groundedActuation && msg.payload?.reconReset === true) {
1364
+ appendSystemPrompt = `${appendSystemPrompt}\n\n${RECON_DIRECTIVE}`;
1365
+ }
1366
+ // QA mode: autonomous exploratory testing on top of grounded actuation,
1367
+ // bounded by the run's intensity budget (so the agent paces itself and
1368
+ // always writes a report rather than running away on cost).
1369
+ if (currentModeId === 'qa') {
1370
+ appendSystemPrompt = `${appendSystemPrompt}\n\n${QA_EXPLORATION_DIRECTIVE}`;
1371
+ appendSystemPrompt = `${appendSystemPrompt}\n\n${qaBudgetDirective(runIntensity)}`;
1372
+ // Two-pass: this is the functional verify pass with a pentest pass
1373
+ // queued behind it — keep it functional-only so it doesn't duplicate
1374
+ // the pentest pass's security work (the overlap that read like a
1375
+ // double security run).
1376
+ if (splitting)
1377
+ appendSystemPrompt = `${appendSystemPrompt}\n\n${QA_VERIFY_DEFER_SECURITY_DIRECTIVE}`;
1378
+ }
1379
+ // Business memory (QA + API modes only): inject what earlier runs learned
1380
+ // about THIS app so the agent doesn't re-ask answered business questions.
1381
+ // Best-effort — a memory read must never block a run.
1382
+ if (currentModeId === 'qa' || currentModeId === 'api-test') {
1383
+ try {
1384
+ const mem = formatMemoryForPrompt(await loadMemory(devRoot));
1385
+ if (mem)
1386
+ appendSystemPrompt = `${appendSystemPrompt}\n\n${mem}`;
1387
+ }
1388
+ catch { /* memory is best-effort */ }
1389
+ }
851
1390
  // Snapshot the agent id so a switch-agent message during the run
852
1391
  // can't smear two agents across one invocation. (We also gate
853
1392
  // switch-agent on an active run, but defense in depth.) runSession gates
854
1393
  // the allow/deny lists on the agent's sandboxStrength internally.
855
- const invokedAgentId = currentAgentId;
1394
+ // BYOK overrides the active CLI: the protocol picks which CLI is
1395
+ // driven; key/base/model are injected via env below. Otherwise the
1396
+ // user's selected local-CLI agent runs with its own auth.
1397
+ const invokedAgentId = currentByok ? byokAgentFor(currentByok.protocol) : currentAgentId;
1398
+ const effectiveModel = currentByok?.model || model;
856
1399
  // Active mode's plugin-contributed MCP server ids — added to the
857
1400
  // hard-sandbox allow list so Claude can actually call them. Claude
858
1401
  // sanitises non-alphanumeric chars in the id when forming tool
859
- // names (e.g. "@hover-dev/security:flows" → "mcp__hover_dev_security_flows"),
1402
+ // names (e.g. "@hover-dev/api-test:flows" → "mcp__hover_dev_api_test_flows"),
860
1403
  // and `--allowedTools mcp__foo` matches every tool under that
861
1404
  // prefix. We pass the prefix `mcp__<sanitized>` so all of the
862
1405
  // server's tools are reachable.
863
- const activePluginMcpIds = [];
1406
+ // Control actuation is always reachable (every mode).
1407
+ const activePluginMcpIds = [mcpToolPrefix(CONTROL_MCP_ID)];
864
1408
  if (currentModeId) {
865
1409
  for (const p of plugins) {
866
1410
  for (const srv of p.mcpServers ?? []) {
867
1411
  const scope = srv.activeInModes ?? (p.mode ? [p.mode.id] : []);
868
- if (scope.includes('*') || scope.includes(currentModeId)) {
1412
+ if (scope.includes('*') || scope.includes(currentModeId) || apiScopeOk(scope) || pentestScopeOk(scope)) {
869
1413
  activePluginMcpIds.push(mcpToolPrefix(srv.id));
870
1414
  }
871
1415
  }
@@ -874,65 +1418,188 @@ export async function startService(opts) {
874
1418
  // codeContext: the fenced source reader is allowed in every mode.
875
1419
  if (opts.codeContext)
876
1420
  activePluginMcpIds.push(mcpToolPrefix(SOURCE_MCP_ID));
1421
+ // Mark a per-run boundary on the active mode's plugin (api-test scopes its
1422
+ // recorded checks to this run, not the whole session). Best-effort.
1423
+ const runStartPlugin = currentModeId ? pluginsByModeId.get(currentModeId) : null;
1424
+ if (runStartPlugin?.hooks?.['hover:run:start']) {
1425
+ try {
1426
+ await runStartPlugin.hooks['hover:run:start']({ devRoot, broadcast: broadcastPluginEvent });
1427
+ }
1428
+ catch (err) {
1429
+ process.stderr.write(`[hover] plugin "${runStartPlugin.name}" run:start failed: ${err instanceof Error ? err.message : String(err)}\n`);
1430
+ }
1431
+ }
1432
+ // QA + API capability: compose the api-test runtime into this QA run —
1433
+ // flip the resident MITM to intercept (activate) + mark its run boundary,
1434
+ // so the QA agent's API calls are captured/replayable. Mirror-undone at
1435
+ // run end. Best-effort: a hook failure must not break the functional run.
1436
+ if (apiActiveThisRun && apiTestPlugin) {
1437
+ try {
1438
+ await apiTestPlugin.hooks?.['hover:mode:activate']?.({
1439
+ devRoot,
1440
+ broadcast: broadcastPluginEvent,
1441
+ modeId: 'qa',
1442
+ setChromeProxy(proxy) { residentChromeProxy = proxy; },
1443
+ setMcpServerEnv(id, env) { mcpEnvOverrides.set(id, env); },
1444
+ });
1445
+ await apiTestPlugin.hooks?.['hover:run:start']?.({ devRoot, broadcast: broadcastPluginEvent });
1446
+ }
1447
+ catch (err) {
1448
+ process.stderr.write(`[hover/qa] api-test compose (run:start) failed: ${err instanceof Error ? err.message : String(err)}\n`);
1449
+ }
1450
+ }
1451
+ // QA + Pentest capability: compose the pentest runtime — flip the resident
1452
+ // MITM to intercept so the agent's offensive probes are recorded. The
1453
+ // PENTEST_SYSTEM_PROMPT (origin-locked, own-app) is added via the scope
1454
+ // checks above. Mirror-undone at run end. Best-effort.
1455
+ if (pentestActiveThisRun && pentestPlugin) {
1456
+ try {
1457
+ await pentestPlugin.hooks?.['hover:mode:activate']?.({
1458
+ devRoot,
1459
+ broadcast: broadcastPluginEvent,
1460
+ modeId: 'qa',
1461
+ setChromeProxy(proxy) { residentChromeProxy = proxy; },
1462
+ setMcpServerEnv(id, env) { mcpEnvOverrides.set(id, env); },
1463
+ });
1464
+ await pentestPlugin.hooks?.['hover:run:start']?.({ devRoot, broadcast: broadcastPluginEvent });
1465
+ }
1466
+ catch (err) {
1467
+ process.stderr.write(`[hover/qa] pentest compose (run:start) failed: ${err instanceof Error ? err.message : String(err)}\n`);
1468
+ }
1469
+ }
1470
+ // Screenshot previews: this run's MCP output dir (same path buildMcpConfig
1471
+ // uses) + a flag tracking whether the last tool_use was a screenshot, so
1472
+ // we can surface the freshly-written png to the chat as a tool_result lands.
1473
+ // (runShotDir is the run folder's screenshots/, computed at run start.)
1474
+ let pendingShot = null;
1475
+ let lastShotPath = null;
877
1476
  const runResult = await runSession({
878
1477
  agentId: invokedAgentId,
879
1478
  prompt: text,
880
1479
  sessionId: resumeSessionId,
881
1480
  mcpConfig,
882
- // cwd = devRoot so the agent runs against the project (and Claude
883
- // Code reads its CLAUDE.md, if any).
884
- cwd: devRoot,
1481
+ // Memory setting: "shared" (default) → cwd = devRoot, so the agent
1482
+ // gets the project's CLAUDE.md + Claude Code auto-memory. "isolated"
1483
+ // → a throwaway temp cwd, so NONE of the user's CLAUDE.md / memory
1484
+ // leaks into the test agent.
1485
+ cwd: msg.payload?.isolateContext === true ? isolatedAgentCwd() : devRoot,
885
1486
  appendSystemPrompt,
886
1487
  // mcp__playwright covers every browser tool; active-mode plugin MCP
887
1488
  // servers are appended. (Save-as-Skill retired → no Skill tool.)
888
1489
  allowedToolsExtra: activePluginMcpIds,
1490
+ // Grounded-actuation modes: deny the Playwright interaction tools so the agent
1491
+ // must use the grounded mcp__hover-control__* actuation tools.
1492
+ disallowedToolsExtra: groundedActuation ? GROUNDED_ACTUATION_DENY : undefined,
889
1493
  maxBudgetUsd,
890
- model,
891
- apiKey: currentApiKey,
1494
+ // QA runs are bounded by the chosen intensity's STEP ceiling
1495
+ // (--max-turns); the prompt paces against the same number.
1496
+ maxTurns: runMode === 'qa' ? QA_INTENSITY[runIntensity].maxSteps : undefined,
1497
+ model: effectiveModel,
1498
+ effort: currentEffort,
1499
+ // BYOK: inject the protocol's auth env (key + base URL) into the
1500
+ // matching CLI. Otherwise, Local LLM (qwen host): point qwen at the
1501
+ // user's OpenAI-compatible endpoint via env (the endpoint's key, if
1502
+ // any, falls back to the ambient OPENAI_API_KEY / a placeholder).
1503
+ env: currentByok
1504
+ ? byokEnvFor(currentByok)
1505
+ : invokedAgentId === 'qwen' && currentLocalBaseUrl
1506
+ ? { OPENAI_BASE_URL: currentLocalBaseUrl, OPENAI_API_KEY: process.env.OPENAI_API_KEY || 'local' }
1507
+ : undefined,
892
1508
  signal: run.abort.signal,
893
1509
  }, (ev) => {
1510
+ // Cost/turns/tokens for the session ledger ride the session_end
1511
+ // event — snoop them off the stream. Also track the running `usage`
1512
+ // totals so an aborted/errored run still records partial spend.
1513
+ if (ev.kind === 'session_end') {
1514
+ sessionEnd = { turns: ev.turns, costUsd: ev.costUsd, tokens: ev.tokens };
1515
+ // Structured-first: parse the agent's JSON findings block, hand the
1516
+ // editor the clean summary + structured findings (so the Findings
1517
+ // card renders from data, not a Markdown scrape). All modes.
1518
+ if (typeof ev.summary === 'string' && ev.summary) {
1519
+ const parsed = parseFindings(ev.summary);
1520
+ runParsed = parsed; // keep for the ledger record + QA report
1521
+ ev.summary = parsed.summary;
1522
+ ev.findings = parsed.findings;
1523
+ }
1524
+ }
1525
+ else if (ev.kind === 'usage') {
1526
+ sessionEnd = {
1527
+ turns: ev.turns ?? sessionEnd.turns,
1528
+ costUsd: ev.costUsd ?? sessionEnd.costUsd,
1529
+ tokens: ev.tokens ?? sessionEnd.tokens,
1530
+ };
1531
+ }
1532
+ // Screenshot preview: a take_screenshot tool_use writes a png by the
1533
+ // time its tool_result lands — resolve the freshest png in the run's
1534
+ // output dir and surface it to the chat. Best-effort, never throws.
1535
+ if (ev.kind === 'tool_use') {
1536
+ const bare = String(ev.tool ?? '').replace(/^mcp__.*?__/, '');
1537
+ // browser_take_screenshot (Playwright, plugin modes) OR take_screenshot
1538
+ // (hover-control, grounded modes — viewport only, never resizes the
1539
+ // page). Both write a PNG into the run's shot dir; we surface the
1540
+ // freshest one in the chat.
1541
+ if (bare === 'browser_take_screenshot' || bare === 'take_screenshot') {
1542
+ // browser_take_screenshot may be full-page; take_screenshot is
1543
+ // always viewport. Carry `full` so the chat can collapse a
1544
+ // full+viewport burst and keep the full-page one.
1545
+ pendingShot = { full: Boolean(ev.input?.fullPage) };
1546
+ }
1547
+ }
1548
+ else if (ev.kind === 'tool_result' && pendingShot) {
1549
+ const full = pendingShot.full;
1550
+ pendingShot = null;
1551
+ const shot = newestPng(runShotDir);
1552
+ // Dedupe exact repeats by path (a duplicated tool_use/result resolves
1553
+ // to the same freshest png); distinct full/viewport shots have
1554
+ // distinct paths and are coalesced downstream by the chat instead.
1555
+ if (shot && shot !== lastShotPath && !run.cancelled) {
1556
+ lastShotPath = shot;
1557
+ emitToRun({ type: 'screenshot', payload: { path: shot, full } });
1558
+ }
1559
+ }
894
1560
  // Stream to whichever ws is attached NOW — survives the widget
895
1561
  // reconnecting mid-run (emitToRun is a no-op during a reconnect gap).
896
1562
  if (run.cancelled)
897
1563
  return;
898
1564
  emitToRun({ type: 'event', payload: ev });
899
1565
  });
900
- // Re-record: write a fresh spec from the steps runSession accumulated
901
- // (`user` `step`* `done`). Only on a clean, non-cancelled finish —
902
- // a cancelled/aborted run throws out of runSession into the catch
903
- // below, and an errored agent leaves the original spec untouched.
904
- if (reRecordSlug && !run.cancelled) {
905
- if (runResult.isError) {
906
- emitToRun({
907
- type: 'error',
908
- payload: {
909
- message: `Re-record failed: ${runResult.summary || 'agent reported an error'}. ` +
910
- `Original spec left unchanged.`,
911
- },
912
- });
913
- }
914
- else {
915
- try {
916
- const { writeSpec } = await import('./specs/writeSpec.js');
917
- const written = await writeSpec({
918
- devRoot,
919
- name: reRecordSlug,
920
- steps: runResult.steps,
921
- overwrite: true,
922
- });
923
- emitToRun({
924
- type: 'spec-saved',
925
- payload: { name: reRecordSlug, path: written.path },
926
- });
927
- }
928
- catch (e) {
929
- const m = e instanceof Error ? e.message : String(e);
930
- emitToRun({
931
- type: 'error',
932
- payload: { message: `Re-record could not write spec: ${m}` },
933
- });
1566
+ // Append to the `.hover/sessions/` ledger (best-effort, never throws).
1567
+ // `saved`/`specSlug` are patched in later by markSessionSaved when the
1568
+ // user crystallizes — save-spec arrives as a separate WS message.
1569
+ await recordSession(run.cancelled ? 'aborted' : runResult.isError ? 'error' : 'completed', runResult.steps.filter((s) => s.kind === 'step').length, {
1570
+ summary: runResult.summary,
1571
+ errorReason: runResult.isError ? runResult.summary : undefined,
1572
+ steps: runResult.steps,
1573
+ });
1574
+ // QA Stage 4: resolve the agent's recorded candidate flows to their real
1575
+ // recorded steps and offer them as one-click "Crystallize" cards. Steps
1576
+ // are the actual hover-control actuations (record==replay), so each
1577
+ // candidate crystallizes to a clean, runnable spec. Candidates are
1578
+ // functional regression artifacts — the pentest phase produces a findings
1579
+ // report, not specs, so it never offers them (and avoids duplicating the
1580
+ // verify phase's candidates).
1581
+ if (runMode === 'qa' && !pentestActiveThisRun && !run.cancelled) {
1582
+ // Fallback: the agent may finish a clean flow but never call
1583
+ // record_candidate (compliance is unreliable, esp. on short directed
1584
+ // tasks). If it recorded none, offer the whole completed run's grounded
1585
+ // actuations as ONE candidate — crystallization shouldn't depend on the
1586
+ // agent remembering to mark it. Deterministic; the user renames at the
1587
+ // Crystallize prompt. Skipped on error runs and when nothing was acted.
1588
+ if (runCandidates.length === 0 && !runResult.isError) {
1589
+ const grounded = runResult.steps.filter(isCrystallizableStep);
1590
+ if (grounded.some(isRealAction)) {
1591
+ runCandidates.push({ name: 'Recorded flow', steps: grounded });
934
1592
  }
935
1593
  }
1594
+ const resolved = finalizeCandidates(runCandidates);
1595
+ if (resolved.length)
1596
+ emitToRun({ type: 'qa-candidates', payload: { candidates: resolved } });
1597
+ }
1598
+ // Forward a recon-discovered reset recipe to the extension (it owns
1599
+ // .hover/environments.json), keyed to this run's env. The extension
1600
+ // persists it onto the env record (piece C); harmless if unhandled.
1601
+ if (runResetRecipe && runEnv && !run.cancelled) {
1602
+ emitToRun({ type: 'reset-recipe', payload: { envId: runEnv.id, recipe: runResetRecipe } });
936
1603
  }
937
1604
  }
938
1605
  catch (err) {
@@ -950,19 +1617,38 @@ export async function startService(opts) {
950
1617
  summary: message,
951
1618
  };
952
1619
  emitToRun({ type: 'event', payload: errorEvent });
1620
+ await recordSession('error', 0, { errorReason: message });
953
1621
  // Force the next command to re-probe CDP. The error could be from
954
1622
  // Chrome dying, MCP spawning a stray Chromium, the user closing
955
1623
  // their debug window — anything that would make a cached "all
956
1624
  // healthy" result lie.
957
1625
  invalidatePreflight(cdpUrl);
958
1626
  }
1627
+ else {
1628
+ // User-initiated cancel — still worth a ledger row (spend view).
1629
+ await recordSession('aborted', 0, { errorReason: 'Cancelled by the user.' });
1630
+ }
959
1631
  }
960
1632
  finally {
961
1633
  if (run.graceTimer)
962
1634
  clearTimeout(run.graceTimer);
963
1635
  activeRun = null;
964
1636
  }
965
- });
1637
+ // QA two-pass: a verify run with a pentest phase queued behind it. Now that
1638
+ // this (verify) run has finished and activeRun is clear, kick off the
1639
+ // pentest phase as a fresh re-entry — UNLESS the user cancelled. Each phase
1640
+ // is its own agent session (fresh context), so this is the token-cheap way
1641
+ // to sequence them; the pentest phase carries __phase2 so it can't re-split.
1642
+ if (pendingPhase2 && !run.cancelled) {
1643
+ const next = pendingPhase2;
1644
+ pendingPhase2 = null;
1645
+ void onClientMessage(Buffer.from(JSON.stringify(next)));
1646
+ }
1647
+ else {
1648
+ pendingPhase2 = null;
1649
+ }
1650
+ };
1651
+ ws.on('message', onClientMessage);
966
1652
  });
967
1653
  // ───────────────────────── service:start + single Chrome ─────────────────
968
1654
  // Fire plugin `hover:service:start` hooks BEFORE launching Chrome, so a
@@ -1006,6 +1692,7 @@ export async function startService(opts) {
1006
1692
  url: launchUrl,
1007
1693
  port: launchPort,
1008
1694
  proxy: residentChromeProxy ?? undefined,
1695
+ userDataDir,
1009
1696
  })
1010
1697
  .then((r) => {
1011
1698
  if (!r.ok) {
@@ -1060,6 +1747,26 @@ export async function startService(opts) {
1060
1747
  await new Promise((res, rej) => {
1061
1748
  wss.close(err => (err ? rej(err) : res()));
1062
1749
  });
1750
+ // Multi-host model: a per-session host owns its own Chrome (distinct
1751
+ // userDataDir + CDP port). Tear that Chrome down with the host so the
1752
+ // slot's CDP port frees up and a session reusing the slot gets a fresh
1753
+ // browser — not the previous session's logged-in profile. The legacy
1754
+ // single-Chrome model (no userDataDir) deliberately leaves its Chrome
1755
+ // running, reused across runs / dev-server restarts.
1756
+ if (userDataDir) {
1757
+ const launchPort = (() => {
1758
+ try {
1759
+ return Number(new URL(cdpUrl).port) || 9222;
1760
+ }
1761
+ catch {
1762
+ return 9222;
1763
+ }
1764
+ })();
1765
+ try {
1766
+ await closeDebugChrome(launchPort);
1767
+ }
1768
+ catch { /* best-effort */ }
1769
+ }
1063
1770
  },
1064
1771
  };
1065
1772
  }