@hover-dev/core 0.15.0 → 0.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (169) hide show
  1. package/README.md +26 -55
  2. package/dist/agentDirectives.d.ts +55 -0
  3. package/dist/agentDirectives.d.ts.map +1 -0
  4. package/dist/agentDirectives.js +276 -0
  5. package/dist/agents/claude.d.ts.map +1 -1
  6. package/dist/agents/claude.js +28 -3
  7. package/dist/agents/codex.d.ts.map +1 -1
  8. package/dist/agents/codex.js +38 -18
  9. package/dist/agents/gemini.d.ts.map +1 -1
  10. package/dist/agents/gemini.js +3 -14
  11. package/dist/agents/invoke.d.ts.map +1 -1
  12. package/dist/agents/invoke.js +3 -6
  13. package/dist/agents/qwen.d.ts.map +1 -1
  14. package/dist/agents/qwen.js +3 -14
  15. package/dist/agents/registry.d.ts.map +1 -1
  16. package/dist/agents/registry.js +0 -4
  17. package/dist/agents/shared.d.ts +28 -0
  18. package/dist/agents/shared.d.ts.map +1 -0
  19. package/dist/agents/shared.js +35 -0
  20. package/dist/agents/types.d.ts +19 -11
  21. package/dist/agents/types.d.ts.map +1 -1
  22. package/dist/engine.d.ts +53 -0
  23. package/dist/engine.d.ts.map +1 -0
  24. package/dist/engine.js +78 -0
  25. package/dist/mcp/actuateServer.d.ts +3 -0
  26. package/dist/mcp/actuateServer.d.ts.map +1 -0
  27. package/dist/mcp/actuateServer.js +594 -0
  28. package/dist/mcp/sourceFence.d.ts +23 -0
  29. package/dist/mcp/sourceFence.d.ts.map +1 -0
  30. package/dist/mcp/sourceFence.js +79 -0
  31. package/dist/mcp/sourceServer.d.ts +3 -0
  32. package/dist/mcp/sourceServer.d.ts.map +1 -0
  33. package/dist/mcp/sourceServer.js +191 -0
  34. package/dist/memory/businessMemory.d.ts +29 -0
  35. package/dist/memory/businessMemory.d.ts.map +1 -0
  36. package/dist/memory/businessMemory.js +125 -0
  37. package/dist/modes.d.ts +39 -0
  38. package/dist/modes.d.ts.map +1 -0
  39. package/dist/modes.js +34 -0
  40. package/dist/playwright/cdpStatus.d.ts +0 -15
  41. package/dist/playwright/cdpStatus.d.ts.map +1 -1
  42. package/dist/playwright/cdpStatus.js +0 -67
  43. package/dist/playwright/launchChrome.d.ts +18 -0
  44. package/dist/playwright/launchChrome.d.ts.map +1 -1
  45. package/dist/playwright/launchChrome.js +46 -3
  46. package/dist/playwright/preflight.d.ts.map +1 -1
  47. package/dist/playwright/preflight.js +6 -1
  48. package/dist/playwright/resolveMcpConfig.d.ts +12 -0
  49. package/dist/playwright/resolveMcpConfig.d.ts.map +1 -1
  50. package/dist/playwright/resolveMcpConfig.js +36 -5
  51. package/dist/plugin-api.d.ts +35 -26
  52. package/dist/plugin-api.d.ts.map +1 -1
  53. package/dist/plugin-api.js +2 -2
  54. package/dist/qa/candidates.d.ts +32 -0
  55. package/dist/qa/candidates.d.ts.map +1 -0
  56. package/dist/qa/candidates.js +20 -0
  57. package/dist/qa/classify.d.ts +38 -0
  58. package/dist/qa/classify.d.ts.map +1 -0
  59. package/dist/qa/classify.js +138 -0
  60. package/dist/qa/intensity.d.ts +33 -0
  61. package/dist/qa/intensity.d.ts.map +1 -0
  62. package/dist/qa/intensity.js +25 -0
  63. package/dist/qa/qaReport.d.ts +19 -0
  64. package/dist/qa/qaReport.d.ts.map +1 -0
  65. package/dist/qa/qaReport.js +50 -0
  66. package/dist/runSession.d.ts +14 -3
  67. package/dist/runSession.d.ts.map +1 -1
  68. package/dist/runSession.js +31 -11
  69. package/dist/service/cdpHandlers.d.ts +3 -27
  70. package/dist/service/cdpHandlers.d.ts.map +1 -1
  71. package/dist/service/cdpHandlers.js +6 -53
  72. package/dist/service/cdpHint.d.ts +21 -28
  73. package/dist/service/cdpHint.d.ts.map +1 -1
  74. package/dist/service/cdpHint.js +106 -164
  75. package/dist/service/relayHandlers.d.ts +28 -0
  76. package/dist/service/relayHandlers.d.ts.map +1 -0
  77. package/dist/service/relayHandlers.js +105 -0
  78. package/dist/service/saveHandlers.d.ts +1 -3
  79. package/dist/service/saveHandlers.d.ts.map +1 -1
  80. package/dist/service/saveHandlers.js +17 -15
  81. package/dist/service/types.d.ts +108 -8
  82. package/dist/service/types.d.ts.map +1 -1
  83. package/dist/service.d.ts +13 -3
  84. package/dist/service.d.ts.map +1 -1
  85. package/dist/service.js +1022 -236
  86. package/dist/sessions/sessions.d.ts +125 -0
  87. package/dist/sessions/sessions.d.ts.map +1 -0
  88. package/dist/sessions/sessions.js +175 -0
  89. package/dist/specs/authFixture.d.ts +30 -0
  90. package/dist/specs/authFixture.d.ts.map +1 -0
  91. package/dist/specs/authFixture.js +145 -0
  92. package/dist/specs/businessMap.d.ts +29 -0
  93. package/dist/specs/businessMap.d.ts.map +1 -0
  94. package/dist/specs/businessMap.js +95 -0
  95. package/dist/specs/detectSharedFlows.d.ts +1 -1
  96. package/dist/specs/detectSharedFlows.d.ts.map +1 -1
  97. package/dist/specs/detectSharedFlows.js +20 -21
  98. package/dist/specs/generatePageObject.d.ts +1 -1
  99. package/dist/specs/generatePageObject.d.ts.map +1 -1
  100. package/dist/specs/healPrompt.d.ts +19 -0
  101. package/dist/specs/healPrompt.d.ts.map +1 -0
  102. package/dist/specs/healPrompt.js +48 -0
  103. package/dist/specs/humanSteps.d.ts +4 -8
  104. package/dist/specs/humanSteps.d.ts.map +1 -1
  105. package/dist/specs/humanSteps.js +6 -1
  106. package/dist/specs/optimizeSpec.d.ts +15 -8
  107. package/dist/specs/optimizeSpec.d.ts.map +1 -1
  108. package/dist/specs/optimizeSpec.js +98 -46
  109. package/dist/specs/optimizeSpecWithAgent.d.ts +0 -2
  110. package/dist/specs/optimizeSpecWithAgent.d.ts.map +1 -1
  111. package/dist/specs/optimizeSpecWithAgent.js +0 -1
  112. package/dist/specs/pageObjectManifest.d.ts +3 -1
  113. package/dist/specs/pageObjectManifest.d.ts.map +1 -1
  114. package/dist/specs/pageObjectManifest.js +13 -9
  115. package/dist/specs/replayGrounded.d.ts +45 -0
  116. package/dist/specs/replayGrounded.d.ts.map +1 -0
  117. package/dist/specs/replayGrounded.js +155 -0
  118. package/dist/specs/runFailures.d.ts +34 -0
  119. package/dist/specs/runFailures.d.ts.map +1 -0
  120. package/dist/specs/runFailures.js +93 -0
  121. package/dist/specs/seeds.d.ts +16 -15
  122. package/dist/specs/seeds.d.ts.map +1 -1
  123. package/dist/specs/seeds.js +86 -54
  124. package/dist/specs/sidecar.d.ts +34 -6
  125. package/dist/specs/sidecar.d.ts.map +1 -1
  126. package/dist/specs/sidecar.js +79 -9
  127. package/dist/specs/softBatch.d.ts +14 -0
  128. package/dist/specs/softBatch.d.ts.map +1 -0
  129. package/dist/specs/softBatch.js +177 -0
  130. package/dist/specs/specStep.d.ts +21 -0
  131. package/dist/specs/specStep.d.ts.map +1 -0
  132. package/dist/specs/specStep.js +1 -0
  133. package/dist/specs/text.d.ts +19 -0
  134. package/dist/specs/text.d.ts.map +1 -0
  135. package/dist/specs/text.js +27 -0
  136. package/dist/specs/writeSpec.d.ts +62 -1
  137. package/dist/specs/writeSpec.d.ts.map +1 -1
  138. package/dist/specs/writeSpec.js +598 -30
  139. package/package.json +10 -10
  140. package/dist/agents/aider.d.ts +0 -16
  141. package/dist/agents/aider.d.ts.map +0 -1
  142. package/dist/agents/aider.js +0 -169
  143. package/dist/agents/cursor.d.ts +0 -18
  144. package/dist/agents/cursor.d.ts.map +0 -1
  145. package/dist/agents/cursor.js +0 -229
  146. package/dist/playwright/raiseWindow.d.ts +0 -10
  147. package/dist/playwright/raiseWindow.d.ts.map +0 -1
  148. package/dist/playwright/raiseWindow.js +0 -139
  149. package/dist/scripts/bench-multi-tab.d.ts +0 -2
  150. package/dist/scripts/bench-multi-tab.d.ts.map +0 -1
  151. package/dist/scripts/bench-multi-tab.js +0 -192
  152. package/dist/scripts/bench-ttfb.d.ts +0 -2
  153. package/dist/scripts/bench-ttfb.d.ts.map +0 -1
  154. package/dist/scripts/bench-ttfb.js +0 -127
  155. package/dist/scripts/start-chrome.d.ts +0 -3
  156. package/dist/scripts/start-chrome.d.ts.map +0 -1
  157. package/dist/scripts/start-chrome.js +0 -23
  158. package/dist/skills/writeSkill.d.ts +0 -27
  159. package/dist/skills/writeSkill.d.ts.map +0 -1
  160. package/dist/skills/writeSkill.js +0 -13
  161. package/dist/specs/listSpecs.d.ts +0 -52
  162. package/dist/specs/listSpecs.d.ts.map +0 -1
  163. package/dist/specs/listSpecs.js +0 -139
  164. package/dist/specs/optimizationSuggestion.d.ts +0 -26
  165. package/dist/specs/optimizationSuggestion.d.ts.map +0 -1
  166. package/dist/specs/optimizationSuggestion.js +0 -28
  167. package/dist/specs/writeCaseCsv.d.ts +0 -28
  168. package/dist/specs/writeCaseCsv.d.ts.map +0 -1
  169. package/dist/specs/writeCaseCsv.js +0 -140
@@ -0,0 +1,25 @@
1
+ export const QA_INTENSITY = {
2
+ quick: { label: 'Quick', maxSteps: 45, blurb: 'a fast pass over the main flows — breadth over depth (~20–45 steps)' },
3
+ standard: { label: 'Standard', maxSteps: 150, blurb: 'the main flows plus key negative tests (~45–150 steps)' },
4
+ deep: { label: 'Deep', maxSteps: 500, blurb: 'exhaustive — every reachable control and state (~150–500 steps)' },
5
+ };
6
+ export const DEFAULT_QA_INTENSITY = 'standard';
7
+ /** Coerce arbitrary input (from the run payload) to a valid intensity. */
8
+ export function asQaIntensity(v) {
9
+ return v === 'quick' || v === 'deep' || v === 'standard' ? v : DEFAULT_QA_INTENSITY;
10
+ }
11
+ /**
12
+ * Prompt directive: tell the agent its STEP budget so it paces and ALWAYS wraps
13
+ * up with a report before the ceiling. The `--max-turns` backstop is the hard
14
+ * limit; this prose is what guarantees a report.
15
+ */
16
+ export function qaBudgetDirective(intensity) {
17
+ const spec = QA_INTENSITY[intensity];
18
+ const wrapAt = Math.max(5, spec.maxSteps - Math.ceil(spec.maxSteps * 0.1));
19
+ return (`RUN BUDGET — ${spec.label}: ${spec.blurb}. You have about ${spec.maxSteps} steps ` +
20
+ `(tool actions) this run, enforced. Pace yourself to fit: cover the most ` +
21
+ `important flows FIRST. By roughly step ${wrapAt}, STOP exploring and ` +
22
+ `immediately WRITE YOUR FINDINGS REPORT (and record any clean candidate flows) ` +
23
+ `while you still can — never end a run without a report. On Quick, be decisive ` +
24
+ `and favour breadth; on Deep, be exhaustive.`);
25
+ }
@@ -0,0 +1,19 @@
1
+ import type { SessionFinding } from '../sessions/sessions.js';
2
+ export interface QaReportInput {
3
+ prompt: string;
4
+ summary: string;
5
+ findings: SessionFinding[];
6
+ endedAt: string;
7
+ targetUrl?: string;
8
+ }
9
+ /** Render the report Markdown (pure — exported for testing). */
10
+ export declare function renderQaReport(input: QaReportInput): string;
11
+ /** Write the QA report into the run's folder as `report.md`. Each run (incl.
12
+ * each phase of a two-pass run) has its own folder, so there's no name
13
+ * collision. NEVER throws; returns the path or an error string. */
14
+ export declare function writeQaReport(runDirPath: string, input: QaReportInput): Promise<{
15
+ path: string;
16
+ } | {
17
+ error: string;
18
+ }>;
19
+ //# sourceMappingURL=qaReport.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"qaReport.d.ts","sourceRoot":"","sources":["../../src/qa/qaReport.ts"],"names":[],"mappings":"AAcA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,yBAAyB,CAAC;AAE9D,MAAM,WAAW,aAAa;IAC5B,MAAM,EAAE,MAAM,CAAC;IACf,OAAO,EAAE,MAAM,CAAC;IAChB,QAAQ,EAAE,cAAc,EAAE,CAAC;IAC3B,OAAO,EAAE,MAAM,CAAC;IAChB,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB;AAED,gEAAgE;AAChE,wBAAgB,cAAc,CAAC,KAAK,EAAE,aAAa,GAAG,MAAM,CAkB3D;AAED;;oEAEoE;AACpE,wBAAsB,aAAa,CACjC,UAAU,EAAE,MAAM,EAClB,KAAK,EAAE,aAAa,GACnB,OAAO,CAAC;IAAE,IAAI,EAAE,MAAM,CAAA;CAAE,GAAG;IAAE,KAAK,EAAE,MAAM,CAAA;CAAE,CAAC,CAS/C"}
@@ -0,0 +1,50 @@
1
+ /**
2
+ * QA report artifact — the durable, human-readable output of a QA Testing run.
3
+ *
4
+ * QA is report-first: a run produces findings (rendered live in the chat's
5
+ * Findings card via the normal parseFindings pipeline) AND this persistent
6
+ * Markdown report written as `report.md` inside the run's own folder, mirroring
7
+ * pentest's report file. Each run (incl. each phase of a two-pass run) has its
8
+ * own folder, so reports never collide (the session ledger keeps the full history; this is the readable artifact).
9
+ *
10
+ * Best-effort by contract: a report-write failure must NEVER break a run or the
11
+ * ledger (same rule as the session ledger + business memory).
12
+ */
13
+ import { mkdir, writeFile } from 'node:fs/promises';
14
+ import { join } from 'node:path';
15
+ /** Render the report Markdown (pure — exported for testing). */
16
+ export function renderQaReport(input) {
17
+ const { prompt, summary, findings, endedAt, targetUrl } = input;
18
+ const meta = [endedAt, targetUrl, `${findings.length} finding${findings.length === 1 ? '' : 's'}`]
19
+ .filter(Boolean)
20
+ .join(' · ');
21
+ const body = [`# QA report — ${prompt.trim()}`, '', `_${meta}_`];
22
+ if (summary.trim())
23
+ body.push('', summary.trim());
24
+ body.push('', '## Findings');
25
+ if (findings.length) {
26
+ for (const f of findings) {
27
+ const sev = (f.severity || 'note').trim();
28
+ const head = f.title && f.title !== f.text ? `${f.title} — ` : '';
29
+ body.push(`- **${sev}** — ${head}${f.text.trim()}`);
30
+ }
31
+ }
32
+ else {
33
+ body.push('_No issues found._');
34
+ }
35
+ return body.join('\n') + '\n';
36
+ }
37
+ /** Write the QA report into the run's folder as `report.md`. Each run (incl.
38
+ * each phase of a two-pass run) has its own folder, so there's no name
39
+ * collision. NEVER throws; returns the path or an error string. */
40
+ export async function writeQaReport(runDirPath, input) {
41
+ try {
42
+ await mkdir(runDirPath, { recursive: true });
43
+ const path = join(runDirPath, 'report.md');
44
+ await writeFile(path, renderQaReport(input), 'utf-8');
45
+ return { path };
46
+ }
47
+ catch (err) {
48
+ return { error: err instanceof Error ? err.message : String(err) };
49
+ }
50
+ }
@@ -1,5 +1,5 @@
1
1
  import type { InvokeEvent } from './agents/types.js';
2
- import type { SkillStep } from './skills/writeSkill.js';
2
+ import type { SkillStep } from './specs/specStep.js';
3
3
  export interface RunSessionOptions {
4
4
  prompt: string;
5
5
  agentId: string;
@@ -7,9 +7,14 @@ export interface RunSessionOptions {
7
7
  * is supplied (the service passes a pre-built config; the CLI passes this). */
8
8
  cdpUrl?: string;
9
9
  model?: string;
10
+ /** Reasoning-effort level forwarded to the agent (claude --effort / codex
11
+ * -c model_reasoning_effort). Undefined = agent/model default. */
12
+ effort?: string;
13
+ /** Extra env for the spawned CLI (Local LLM: OPENAI_BASE_URL / _API_KEY). */
14
+ env?: Record<string, string>;
10
15
  maxBudgetUsd?: number;
11
- /** Optional model API key, injected into the spawned CLI's env. */
12
- apiKey?: string;
16
+ /** Hard ceiling on agent turns (~steps) — the QA intensity step budget. */
17
+ maxTurns?: number;
13
18
  /** Agent cwd (project root) — where Claude Code reads CLAUDE.md and where a
14
19
  * `--save` / re-record writes the spec. Defaults to the process cwd. */
15
20
  cwd?: string;
@@ -23,6 +28,12 @@ export interface RunSessionOptions {
23
28
  /** Extra hard-sandbox allow-list prefixes — e.g. active-mode plugin MCP
24
29
  * server ids the service contributes. Appended to ['mcp__playwright']. */
25
30
  allowedToolsExtra?: string[];
31
+ /** Extra hard-sandbox deny entries — specific tools to forbid even though
32
+ * their server is allowed. Normal mode passes the Playwright interaction
33
+ * tools (browser_click / _type / _fill_form / _select_option) here so the
34
+ * agent must use the grounded mcp__hover-control__* actuation tools, whose
35
+ * role+name selectors crystallize 1:1 instead of confabulating getByText. */
36
+ disallowedToolsExtra?: string[];
26
37
  /** Appended to the agent's system prompt (the service folds in cdpHint +
27
38
  * conventions + plugin additions + a language directive; the CLI omits it). */
28
39
  appendSystemPrompt?: string;
@@ -1 +1 @@
1
- {"version":3,"file":"runSession.d.ts","sourceRoot":"","sources":["../src/runSession.ts"],"names":[],"mappings":"AAoBA,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AACrD,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,wBAAwB,CAAC;AAGxD,MAAM,WAAW,iBAAiB;IAChC,MAAM,EAAE,MAAM,CAAC;IACf,OAAO,EAAE,MAAM,CAAC;IAChB;oFACgF;IAChF,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,mEAAmE;IACnE,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB;6EACyE;IACzE,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,kEAAkE;IAClE,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,MAAM,CAAC,EAAE,WAAW,CAAC;IACrB;;yCAEqC;IACrC,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB;+EAC2E;IAC3E,iBAAiB,CAAC,EAAE,MAAM,EAAE,CAAC;IAC7B;oFACgF;IAChF,kBAAkB,CAAC,EAAE,MAAM,CAAC;IAC5B,2DAA2D;IAC3D,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB;AAED,MAAM,WAAW,gBAAgB;IAC/B;mCAC+B;IAC/B,KAAK,EAAE,SAAS,EAAE,CAAC;IACnB,yCAAyC;IACzC,OAAO,EAAE,MAAM,CAAC;IAChB,qDAAqD;IACrD,OAAO,EAAE,OAAO,CAAC;CAClB;AAED,wBAAsB,UAAU,CAC9B,IAAI,EAAE,iBAAiB,EACvB,OAAO,EAAE,CAAC,EAAE,EAAE,WAAW,KAAK,IAAI,GACjC,OAAO,CAAC,gBAAgB,CAAC,CAuD3B"}
1
+ {"version":3,"file":"runSession.d.ts","sourceRoot":"","sources":["../src/runSession.ts"],"names":[],"mappings":"AAoBA,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AACrD,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,qBAAqB,CAAC;AAGrD,MAAM,WAAW,iBAAiB;IAChC,MAAM,EAAE,MAAM,CAAC;IACf,OAAO,EAAE,MAAM,CAAC;IAChB;oFACgF;IAChF,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf;uEACmE;IACnE,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,6EAA6E;IAC7E,GAAG,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAC7B,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,uEAAuE;IACvE,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB;6EACyE;IACzE,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,kEAAkE;IAClE,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,MAAM,CAAC,EAAE,WAAW,CAAC;IACrB;;yCAEqC;IACrC,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB;+EAC2E;IAC3E,iBAAiB,CAAC,EAAE,MAAM,EAAE,CAAC;IAC7B;;;;kFAI8E;IAC9E,oBAAoB,CAAC,EAAE,MAAM,EAAE,CAAC;IAChC;oFACgF;IAChF,kBAAkB,CAAC,EAAE,MAAM,CAAC;IAC5B,2DAA2D;IAC3D,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB;AAED,MAAM,WAAW,gBAAgB;IAC/B;mCAC+B;IAC/B,KAAK,EAAE,SAAS,EAAE,CAAC;IACnB,yCAAyC;IACzC,OAAO,EAAE,MAAM,CAAC;IAChB,qDAAqD;IACrD,OAAO,EAAE,OAAO,CAAC;CAClB;AAED,wBAAsB,UAAU,CAC9B,IAAI,EAAE,iBAAiB,EACvB,OAAO,EAAE,CAAC,EAAE,EAAE,WAAW,KAAK,IAAI,GACjC,OAAO,CAAC,gBAAgB,CAAC,CAyE3B"}
@@ -27,6 +27,11 @@ export async function runSession(opts, onEvent) {
27
27
  const steps = [{ kind: 'user', text: opts.prompt }];
28
28
  let summary = '';
29
29
  let isError = false;
30
+ // Index of the most recently captured tool step, so the tool_result that
31
+ // follows can mark whether that action errored. Without this, every captured
32
+ // step looks successful and the agent's failed exploration attempts get
33
+ // crystallized into the spec as if they were real flow.
34
+ let lastStepIdx = -1;
30
35
  const mcpConfig = opts.mcpConfig ??
31
36
  resolveMcpConfig({
32
37
  cdpUrl: opts.cdpUrl ?? 'http://localhost:9222',
@@ -43,25 +48,35 @@ export async function runSession(opts, onEvent) {
43
48
  mcpConfig,
44
49
  cwd: opts.cwd,
45
50
  appendSystemPrompt: opts.appendSystemPrompt,
46
- // Hard sandbox: only Playwright MCP (+ any active-mode plugin servers) is
47
- // callable, every built-in tool denied — a hijacked prompt can't reach the
48
- // shell or filesystem. Soft agents (codex, …) enforce their own sandbox via
49
- // buildArgs, so the lists stay undefined for them — exactly what the
50
- // service does.
51
- allowedTools: isHardSandbox
52
- ? ['mcp__playwright', ...(opts.allowedToolsExtra ?? [])]
53
- : undefined,
51
+ // The allowed-tool set (Playwright MCP + the active mode's plugin servers:
52
+ // hover-control, api-test flows, source reader, …) is the SAME for every
53
+ // agent: hard-sandbox agents enforce it via --allowedTools; soft agents
54
+ // (codex) surface it in their developer_instructions so they don't
55
+ // self-restrict to Playwright and refuse the plugin tools (e.g. api_request).
56
+ // The DISallow list is hard-sandbox only (soft agents can't enforce it).
57
+ allowedTools: ['mcp__playwright', ...(opts.allowedToolsExtra ?? [])],
54
58
  disallowedTools: isHardSandbox
55
- ? (descriptor?.defaultDisallowedTools ? [...descriptor.defaultDisallowedTools] : undefined)
59
+ ? [...(descriptor?.defaultDisallowedTools ?? []), ...(opts.disallowedToolsExtra ?? [])]
56
60
  : undefined,
57
61
  maxBudgetUsd: opts.maxBudgetUsd,
62
+ maxTurns: opts.maxTurns,
58
63
  model: opts.model,
59
- apiKey: opts.apiKey,
64
+ effort: opts.effort,
65
+ env: opts.env,
60
66
  signal: opts.signal,
61
67
  })) {
62
68
  onEvent(ev);
63
69
  if (ev.kind === 'tool_use') {
64
- steps.push({ kind: 'step', tool: ev.tool, input: ev.input });
70
+ lastStepIdx = steps.push({ kind: 'step', tool: ev.tool, input: ev.input }) - 1;
71
+ }
72
+ else if (ev.kind === 'tool_result') {
73
+ // Mark the step this result belongs to (the normalized stream emits
74
+ // tool_result right after its tool_use). A failed action stays in the
75
+ // sidecar as part of the full-fidelity record, but writeSpec drops it from
76
+ // the runnable spec so the artifact reflects the working flow, not the agent's
77
+ // trial-and-error.
78
+ if (lastStepIdx >= 0 && ev.isError)
79
+ steps[lastStepIdx].isError = true;
65
80
  }
66
81
  else if (ev.kind === 'session_end') {
67
82
  if (ev.summary)
@@ -70,6 +85,11 @@ export async function runSession(opts, onEvent) {
70
85
  isError = true;
71
86
  }
72
87
  }
88
+ // On abort (opts.signal), invokeAgent SIGTERMs the child and no session_end
89
+ // arrives, so the error flag above never gets set. Honour the doc contract
90
+ // ("True if the run ended in error or was aborted") by flipping it here.
91
+ if (opts.signal?.aborted)
92
+ isError = true;
73
93
  if (summary)
74
94
  steps.push({ kind: 'done', summary });
75
95
  return { steps, summary, isError };
@@ -1,11 +1,8 @@
1
1
  /**
2
2
  * CDP-related WebSocket message handlers.
3
3
  *
4
- * check-cdp → checkCdpStatus → emit cdp-status
5
4
  * launch-chrome → emit "launching" placeholder → launchDebugChrome →
6
5
  * re-check status → emit cdp-status
7
- * focus-debug → focusDebugTab → no message on success (the widget the
8
- * user is about to focus runs its own check-cdp anyway)
9
6
  *
10
7
  * Extracted from service.ts during the v0.2.x refactor pass so the main
11
8
  * file can be a thin orchestrator.
@@ -14,22 +11,9 @@ import type { WebSocket } from 'ws';
14
11
  import { type LaunchOptions } from '../playwright/launchChrome.js';
15
12
  import { type ClientMessage } from './types.js';
16
13
  /** Extra launch options surfaced from the active mode (security plugin
17
- * needs proxy + spki + separate profile + non-default CDP port). When
18
- * none are set, behaviour is identical to pre-v0.7 normal-mode launch. */
19
- export type LaunchExtras = Pick<LaunchOptions, 'userDataDir' | 'proxy'> & {
20
- /** Override CDP port (mode-specific, e.g. 9333 for security). When set,
21
- * this also wins over the `port` parsed from cdpUrl. */
22
- cdpPort?: number;
23
- };
24
- /**
25
- * "Is this widget running inside the debug Chrome?" The widget asks this on
26
- * connect (and after every status-changing event) so it can render itself as
27
- * either:
28
- * - same-window → normal, drives the page
29
- * - wrong-window → disabled, with a "use the other window" notice
30
- * - no-cdp → enabled but click triggers launch-chrome instead
31
- */
32
- export declare function handleCheckCdp(ws: WebSocket, msg: ClientMessage, cdpUrl: string, extras?: LaunchExtras): Promise<void>;
14
+ * needs a resident proxy + spki). When none are set, behaviour is identical
15
+ * to pre-v0.7 normal-mode launch. */
16
+ export type LaunchExtras = Pick<LaunchOptions, 'proxy' | 'userDataDir'>;
33
17
  /**
34
18
  * Launch a debug Chrome navigated to `pageUrl`, then re-check status. The
35
19
  * re-check usually returns 'wrong-window' (because the widget asking is in
@@ -37,12 +21,4 @@ export declare function handleCheckCdp(ws: WebSocket, msg: ClientMessage, cdpUrl
37
21
  * displays the "use the other window" state.
38
22
  */
39
23
  export declare function handleLaunchChrome(ws: WebSocket, msg: ClientMessage, cdpUrl: string, extras?: LaunchExtras): Promise<void>;
40
- /**
41
- * bringToFront the debug-Chrome tab matching `pageUrl`'s origin (or open one
42
- * if none exists). Used by the wrong-window UI's "switch to debug Chrome"
43
- * button. Doesn't return cdp-status — bringToFront doesn't change anything
44
- * the widget cares about, and the widget the user is about to focus is a
45
- * different page (and will run its own check-cdp on its own ws connection).
46
- */
47
- export declare function handleFocusDebug(ws: WebSocket, msg: ClientMessage, cdpUrl: string, extras?: LaunchExtras): Promise<void>;
48
24
  //# sourceMappingURL=cdpHandlers.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"cdpHandlers.d.ts","sourceRoot":"","sources":["../../src/service/cdpHandlers.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAEH,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,IAAI,CAAC;AAEpC,OAAO,EAAqB,KAAK,aAAa,EAAE,MAAM,+BAA+B,CAAC;AACtF,OAAO,EAAQ,KAAK,aAAa,EAAE,MAAM,YAAY,CAAC;AAEtD;;2EAE2E;AAC3E,MAAM,MAAM,YAAY,GAAG,IAAI,CAAC,aAAa,EAAE,aAAa,GAAG,OAAO,CAAC,GAAG;IACxE;6DACyD;IACzD,OAAO,CAAC,EAAE,MAAM,CAAC;CAClB,CAAC;AAEF;;;;;;;GAOG;AACH,wBAAsB,cAAc,CAClC,EAAE,EAAE,SAAS,EACb,GAAG,EAAE,aAAa,EAClB,MAAM,EAAE,MAAM,EACd,MAAM,CAAC,EAAE,YAAY,GACpB,OAAO,CAAC,IAAI,CAAC,CAWf;AAED;;;;;GAKG;AACH,wBAAsB,kBAAkB,CACtC,EAAE,EAAE,SAAS,EACb,GAAG,EAAE,aAAa,EAClB,MAAM,EAAE,MAAM,EACd,MAAM,CAAC,EAAE,YAAY,GACpB,OAAO,CAAC,IAAI,CAAC,CAkCf;AAED;;;;;;GAMG;AACH,wBAAsB,gBAAgB,CACpC,EAAE,EAAE,SAAS,EACb,GAAG,EAAE,aAAa,EAClB,MAAM,EAAE,MAAM,EACd,MAAM,CAAC,EAAE,YAAY,GACpB,OAAO,CAAC,IAAI,CAAC,CAaf"}
1
+ {"version":3,"file":"cdpHandlers.d.ts","sourceRoot":"","sources":["../../src/service/cdpHandlers.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAEH,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,IAAI,CAAC;AAEpC,OAAO,EAAqB,KAAK,aAAa,EAAE,MAAM,+BAA+B,CAAC;AACtF,OAAO,EAAQ,KAAK,aAAa,EAAE,MAAM,YAAY,CAAC;AAEtD;;sCAEsC;AACtC,MAAM,MAAM,YAAY,GAAG,IAAI,CAAC,aAAa,EAAE,OAAO,GAAG,aAAa,CAAC,CAAC;AAExE;;;;;GAKG;AACH,wBAAsB,kBAAkB,CACtC,EAAE,EAAE,SAAS,EACb,GAAG,EAAE,aAAa,EAClB,MAAM,EAAE,MAAM,EACd,MAAM,CAAC,EAAE,YAAY,GACpB,OAAO,CAAC,IAAI,CAAC,CA+Bf"}
@@ -1,38 +1,15 @@
1
1
  /**
2
2
  * CDP-related WebSocket message handlers.
3
3
  *
4
- * check-cdp → checkCdpStatus → emit cdp-status
5
4
  * launch-chrome → emit "launching" placeholder → launchDebugChrome →
6
5
  * re-check status → emit cdp-status
7
- * focus-debug → focusDebugTab → no message on success (the widget the
8
- * user is about to focus runs its own check-cdp anyway)
9
6
  *
10
7
  * Extracted from service.ts during the v0.2.x refactor pass so the main
11
8
  * file can be a thin orchestrator.
12
9
  */
13
- import { checkCdpStatus, focusDebugTab } from '../playwright/cdpStatus.js';
10
+ import { checkCdpStatus } from '../playwright/cdpStatus.js';
14
11
  import { launchDebugChrome } from '../playwright/launchChrome.js';
15
12
  import { send } from './types.js';
16
- /**
17
- * "Is this widget running inside the debug Chrome?" The widget asks this on
18
- * connect (and after every status-changing event) so it can render itself as
19
- * either:
20
- * - same-window → normal, drives the page
21
- * - wrong-window → disabled, with a "use the other window" notice
22
- * - no-cdp → enabled but click triggers launch-chrome instead
23
- */
24
- export async function handleCheckCdp(ws, msg, cdpUrl, extras) {
25
- const pageUrl = msg.payload?.pageUrl;
26
- if (typeof pageUrl !== 'string' || !pageUrl) {
27
- send(ws, { type: 'error', payload: { message: 'check-cdp: pageUrl is required' } });
28
- return;
29
- }
30
- const effectiveCdpUrl = extras?.cdpPort
31
- ? `http://localhost:${extras.cdpPort}`
32
- : cdpUrl;
33
- const status = await checkCdpStatus(effectiveCdpUrl, pageUrl);
34
- send(ws, { type: 'cdp-status', payload: status });
35
- }
36
13
  /**
37
14
  * Launch a debug Chrome navigated to `pageUrl`, then re-check status. The
38
15
  * re-check usually returns 'wrong-window' (because the widget asking is in
@@ -48,7 +25,7 @@ export async function handleLaunchChrome(ws, msg, cdpUrl, extras) {
48
25
  // Tell the widget we're launching so it can render a spinner immediately —
49
26
  // findChromeBinary + spawn + ready-poll can take a few seconds.
50
27
  send(ws, { type: 'cdp-status', payload: { state: 'no-cdp', launching: true } });
51
- const port = extras?.cdpPort ?? (() => {
28
+ const port = (() => {
52
29
  try {
53
30
  return Number(new URL(cdpUrl).port) || 9222;
54
31
  }
@@ -59,39 +36,15 @@ export async function handleLaunchChrome(ws, msg, cdpUrl, extras) {
59
36
  const result = await launchDebugChrome({
60
37
  url: pageUrl,
61
38
  port,
62
- userDataDir: extras?.userDataDir,
63
39
  proxy: extras?.proxy,
40
+ userDataDir: extras?.userDataDir,
41
+ headless: msg.payload?.headless === true,
42
+ force: msg.payload?.force === true,
64
43
  });
65
44
  if (!result.ok) {
66
45
  send(ws, { type: 'cdp-status', payload: { state: 'no-cdp', reason: result.reason } });
67
46
  return;
68
47
  }
69
- // Re-check status against the port we actually launched on, so a mode-
70
- // specific port (9333 for security) doesn't get probed at 9222.
71
- const effectiveCdpUrl = extras?.cdpPort
72
- ? `http://localhost:${extras.cdpPort}`
73
- : cdpUrl;
74
- const status = await checkCdpStatus(effectiveCdpUrl, pageUrl);
48
+ const status = await checkCdpStatus(cdpUrl, pageUrl);
75
49
  send(ws, { type: 'cdp-status', payload: status });
76
50
  }
77
- /**
78
- * bringToFront the debug-Chrome tab matching `pageUrl`'s origin (or open one
79
- * if none exists). Used by the wrong-window UI's "switch to debug Chrome"
80
- * button. Doesn't return cdp-status — bringToFront doesn't change anything
81
- * the widget cares about, and the widget the user is about to focus is a
82
- * different page (and will run its own check-cdp on its own ws connection).
83
- */
84
- export async function handleFocusDebug(ws, msg, cdpUrl, extras) {
85
- const pageUrl = msg.payload?.pageUrl;
86
- if (typeof pageUrl !== 'string' || !pageUrl) {
87
- send(ws, { type: 'error', payload: { message: 'focus-debug: pageUrl is required' } });
88
- return;
89
- }
90
- const effectiveCdpUrl = extras?.cdpPort
91
- ? `http://localhost:${extras.cdpPort}`
92
- : cdpUrl;
93
- const result = await focusDebugTab(effectiveCdpUrl, pageUrl);
94
- if (!result.ok) {
95
- send(ws, { type: 'error', payload: { message: `focus-debug: ${result.reason}` } });
96
- }
97
- }
@@ -1,27 +1,24 @@
1
1
  /**
2
2
  * System-prompt addendum sent to the agent on every command.
3
3
  *
4
- * Two roles:
5
- * 1. Navigation rules — the most failure-prone agent behaviours are
6
- * `browser_navigate` to same-origin paths (kills the widget) and
7
- * reading the JS bundle for credentials. We tell the agent both
8
- * mistakes by name, including the actual origin to forbid.
9
- * 2. Narration format how the widget renders the run depends on the
10
- * agent emitting short imperative one-liners before each logical
11
- * step. The good/bad examples are present-tense and 3–8 words.
4
+ * Principle-first and deliberately short (v0.16 prompt-trim pass). With
5
+ * Opus 4.x, emphatic "do NOT / CRITICAL" rule-stacking over-triggers and the
6
+ * middle of a long prompt gets ignored, so behaviour is steered with a few
7
+ * stated principles each negative carrying its reason rather than an
8
+ * enumerated rule list. Ordering follows attention, not chronology: the
9
+ * highest-value instructions (verify, trust boundary, scope) sit at the top,
10
+ * the volatile tab snapshot at the very bottom.
12
11
  *
13
12
  * Lives in its own file because this string is the most-tuned text in the
14
- * repo and the easiest to break with a typo. Tests can import directly.
13
+ * repo and the easiest to break with a typo. Tests import it directly.
15
14
  *
16
- * Two-tier split (since v0.4.x perf pass):
17
- * - `buildCdpHint(tabs)` returns the full rules + narration block.
18
- * Used on the *first* turn of a session (no `--resume`).
19
- * - `buildCdpHintResume(tabs)` returns ONLY the volatile tab list +
20
- * active-origin guard. Used on subsequent turns once `--resume`
21
- * re-anchors the agent to the prior turn's full system prompt
22
- * the stable rules are already in context, so re-sending them
23
- * fragments Anthropic's prompt cache and bills ~500 extra input
24
- * tokens per turn for zero behavioural change.
15
+ * Two-tier split (prompt-cache aware):
16
+ * - `buildCdpHint(tabs)`: the full block. First turn of a session (no
17
+ * `--resume`).
18
+ * - `buildCdpHintResume(tabs)`: ONLY the volatile tab list — the rules
19
+ * persist in the agent's context from turn 1. Re-sending the stable rules
20
+ * each turn would fragment Anthropic's prompt cache and bill ~500 extra
21
+ * input tokens per turn for zero behavioural change.
25
22
  */
26
23
  interface Tab {
27
24
  url: string;
@@ -32,16 +29,12 @@ export declare function buildCdpHint(tabs: Tab[]): string;
32
29
  * Volatile-only hint for `--resume` turns: just the tab list snapshot.
33
30
  * Empty string when the tab list is empty (nothing to refresh).
34
31
  *
35
- * The rules and narration format from `buildCdpHint` are already
36
- * established in the prior turn's context; re-sending them here would
37
- * fragment Anthropic's prompt-cache fingerprint (cache hits require the
38
- * system prompt to match byte-for-byte across turns) and bill ~500
39
- * extra input tokens per follow-up turn for no behaviour change.
40
- *
41
- * We DO re-send the tab list because it can drift between turns (user
42
- * opens a second tab, switches focus). The active-origin nav-guard is
43
- * not repeated — the agent has it from turn 1 and the tab-list update
44
- * keeps it grounded in the current URL.
32
+ * The rules and narration format from `buildCdpHint` are already established
33
+ * in the prior turn's context; re-sending them here would fragment Anthropic's
34
+ * prompt-cache fingerprint (cache hits require the system prompt to match
35
+ * byte-for-byte across turns) and bill ~500 extra input tokens per follow-up
36
+ * turn for no behaviour change. We DO re-send the tab list because it drifts
37
+ * between turns (user opens a second tab, switches focus).
45
38
  */
46
39
  export declare function buildCdpHintResume(tabs: Tab[]): string;
47
40
  export {};
@@ -1 +1 @@
1
- {"version":3,"file":"cdpHint.d.ts","sourceRoot":"","sources":["../../src/service/cdpHint.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AAEH,UAAU,GAAG;IAAG,GAAG,EAAE,MAAM,CAAC;IAAC,KAAK,CAAC,EAAE,MAAM,CAAA;CAAE;AAa7C,wBAAgB,YAAY,CAAC,IAAI,EAAE,GAAG,EAAE,GAAG,MAAM,CAmJhD;AAED;;;;;;;;;;;;;;GAcG;AACH,wBAAgB,kBAAkB,CAAC,IAAI,EAAE,GAAG,EAAE,GAAG,MAAM,CAYtD"}
1
+ {"version":3,"file":"cdpHint.d.ts","sourceRoot":"","sources":["../../src/service/cdpHint.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;GAqBG;AAEH,UAAU,GAAG;IAAG,GAAG,EAAE,MAAM,CAAC;IAAC,KAAK,CAAC,EAAE,MAAM,CAAA;CAAE;AAa7C,wBAAgB,YAAY,CAAC,IAAI,EAAE,GAAG,EAAE,GAAG,MAAM,CAgGhD;AAED;;;;;;;;;;GAUG;AACH,wBAAgB,kBAAkB,CAAC,IAAI,EAAE,GAAG,EAAE,GAAG,MAAM,CAYtD"}