libretto 0.4.4 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (194) hide show
  1. package/README.md +106 -36
  2. package/dist/cli/cli.js +39 -113
  3. package/dist/cli/commands/ai.js +1 -1
  4. package/dist/cli/commands/browser.js +87 -60
  5. package/dist/cli/commands/execution.js +201 -88
  6. package/dist/cli/commands/init.js +30 -8
  7. package/dist/cli/commands/logs.js +5 -6
  8. package/dist/cli/commands/shared.js +30 -29
  9. package/dist/cli/commands/snapshot.js +26 -39
  10. package/dist/cli/core/ai-config.js +9 -2
  11. package/dist/cli/core/api-snapshot-analyzer.js +15 -5
  12. package/dist/cli/core/browser.js +141 -33
  13. package/dist/cli/core/context.js +7 -18
  14. package/dist/cli/core/session-telemetry.js +5 -2
  15. package/dist/cli/core/session.js +23 -10
  16. package/dist/cli/core/snapshot-analyzer.js +16 -33
  17. package/dist/cli/core/snapshot-api-config.js +2 -6
  18. package/dist/cli/core/telemetry.js +10 -2
  19. package/dist/cli/framework/simple-cli.js +45 -25
  20. package/dist/cli/router.js +14 -21
  21. package/dist/cli/workers/run-integration-runtime.js +26 -7
  22. package/dist/cli/workers/run-integration-worker-protocol.js +3 -1
  23. package/dist/cli/workers/run-integration-worker.js +1 -4
  24. package/dist/index.d.ts +1 -2
  25. package/dist/index.js +7 -10
  26. package/dist/runtime/download/download.js +5 -1
  27. package/dist/runtime/extract/extract.js +11 -2
  28. package/dist/runtime/network/network.js +8 -1
  29. package/dist/runtime/recovery/agent.js +6 -2
  30. package/dist/runtime/recovery/errors.js +3 -1
  31. package/dist/runtime/recovery/recovery.js +3 -1
  32. package/dist/shared/condense-dom/condense-dom.js +6 -13
  33. package/dist/shared/config/config.d.ts +1 -9
  34. package/dist/shared/config/config.js +0 -18
  35. package/dist/shared/config/index.d.ts +2 -1
  36. package/dist/shared/config/index.js +0 -10
  37. package/dist/shared/debug/pause.js +9 -3
  38. package/dist/shared/instrumentation/instrument.js +101 -5
  39. package/dist/shared/llm/ai-sdk-adapter.js +3 -1
  40. package/dist/shared/llm/client.js +3 -1
  41. package/dist/shared/logger/index.js +4 -1
  42. package/dist/shared/paths/paths.js +2 -1
  43. package/dist/shared/paths/repo-root.d.ts +3 -0
  44. package/dist/shared/paths/repo-root.js +24 -0
  45. package/dist/shared/run/api.js +3 -1
  46. package/dist/shared/run/browser.js +7 -2
  47. package/dist/shared/state/session-state.d.ts +2 -1
  48. package/dist/shared/state/session-state.js +5 -2
  49. package/dist/shared/visualization/ghost-cursor.js +19 -10
  50. package/dist/shared/visualization/highlight.js +9 -6
  51. package/dist/shared/workflow/workflow.d.ts +4 -5
  52. package/dist/shared/workflow/workflow.js +3 -5
  53. package/package.json +11 -8
  54. package/scripts/check-skills-sync.mjs +25 -0
  55. package/scripts/compare-eval-summary.mjs +47 -0
  56. package/scripts/postinstall.mjs +26 -17
  57. package/scripts/prepare-release.sh +97 -0
  58. package/scripts/skills-libretto.mjs +103 -0
  59. package/scripts/summarize-evals.mjs +135 -0
  60. package/scripts/sync-skills.mjs +12 -0
  61. package/skills/libretto/SKILL.md +130 -377
  62. package/skills/libretto/references/auth-profiles.md +30 -0
  63. package/skills/libretto/{code-generation-rules.md → references/code-generation-rules.md} +27 -42
  64. package/skills/libretto/references/configuration-file-reference.md +53 -0
  65. package/skills/libretto/references/pages-and-page-targeting.md +29 -0
  66. package/skills/libretto/references/site-security-review.md +143 -0
  67. package/src/cli/cli.ts +86 -0
  68. package/src/cli/commands/ai.ts +35 -0
  69. package/src/cli/commands/browser.ts +189 -0
  70. package/src/cli/commands/execution.ts +822 -0
  71. package/src/cli/commands/init.ts +350 -0
  72. package/src/cli/commands/logs.ts +128 -0
  73. package/src/cli/commands/shared.ts +69 -0
  74. package/src/cli/commands/snapshot.ts +312 -0
  75. package/src/cli/core/ai-config.ts +264 -0
  76. package/src/cli/core/api-snapshot-analyzer.ts +108 -0
  77. package/src/cli/core/browser.ts +976 -0
  78. package/src/cli/core/context.ts +127 -0
  79. package/src/cli/core/pause-signals.ts +35 -0
  80. package/src/cli/core/session-telemetry.ts +564 -0
  81. package/src/cli/core/session.ts +223 -0
  82. package/src/cli/core/snapshot-analyzer.ts +855 -0
  83. package/src/cli/core/snapshot-api-config.ts +231 -0
  84. package/src/cli/core/telemetry.ts +459 -0
  85. package/src/cli/framework/simple-cli.ts +1340 -0
  86. package/src/cli/index.ts +13 -0
  87. package/src/cli/router.ts +20 -0
  88. package/src/cli/workers/run-integration-runtime.ts +338 -0
  89. package/src/cli/workers/run-integration-worker-protocol.ts +16 -0
  90. package/src/cli/workers/run-integration-worker.ts +72 -0
  91. package/src/index.ts +127 -0
  92. package/src/runtime/download/download.ts +104 -0
  93. package/src/runtime/download/index.ts +7 -0
  94. package/src/runtime/extract/extract.ts +102 -0
  95. package/src/runtime/extract/index.ts +1 -0
  96. package/src/runtime/network/index.ts +5 -0
  97. package/src/runtime/network/network.ts +119 -0
  98. package/{dist/runtime/recovery/agent.cjs → src/runtime/recovery/agent.ts} +114 -76
  99. package/src/runtime/recovery/errors.ts +155 -0
  100. package/src/runtime/recovery/index.ts +7 -0
  101. package/src/runtime/recovery/recovery.ts +53 -0
  102. package/{dist/shared/condense-dom/condense-dom.cjs → src/shared/condense-dom/condense-dom.ts} +249 -124
  103. package/src/shared/config/config.ts +3 -0
  104. package/src/shared/config/index.ts +0 -0
  105. package/src/shared/debug/index.ts +1 -0
  106. package/src/shared/debug/pause.ts +91 -0
  107. package/src/shared/instrumentation/errors.ts +84 -0
  108. package/src/shared/instrumentation/index.ts +9 -0
  109. package/src/shared/instrumentation/instrument.ts +406 -0
  110. package/src/shared/llm/ai-sdk-adapter.ts +81 -0
  111. package/{dist/shared/llm/client.cjs → src/shared/llm/client.ts} +86 -80
  112. package/src/shared/llm/index.ts +3 -0
  113. package/src/shared/llm/types.ts +63 -0
  114. package/src/shared/logger/index.ts +13 -0
  115. package/src/shared/logger/logger.ts +358 -0
  116. package/src/shared/logger/sinks.ts +148 -0
  117. package/src/shared/paths/paths.ts +110 -0
  118. package/src/shared/paths/repo-root.ts +27 -0
  119. package/src/shared/run/api.ts +6 -0
  120. package/src/shared/run/browser.ts +107 -0
  121. package/src/shared/state/index.ts +11 -0
  122. package/src/shared/state/session-state.ts +77 -0
  123. package/src/shared/visualization/ghost-cursor.ts +213 -0
  124. package/src/shared/visualization/highlight.ts +149 -0
  125. package/src/shared/visualization/index.ts +18 -0
  126. package/src/shared/workflow/workflow.ts +36 -0
  127. package/dist/index.cjs +0 -144
  128. package/dist/index.d.cts +0 -21
  129. package/dist/runtime/download/download.cjs +0 -70
  130. package/dist/runtime/download/download.d.cts +0 -35
  131. package/dist/runtime/download/index.cjs +0 -30
  132. package/dist/runtime/download/index.d.cts +0 -3
  133. package/dist/runtime/extract/extract.cjs +0 -88
  134. package/dist/runtime/extract/extract.d.cts +0 -23
  135. package/dist/runtime/extract/index.cjs +0 -28
  136. package/dist/runtime/extract/index.d.cts +0 -5
  137. package/dist/runtime/network/index.cjs +0 -28
  138. package/dist/runtime/network/index.d.cts +0 -4
  139. package/dist/runtime/network/network.cjs +0 -91
  140. package/dist/runtime/network/network.d.cts +0 -28
  141. package/dist/runtime/recovery/agent.d.cts +0 -13
  142. package/dist/runtime/recovery/errors.cjs +0 -124
  143. package/dist/runtime/recovery/errors.d.cts +0 -31
  144. package/dist/runtime/recovery/index.cjs +0 -34
  145. package/dist/runtime/recovery/index.d.cts +0 -7
  146. package/dist/runtime/recovery/recovery.cjs +0 -55
  147. package/dist/runtime/recovery/recovery.d.cts +0 -12
  148. package/dist/shared/condense-dom/condense-dom.d.cts +0 -34
  149. package/dist/shared/config/config.cjs +0 -44
  150. package/dist/shared/config/config.d.cts +0 -10
  151. package/dist/shared/config/index.cjs +0 -32
  152. package/dist/shared/config/index.d.cts +0 -1
  153. package/dist/shared/debug/index.cjs +0 -28
  154. package/dist/shared/debug/index.d.cts +0 -1
  155. package/dist/shared/debug/pause.cjs +0 -86
  156. package/dist/shared/debug/pause.d.cts +0 -12
  157. package/dist/shared/instrumentation/errors.cjs +0 -81
  158. package/dist/shared/instrumentation/errors.d.cts +0 -12
  159. package/dist/shared/instrumentation/index.cjs +0 -35
  160. package/dist/shared/instrumentation/index.d.cts +0 -6
  161. package/dist/shared/instrumentation/instrument.cjs +0 -206
  162. package/dist/shared/instrumentation/instrument.d.cts +0 -32
  163. package/dist/shared/llm/ai-sdk-adapter.cjs +0 -71
  164. package/dist/shared/llm/ai-sdk-adapter.d.cts +0 -22
  165. package/dist/shared/llm/client.d.cts +0 -13
  166. package/dist/shared/llm/index.cjs +0 -31
  167. package/dist/shared/llm/index.d.cts +0 -5
  168. package/dist/shared/llm/types.cjs +0 -16
  169. package/dist/shared/llm/types.d.cts +0 -67
  170. package/dist/shared/logger/index.cjs +0 -37
  171. package/dist/shared/logger/index.d.cts +0 -2
  172. package/dist/shared/logger/logger.cjs +0 -232
  173. package/dist/shared/logger/logger.d.cts +0 -86
  174. package/dist/shared/logger/sinks.cjs +0 -160
  175. package/dist/shared/logger/sinks.d.cts +0 -9
  176. package/dist/shared/paths/paths.cjs +0 -104
  177. package/dist/shared/paths/paths.d.cts +0 -10
  178. package/dist/shared/run/api.cjs +0 -28
  179. package/dist/shared/run/api.d.cts +0 -2
  180. package/dist/shared/run/browser.cjs +0 -98
  181. package/dist/shared/run/browser.d.cts +0 -22
  182. package/dist/shared/state/index.cjs +0 -38
  183. package/dist/shared/state/index.d.cts +0 -2
  184. package/dist/shared/state/session-state.cjs +0 -92
  185. package/dist/shared/state/session-state.d.cts +0 -40
  186. package/dist/shared/visualization/ghost-cursor.cjs +0 -174
  187. package/dist/shared/visualization/ghost-cursor.d.cts +0 -37
  188. package/dist/shared/visualization/highlight.cjs +0 -134
  189. package/dist/shared/visualization/highlight.d.cts +0 -22
  190. package/dist/shared/visualization/index.cjs +0 -45
  191. package/dist/shared/visualization/index.d.cts +0 -3
  192. package/dist/shared/workflow/workflow.cjs +0 -47
  193. package/dist/shared/workflow/workflow.d.cts +0 -21
  194. package/skills/libretto/integration-approach-selection.md +0 -174
@@ -0,0 +1,102 @@
1
+ import type { Page } from "playwright";
2
+ import type z from "zod";
3
+ import {
4
+ type MinimalLogger,
5
+ defaultLogger,
6
+ } from "../../shared/logger/logger.js";
7
+ import type { LLMClient } from "../../shared/llm/types.js";
8
+
9
/** Options accepted by `extractFromPage`. */
export type ExtractOptions<T extends z.ZodType> = {
  /** Playwright page the screenshot and HTML are captured from. */
  page: Page;
  /** Natural-language description of what to extract; embedded in the prompt. */
  instruction: string;
  /** Zod schema handed to the LLM client to shape the returned object. */
  schema: T;
  /** Client used to run the multimodal (text + image) request. */
  llmClient: LLMClient;
  /** Logger for the completion message. Defaults to `defaultLogger`. */
  logger?: MinimalLogger;
  /** Optional CSS selector to scope extraction to a specific element. */
  selector?: string;
};

/**
 * Captures a PNG screenshot of the page through a dedicated CDP session and
 * returns it base64-encoded, as delivered by `Page.captureScreenshot`.
 *
 * The session is always detached afterwards, including when the capture
 * fails, so repeated extractions do not accumulate open CDP sessions.
 */
async function captureScreenshotViaCdp(page: Page): Promise<string> {
  const cdpClient = await page.context().newCDPSession(page);
  try {
    await cdpClient.send("Page.enable");
    const { data } = await cdpClient.send("Page.captureScreenshot", {
      format: "png",
    });
    return data;
  } finally {
    // Best-effort cleanup: a failed detach (e.g. page already closed) must not
    // mask the capture result or the capture error.
    await cdpClient.detach().catch(() => {});
  }
}

/**
 * Generic AI-powered data extraction from a page or a single element.
 *
 * Takes a screenshot (page-level via CDP, or scoped to the element matched by
 * `selector`), captures the corresponding HTML as extra context, and asks the
 * LLM to return structured data matching the provided Zod schema.
 *
 * HTML capture is best-effort: if it fails, the request is sent with the
 * screenshot only. HTML is truncated to 30,000 characters for an element and
 * 50,000 characters for a whole page.
 *
 * @param options - See {@link ExtractOptions}.
 * @returns The object produced by `llmClient.generateObjectFromMessages`.
 * @throws If the element never becomes visible within 10 seconds, if the
 *   screenshot cannot be taken, or if the LLM call fails.
 */
export async function extractFromPage<T extends z.ZodType>(
  options: ExtractOptions<T>,
): Promise<z.infer<T>> {
  const {
    page,
    instruction,
    schema,
    selector,
    logger = defaultLogger,
    llmClient,
  } = options;

  // Caps the HTML sent to the model and marks the cut so the model knows the
  // markup is incomplete.
  const truncateHtml = (html: string, limit: number): string =>
    html.length > limit ? html.slice(0, limit) + "\n... [truncated]" : html;

  let screenshot: string;
  let domContent: string | undefined;

  if (selector) {
    const element = page.locator(selector);
    await element.waitFor({ state: "visible", timeout: 10_000 });

    const screenshotBuffer = await element.screenshot();
    screenshot = screenshotBuffer.toString("base64");

    try {
      domContent = truncateHtml(await element.innerHTML(), 30000);
    } catch {
      // HTML is optional context; fall back to the screenshot alone.
      domContent = undefined;
    }
  } else {
    screenshot = await captureScreenshotViaCdp(page);

    try {
      domContent = truncateHtml(await page.content(), 50000);
    } catch {
      // HTML is optional context; fall back to the screenshot alone.
      domContent = undefined;
    }
  }

  const prompt = `You are analyzing a screenshot${selector ? " of a specific element" : ""} from a web page to extract structured data.

Instruction: ${instruction}

${domContent ? `Here is the HTML content for additional context:\n<html>\n${domContent}\n</html>` : ""}

Extract the requested information from the screenshot and return it in the specified format. Be precise and only extract what is visible.`;

  const result = await llmClient.generateObjectFromMessages({
    schema,
    messages: [
      {
        role: "user",
        content: [
          { type: "text", text: prompt },
          { type: "image", image: `data:image/png;base64,${screenshot}` },
        ],
      },
    ],
    temperature: 0,
  });

  logger.info("extractFromPage completed", {
    selector,
    // Only a prefix is logged to keep log entries short.
    instruction: instruction.slice(0, 100),
  });

  return result;
}
@@ -0,0 +1 @@
1
+ export { extractFromPage, type ExtractOptions } from "./extract.js";
@@ -0,0 +1,5 @@
1
+ export {
2
+ pageRequest,
3
+ type RequestConfig,
4
+ type PageRequestOptions,
5
+ } from "./network.js";
@@ -0,0 +1,119 @@
1
+ import type { Page } from "playwright";
2
+ import type z from "zod";
3
+ import type { MinimalLogger } from "../../shared/logger/logger.js";
4
+
5
/** Description of a single HTTP request issued from inside the page. */
export type RequestConfig = {
  /** URL passed to the page's `fetch()`. */
  url: string;
  /** HTTP method. Defaults to "GET". */
  method?: "GET" | "POST" | "PUT" | "DELETE" | "PATCH";
  /** Extra request headers. `Content-Type` is overwritten when a body is set. */
  headers?: Record<string, string>;
  /** Request body. Strings are sent as-is; objects are serialized per `bodyType`. */
  body?: Record<string, any> | string;
  /** How to serialize the body. Defaults to "json". */
  bodyType?: "json" | "form";
  /**
   * How to parse the response. Defaults to "json".
   * "text" and "xml" both return the raw response text; XML is not parsed.
   */
  responseType?: "json" | "text" | "xml";
};

/** Optional behavior for `pageRequest`. */
export type PageRequestOptions<T extends z.ZodType | undefined = undefined> = {
  /** Receives a `network:request` info entry or a `network:request:error` warning. */
  logger?: MinimalLogger;
  /** Optional Zod schema to validate the response body. */
  schema?: T;
};

/** Result type: the schema's inferred type when a schema is given, else `any`. */
type PageRequestResult<T extends z.ZodType | undefined> = T extends z.ZodType
  ? z.infer<T>
  : any;

/**
 * Executes a fetch() call inside the browser context via page.evaluate().
 * Provides typed request config, automatic response parsing, optional Zod
 * validation, and logging.
 *
 * The response body is always read as text inside the page and the HTTP
 * status is checked before any JSON parsing happens. This way a non-2xx
 * response with a non-JSON body (for example an HTML error page) is reported
 * as `pageRequest failed: <method> <url> returned <status>` instead of
 * surfacing as a JSON parse error.
 *
 * @param page - Playwright page whose browser context issues the request.
 * @param config - Request description; see {@link RequestConfig}.
 * @param options - Optional logger and Zod schema.
 * @returns The parsed body (JSON value or raw text), validated by `schema`
 *   when one is provided.
 * @throws Error when the response status is not ok, or when a "json" response
 *   body is not valid JSON. Errors thrown by `schema.parse` propagate as-is.
 */
export async function pageRequest<T extends z.ZodType | undefined = undefined>(
  page: Page,
  config: RequestConfig,
  options?: PageRequestOptions<T>,
): Promise<PageRequestResult<T>> {
  const {
    url,
    method = "GET",
    headers = {},
    body,
    bodyType = "json",
    responseType = "json",
  } = config;
  const { logger, schema } = options ?? {};

  const startTime = Date.now();

  // Build fetch options to pass into page.evaluate. Copy the headers so the
  // caller's object is never mutated.
  const fetchHeaders: Record<string, string> = { ...headers };
  let fetchBody: string | undefined;

  if (body !== undefined) {
    if (bodyType === "form") {
      fetchHeaders["Content-Type"] = "application/x-www-form-urlencoded";
      fetchBody =
        typeof body === "string"
          ? body
          : new URLSearchParams(
              Object.entries(body).map(([k, v]) => [k, String(v)]),
            ).toString();
    } else {
      fetchHeaders["Content-Type"] = "application/json";
      fetchBody = typeof body === "string" ? body : JSON.stringify(body);
    }
  }

  // Only plain, serializable values cross the page.evaluate boundary.
  const result = await page.evaluate(
    async ({ url, method, headers, body }) => {
      const res = await fetch(url, {
        method,
        headers,
        body: body ?? undefined,
      });
      return { status: res.status, ok: res.ok, text: await res.text() };
    },
    { url, method, headers: fetchHeaders, body: fetchBody },
  );

  const duration = Date.now() - startTime;

  if (!result.ok) {
    logger?.warn("network:request:error", {
      method,
      url,
      status: result.status,
      duration,
      // Log only a prefix of the body to keep entries bounded.
      body: result.text.slice(0, 500),
    });
    throw new Error(
      `pageRequest failed: ${method} ${url} returned ${result.status}`,
    );
  }

  let data: unknown = result.text;
  if (responseType === "json") {
    try {
      data = JSON.parse(result.text);
    } catch (parseError) {
      const reason =
        parseError instanceof Error ? parseError.message : String(parseError);
      logger?.warn("network:request:error", {
        method,
        url,
        status: result.status,
        duration,
        body: result.text.slice(0, 500),
      });
      throw new Error(
        `pageRequest failed: ${method} ${url} returned ${result.status} with a body that is not valid JSON (${reason})`,
      );
    }
  }

  logger?.info("network:request", {
    method,
    url,
    status: result.status,
    duration,
  });

  if (schema) {
    return schema.parse(data) as PageRequestResult<T>;
  }

  return data as PageRequestResult<T>;
}
@@ -1,32 +1,33 @@
1
- "use strict";
2
- var __defProp = Object.defineProperty;
3
- var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
4
- var __getOwnPropNames = Object.getOwnPropertyNames;
5
- var __hasOwnProp = Object.prototype.hasOwnProperty;
6
- var __export = (target, all) => {
7
- for (var name in all)
8
- __defProp(target, name, { get: all[name], enumerable: true });
9
- };
10
- var __copyProps = (to, from, except, desc) => {
11
- if (from && typeof from === "object" || typeof from === "function") {
12
- for (let key of __getOwnPropNames(from))
13
- if (!__hasOwnProp.call(to, key) && key !== except)
14
- __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
15
- }
16
- return to;
17
- };
18
- var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
19
- var agent_exports = {};
20
- __export(agent_exports, {
21
- executeRecoveryAgent: () => executeRecoveryAgent
22
- });
23
- module.exports = __toCommonJS(agent_exports);
24
- var import_logger = require("../../shared/logger/logger.js");
25
- var import_zod = require("zod");
26
- function delay(ms) {
1
+ import type { Page } from "playwright";
2
+ import {
3
+ type MinimalLogger,
4
+ defaultLogger,
5
+ } from "../../shared/logger/logger.js";
6
+ import type { LLMClient } from "../../shared/llm/types.js";
7
+
8
+ type BrowserAction =
9
+ | { type: "click"; x: number; y: number; button?: string }
10
+ | { type: "double_click"; x: number; y: number }
11
+ | {
12
+ type: "scroll";
13
+ x: number;
14
+ y: number;
15
+ scroll_x: number;
16
+ scroll_y: number;
17
+ }
18
+ | { type: "keypress"; keys: string[] }
19
+ | { type: "type"; text: string }
20
+ | { type: "wait" }
21
+ | { type: "screenshot" }
22
+ | { type: "drag"; path: { x: number; y: number }[] }
23
+ | { type: "move"; x: number; y: number }
24
+ | { type: "done" };
25
+
26
+ function delay(ms: number): Promise<void> {
27
27
  return new Promise((resolve) => setTimeout(resolve, ms));
28
28
  }
29
- const KEY_MAPPINGS = {
29
+
30
+ const KEY_MAPPINGS: Record<string, string> = {
30
31
  ENTER: "Enter",
31
32
  RETURN: "Enter",
32
33
  TAB: "Tab",
@@ -49,16 +50,25 @@ const KEY_MAPPINGS = {
49
50
  SHIFT: "Shift",
50
51
  META: "Meta",
51
52
  CMD: "Meta",
52
- COMMAND: "Meta"
53
+ COMMAND: "Meta",
53
54
  };
54
- function mapKeyName(key) {
55
+
56
+ function mapKeyName(key: string): string {
55
57
  return KEY_MAPPINGS[key.toUpperCase()] ?? key;
56
58
  }
57
- async function executeBrowserAction(page, action, logger = import_logger.defaultLogger) {
59
+
60
+ async function executeBrowserAction(
61
+ page: Page,
62
+ action: BrowserAction,
63
+ logger: MinimalLogger = defaultLogger,
64
+ ): Promise<void> {
58
65
  switch (action.type) {
59
66
  case "click": {
60
67
  const { x, y, button = "left" } = action;
61
- const playwrightButton = button === "wheel" || button === "back" || button === "forward" ? "left" : button;
68
+ const playwrightButton =
69
+ button === "wheel" || button === "back" || button === "forward"
70
+ ? ("left" as const)
71
+ : (button as "left" | "right" | "middle");
62
72
  await page.mouse.click(x, y, { button: playwrightButton });
63
73
  logger.info(`Clicked at (${x}, ${y}) with ${button} button`);
64
74
  break;
@@ -90,7 +100,7 @@ async function executeBrowserAction(page, action, logger = import_logger.default
90
100
  break;
91
101
  }
92
102
  case "wait": {
93
- await delay(2e3);
103
+ await delay(2000);
94
104
  logger.info("Waited 2 seconds");
95
105
  break;
96
106
  }
@@ -110,7 +120,9 @@ async function executeBrowserAction(page, action, logger = import_logger.default
110
120
  if (point) await page.mouse.move(point.x, point.y);
111
121
  }
112
122
  await page.mouse.up();
113
- logger.info(`Dragged from (${start.x}, ${start.y}) to (${end.x}, ${end.y})`);
123
+ logger.info(
124
+ `Dragged from (${start.x}, ${start.y}) to (${end.x}, ${end.y})`,
125
+ );
114
126
  }
115
127
  break;
116
128
  }
@@ -125,56 +137,80 @@ async function executeBrowserAction(page, action, logger = import_logger.default
125
137
  }
126
138
  }
127
139
  }
128
- const recoveryActionSchema = import_zod.z.object({
129
- reasoning: import_zod.z.string().describe("Your reasoning about what you see and what action to take"),
130
- action: import_zod.z.discriminatedUnion("type", [
131
- import_zod.z.object({
132
- type: import_zod.z.literal("click"),
133
- x: import_zod.z.number(),
134
- y: import_zod.z.number()
140
+
141
+ import { z } from "zod";
142
+
143
+ const recoveryActionSchema = z.object({
144
+ reasoning: z
145
+ .string()
146
+ .describe("Your reasoning about what you see and what action to take"),
147
+ action: z.discriminatedUnion("type", [
148
+ z.object({
149
+ type: z.literal("click"),
150
+ x: z.number(),
151
+ y: z.number(),
152
+ }),
153
+ z.object({
154
+ type: z.literal("type"),
155
+ text: z.string(),
135
156
  }),
136
- import_zod.z.object({
137
- type: import_zod.z.literal("type"),
138
- text: import_zod.z.string()
157
+ z.object({
158
+ type: z.literal("keypress"),
159
+ keys: z.array(z.string()),
139
160
  }),
140
- import_zod.z.object({
141
- type: import_zod.z.literal("keypress"),
142
- keys: import_zod.z.array(import_zod.z.string())
161
+ z.object({
162
+ type: z.literal("scroll"),
163
+ x: z.number(),
164
+ y: z.number(),
165
+ scroll_x: z.number(),
166
+ scroll_y: z.number(),
143
167
  }),
144
- import_zod.z.object({
145
- type: import_zod.z.literal("scroll"),
146
- x: import_zod.z.number(),
147
- y: import_zod.z.number(),
148
- scroll_x: import_zod.z.number(),
149
- scroll_y: import_zod.z.number()
168
+ z.object({
169
+ type: z.literal("wait"),
150
170
  }),
151
- import_zod.z.object({
152
- type: import_zod.z.literal("wait")
171
+ z.object({
172
+ type: z.literal("done"),
153
173
  }),
154
- import_zod.z.object({
155
- type: import_zod.z.literal("done")
156
- })
157
- ])
174
+ ]),
158
175
  });
159
- async function executeRecoveryAgent(page, instruction, logger, llmClient) {
176
+
177
+ /**
178
+ * Executes a vision-based recovery agent to recover from browser automation failures.
179
+ * Takes a screenshot, sends it to the LLM with the instruction, and executes
180
+ * the LLM's suggested browser actions.
181
+ */
182
+ export async function executeRecoveryAgent(
183
+ page: Page,
184
+ instruction: string,
185
+ logger?: MinimalLogger,
186
+ llmClient?: LLMClient,
187
+ ): Promise<void> {
160
188
  if (!llmClient) {
161
189
  return;
162
190
  }
163
- const log = logger ?? import_logger.defaultLogger;
191
+ const log = logger ?? defaultLogger;
164
192
  log.info("Executing vision-based recovery agent", { instruction });
193
+
165
194
  const viewport = page.viewportSize();
166
195
  if (!viewport) {
167
196
  throw new Error("Viewport size not found");
168
197
  }
169
- let screenshot;
198
+
199
+ let screenshot: string;
170
200
  try {
171
- screenshot = (await page.screenshot({ fullPage: false, timeout: 1e4 })).toString("base64");
201
+ screenshot = (
202
+ await page.screenshot({ fullPage: false, timeout: 10000 })
203
+ ).toString("base64");
172
204
  } catch (screenshotError) {
173
205
  log.warn("Failed to take screenshot for recovery agent, skipping", {
174
- screenshotError: screenshotError instanceof Error ? screenshotError.message : String(screenshotError)
206
+ screenshotError:
207
+ screenshotError instanceof Error
208
+ ? screenshotError.message
209
+ : String(screenshotError),
175
210
  });
176
211
  throw new Error("Failed to take screenshot for recovery agent");
177
212
  }
213
+
178
214
  const maxSteps = 3;
179
215
  for (let step = 1; step <= maxSteps; step++) {
180
216
  const result = await llmClient.generateObjectFromMessages({
@@ -190,34 +226,36 @@ async function executeRecoveryAgent(page, instruction, logger, llmClient) {
190
226
  Your task: ${instruction}
191
227
 
192
228
  Viewport: ${viewport.width}x${viewport.height}px. Complete this in as few steps as possible.
193
- Analyze the screenshot and decide what action to take. If the task is complete or no action is needed, use the "done" action type.`
229
+ Analyze the screenshot and decide what action to take. If the task is complete or no action is needed, use the "done" action type.`,
194
230
  },
195
231
  {
196
232
  type: "image",
197
- image: `data:image/png;base64,${screenshot}`
198
- }
199
- ]
200
- }
233
+ image: `data:image/png;base64,${screenshot}`,
234
+ },
235
+ ],
236
+ },
201
237
  ],
202
- temperature: 0
238
+ temperature: 0,
203
239
  });
240
+
204
241
  log.info(`Recovery step ${step}/${maxSteps}`, {
205
242
  reasoning: result.reasoning,
206
- action: result.action
243
+ action: result.action,
207
244
  });
245
+
208
246
  if (result.action.type === "done") {
209
247
  log.info("Recovery agent completed - no more actions needed");
210
248
  break;
211
249
  }
250
+
212
251
  await executeBrowserAction(page, result.action, log);
213
- await delay(2e3);
252
+ await delay(2000);
253
+
254
+ // Take new screenshot for next iteration
214
255
  screenshot = (await page.screenshot({ fullPage: false })).toString(
215
- "base64"
256
+ "base64",
216
257
  );
217
258
  }
259
+
218
260
  log.info("Recovery agent execution completed");
219
261
  }
220
- // Annotate the CommonJS export names for ESM import in node:
221
- 0 && (module.exports = {
222
- executeRecoveryAgent
223
- });
@@ -0,0 +1,155 @@
1
+ import type { Page } from "playwright";
2
+ import {
3
+ type MinimalLogger,
4
+ defaultLogger,
5
+ } from "../../shared/logger/logger.js";
6
+ import type { LLMClient } from "../../shared/llm/types.js";
7
+ import { z } from "zod";
8
+
9
/**
 * Known error type for classifying submission errors.
 * errorPatterns are what the LLM should look for on screen.
 * userMessage is the friendly message returned when matched.
 */
export type KnownSubmissionError = {
  /** Identifier the LLM is asked to echo back in `matchedKnownErrorId`. */
  id: string;
  /** On-screen patterns listed in the prompt for this error. */
  errorPatterns: string[];
  /** Message returned to the caller when this error is matched. */
  userMessage: string;
};

/** Result returned by `detectSubmissionError` when a known error is matched. */
export type DetectedSubmissionError = {
  matched: true;
  /** `id` of the matched `KnownSubmissionError`. */
  errorId: string;
  /** `userMessage` of the matched `KnownSubmissionError`. */
  message: string;
};

/** Shape of the structured answer requested from the LLM. */
const detectSubmissionErrorSchema = z.object({
  hasError: z.boolean().describe("Whether an error is visible on the page"),
  matchedKnownErrorId: z
    .string()
    .nullable()
    .describe("The ID of the matched known error, or null if no match"),
  errorMessage: z
    .string()
    .nullable()
    .describe("The error message visible on screen, or null if no error"),
});

/**
 * Uses screenshot + LLM vision to detect if an error occurred during a submission process.
 * Captures a screenshot via CDP (handles unresponsive pages), sends it to the LLM,
 * and checks against the provided known error patterns.
 *
 * The CDP session used for the screenshot is detached once the capture has
 * finished or failed. Whenever the analysis cannot be completed (screenshot
 * or LLM call fails), the original `error` is rethrown so callers always see
 * the failure they started with.
 *
 * @param page - Page to inspect.
 * @param error - The error that triggered detection; rethrown unless a known error matches.
 * @param logContext - Description of the operation; used in the prompt and as the log message.
 * @param llmClient - Client used for the vision request.
 * @param knownErrors - Error patterns to classify against. Defaults to none.
 * @param logger - Optional logger. Defaults to `defaultLogger`.
 * @returns DetectedSubmissionError if a known error is matched
 * @throws The original error if no known error matches
 */
export async function detectSubmissionError(
  page: Page,
  error: unknown,
  logContext: string,
  llmClient: LLMClient,
  knownErrors: KnownSubmissionError[] = [],
  logger?: MinimalLogger,
): Promise<DetectedSubmissionError> {
  const log = logger ?? defaultLogger;
  // Capture screenshot using CDP to handle unresponsive pages
  let screenshot: string;
  let domSnapshot: string | undefined;

  try {
    const cdpClient = await page.context().newCDPSession(page);
    try {
      await cdpClient.send("Page.enable");
      const { data } = await cdpClient.send("Page.captureScreenshot", {
        format: "png",
      });
      screenshot = data;
    } finally {
      // Best-effort cleanup so sessions do not pile up; a failed detach must
      // not hide the capture outcome.
      await cdpClient.detach().catch(() => {});
    }
  } catch (screenshotError) {
    log.warn(
      "Failed to take screenshot via CDP for error detection, skipping LLM analysis",
      { screenshotError, originalError: error },
    );
    throw error;
  }

  // Capture DOM snapshot for additional context
  try {
    const htmlContent = await page.content();
    domSnapshot =
      htmlContent.length > 50000
        ? htmlContent.slice(0, 50000) + "\n... [truncated]"
        : htmlContent;
  } catch (domError) {
    // The DOM is optional context; continue with the screenshot alone.
    log.warn("Failed to capture DOM snapshot", {
      domError: domError instanceof Error ? domError.message : String(domError),
    });
  }

  const knownErrorsDescription =
    knownErrors.length > 0
      ? `\nKnown error patterns to look for:\n${knownErrors.map((e, i) => `${i + 1}. ID: "${e.id}" - Patterns: ${e.errorPatterns.join(", ")}`).join("\n")}\n`
      : "";

  const prompt = `You are analyzing a screenshot and DOM of a web page to detect if an error occurred during a browser automation process.

Context: ${logContext}

${knownErrorsDescription}

Analyze the screenshot and DOM snapshot to determine:
1. Is there any error message, warning, or indication of failure visible on the page?
2. If yes, does it match any of the known error patterns listed above?
3. What is the exact error message or description of the problem?

IMPORTANT:
- Look carefully for error alerts, warning banners, error modals, red text, or any indication of failure
- Check the DOM snapshot for error messages that may not be visible in the screenshot
- If you see a known error pattern, use its exact ID in matchedKnownErrorId
- If there's an error but it doesn't match any known pattern, set matchedKnownErrorId to null
- If the page looks normal with no errors, set hasError to false

${domSnapshot ? `<dom_snapshot>\n${domSnapshot}\n</dom_snapshot>` : ""}`;

  let result;
  try {
    result = await llmClient.generateObjectFromMessages({
      schema: detectSubmissionErrorSchema,
      messages: [
        {
          role: "user",
          content: [
            { type: "text", text: prompt },
            { type: "image", image: `data:image/png;base64,${screenshot}` },
          ],
        },
      ],
      temperature: 0,
    });
  } catch (analysisError) {
    // Classification is an enrichment step: if it fails, surface the original
    // failure rather than replacing it with an LLM/transport error.
    log.warn("LLM analysis failed during error detection", {
      analysisError:
        analysisError instanceof Error
          ? analysisError.message
          : String(analysisError),
      originalError: error,
    });
    throw error;
  }

  if (!result.hasError) {
    log.info("No error detected by LLM", { result });
  }

  // Check if it matches a known error
  if (result.matchedKnownErrorId) {
    const knownError = knownErrors.find(
      (e) => e.id === result.matchedKnownErrorId,
    );
    if (knownError) {
      log.warn(logContext, {
        error,
        browserError: result.errorMessage,
        knownErrorId: result.matchedKnownErrorId,
      });
      return {
        matched: true,
        errorId: knownError.id,
        message: knownError.userMessage,
      };
    }
  }

  // Log and re-throw for unknown errors
  log.warn(logContext, {
    error,
    browserError: result.errorMessage,
  });
  throw error;
}
@@ -0,0 +1,7 @@
1
+ export { executeRecoveryAgent } from "./agent.js";
2
+ export { attemptWithRecovery } from "./recovery.js";
3
+ export {
4
+ detectSubmissionError,
5
+ type KnownSubmissionError,
6
+ type DetectedSubmissionError,
7
+ } from "./errors.js";