libretto 0.4.4 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/cli.js +20 -19
- package/dist/cli/commands/ai.js +1 -1
- package/dist/cli/commands/browser.js +3 -3
- package/dist/cli/commands/execution.js +3 -3
- package/dist/cli/commands/logs.js +1 -1
- package/dist/cli/core/browser.js +11 -6
- package/dist/cli/core/context.js +4 -18
- package/dist/cli/core/session.js +2 -2
- package/dist/cli/core/snapshot-analyzer.js +2 -2
- package/dist/cli/router.js +1 -1
- package/dist/cli/workers/run-integration-runtime.js +2 -2
- package/dist/shared/paths/paths.js +2 -1
- package/dist/shared/paths/repo-root.d.ts +3 -0
- package/dist/shared/paths/repo-root.js +24 -0
- package/package.json +6 -7
- package/scripts/postinstall.mjs +12 -3
- package/skills/libretto/SKILL.md +93 -404
- package/skills/libretto/references/auth-profiles.md +30 -0
- package/skills/libretto/references/pages-and-page-targeting.md +29 -0
- package/skills/libretto/references/reverse-engineering-network-requests.md +39 -0
- package/skills/libretto/references/user-action-log.md +31 -0
- package/src/cli/cli.ts +173 -0
- package/src/cli/commands/ai.ts +35 -0
- package/src/cli/commands/browser.ts +165 -0
- package/src/cli/commands/execution.ts +691 -0
- package/src/cli/commands/init.ts +327 -0
- package/src/cli/commands/logs.ts +128 -0
- package/src/cli/commands/shared.ts +70 -0
- package/src/cli/commands/snapshot.ts +327 -0
- package/src/cli/core/ai-config.ts +255 -0
- package/src/cli/core/api-snapshot-analyzer.ts +97 -0
- package/src/cli/core/browser.ts +839 -0
- package/src/cli/core/context.ts +122 -0
- package/src/cli/core/pause-signals.ts +35 -0
- package/src/cli/core/session-telemetry.ts +553 -0
- package/src/cli/core/session.ts +209 -0
- package/src/cli/core/snapshot-analyzer.ts +875 -0
- package/src/cli/core/snapshot-api-config.ts +236 -0
- package/src/cli/core/telemetry.ts +446 -0
- package/src/cli/framework/simple-cli.ts +1273 -0
- package/src/cli/index.ts +13 -0
- package/src/cli/router.ts +28 -0
- package/src/cli/workers/run-integration-runtime.ts +311 -0
- package/src/cli/workers/run-integration-worker-protocol.ts +14 -0
- package/src/cli/workers/run-integration-worker.ts +75 -0
- package/src/index.ts +120 -0
- package/src/runtime/download/download.ts +100 -0
- package/src/runtime/download/index.ts +7 -0
- package/src/runtime/extract/extract.ts +92 -0
- package/src/runtime/extract/index.ts +1 -0
- package/src/runtime/network/index.ts +5 -0
- package/src/runtime/network/network.ts +113 -0
- package/src/runtime/recovery/agent.ts +256 -0
- package/src/runtime/recovery/errors.ts +152 -0
- package/src/runtime/recovery/index.ts +7 -0
- package/src/runtime/recovery/recovery.ts +50 -0
- package/{dist/shared/condense-dom/condense-dom.cjs → src/shared/condense-dom/condense-dom.ts} +243 -115
- package/src/shared/config/config.ts +22 -0
- package/src/shared/config/index.ts +5 -0
- package/src/shared/debug/index.ts +1 -0
- package/src/shared/debug/pause.ts +85 -0
- package/src/shared/instrumentation/errors.ts +82 -0
- package/src/shared/instrumentation/index.ts +9 -0
- package/src/shared/instrumentation/instrument.ts +276 -0
- package/src/shared/llm/ai-sdk-adapter.ts +78 -0
- package/src/shared/llm/client.ts +217 -0
- package/src/shared/llm/index.ts +3 -0
- package/src/shared/llm/types.ts +63 -0
- package/src/shared/logger/index.ts +6 -0
- package/src/shared/logger/logger.ts +352 -0
- package/src/shared/logger/sinks.ts +144 -0
- package/src/shared/paths/paths.ts +109 -0
- package/src/shared/paths/repo-root.ts +27 -0
- package/src/shared/run/api.ts +2 -0
- package/src/shared/run/browser.ts +98 -0
- package/src/shared/state/index.ts +11 -0
- package/src/shared/state/session-state.ts +74 -0
- package/src/shared/visualization/ghost-cursor.ts +200 -0
- package/src/shared/visualization/highlight.ts +146 -0
- package/src/shared/visualization/index.ts +18 -0
- package/src/shared/workflow/workflow.ts +42 -0
- package/dist/index.cjs +0 -144
- package/dist/index.d.cts +0 -21
- package/dist/runtime/download/download.cjs +0 -70
- package/dist/runtime/download/download.d.cts +0 -35
- package/dist/runtime/download/index.cjs +0 -30
- package/dist/runtime/download/index.d.cts +0 -3
- package/dist/runtime/extract/extract.cjs +0 -88
- package/dist/runtime/extract/extract.d.cts +0 -23
- package/dist/runtime/extract/index.cjs +0 -28
- package/dist/runtime/extract/index.d.cts +0 -5
- package/dist/runtime/network/index.cjs +0 -28
- package/dist/runtime/network/index.d.cts +0 -4
- package/dist/runtime/network/network.cjs +0 -91
- package/dist/runtime/network/network.d.cts +0 -28
- package/dist/runtime/recovery/agent.cjs +0 -223
- package/dist/runtime/recovery/agent.d.cts +0 -13
- package/dist/runtime/recovery/errors.cjs +0 -124
- package/dist/runtime/recovery/errors.d.cts +0 -31
- package/dist/runtime/recovery/index.cjs +0 -34
- package/dist/runtime/recovery/index.d.cts +0 -7
- package/dist/runtime/recovery/recovery.cjs +0 -55
- package/dist/runtime/recovery/recovery.d.cts +0 -12
- package/dist/shared/condense-dom/condense-dom.d.cts +0 -34
- package/dist/shared/config/config.cjs +0 -44
- package/dist/shared/config/config.d.cts +0 -10
- package/dist/shared/config/index.cjs +0 -32
- package/dist/shared/config/index.d.cts +0 -1
- package/dist/shared/debug/index.cjs +0 -28
- package/dist/shared/debug/index.d.cts +0 -1
- package/dist/shared/debug/pause.cjs +0 -86
- package/dist/shared/debug/pause.d.cts +0 -12
- package/dist/shared/instrumentation/errors.cjs +0 -81
- package/dist/shared/instrumentation/errors.d.cts +0 -12
- package/dist/shared/instrumentation/index.cjs +0 -35
- package/dist/shared/instrumentation/index.d.cts +0 -6
- package/dist/shared/instrumentation/instrument.cjs +0 -206
- package/dist/shared/instrumentation/instrument.d.cts +0 -32
- package/dist/shared/llm/ai-sdk-adapter.cjs +0 -71
- package/dist/shared/llm/ai-sdk-adapter.d.cts +0 -22
- package/dist/shared/llm/client.cjs +0 -218
- package/dist/shared/llm/client.d.cts +0 -13
- package/dist/shared/llm/index.cjs +0 -31
- package/dist/shared/llm/index.d.cts +0 -5
- package/dist/shared/llm/types.cjs +0 -16
- package/dist/shared/llm/types.d.cts +0 -67
- package/dist/shared/logger/index.cjs +0 -37
- package/dist/shared/logger/index.d.cts +0 -2
- package/dist/shared/logger/logger.cjs +0 -232
- package/dist/shared/logger/logger.d.cts +0 -86
- package/dist/shared/logger/sinks.cjs +0 -160
- package/dist/shared/logger/sinks.d.cts +0 -9
- package/dist/shared/paths/paths.cjs +0 -104
- package/dist/shared/paths/paths.d.cts +0 -10
- package/dist/shared/run/api.cjs +0 -28
- package/dist/shared/run/api.d.cts +0 -2
- package/dist/shared/run/browser.cjs +0 -98
- package/dist/shared/run/browser.d.cts +0 -22
- package/dist/shared/state/index.cjs +0 -38
- package/dist/shared/state/index.d.cts +0 -2
- package/dist/shared/state/session-state.cjs +0 -92
- package/dist/shared/state/session-state.d.cts +0 -40
- package/dist/shared/visualization/ghost-cursor.cjs +0 -174
- package/dist/shared/visualization/ghost-cursor.d.cts +0 -37
- package/dist/shared/visualization/highlight.cjs +0 -134
- package/dist/shared/visualization/highlight.d.cts +0 -22
- package/dist/shared/visualization/index.cjs +0 -45
- package/dist/shared/visualization/index.d.cts +0 -3
- package/dist/shared/workflow/workflow.cjs +0 -47
- package/dist/shared/workflow/workflow.d.cts +0 -21
- package/skills/libretto/code-generation-rules.md +0 -223
- package/skills/libretto/integration-approach-selection.md +0 -174
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
import type { Page } from "playwright";
|
|
2
|
+
import type z from "zod";
|
|
3
|
+
import { type MinimalLogger, defaultLogger } from "../../shared/logger/logger.js";
|
|
4
|
+
import type { LLMClient } from "../../shared/llm/types.js";
|
|
5
|
+
|
|
6
|
+
/** Options accepted by `extractFromPage`. */
export type ExtractOptions<T extends z.ZodType> = {
  /** Playwright page the screenshot and DOM content are captured from. */
  page: Page;
  /** Natural-language description of what should be extracted. */
  instruction: string;
  /** Zod schema handed to the LLM client to shape the extracted object. */
  schema: T;
  /** LLM client that produces the structured result. */
  llmClient: LLMClient;
  /** Optional logger for the extraction. */
  logger?: MinimalLogger;
  /** Optional CSS selector to scope extraction to a specific element. */
  selector?: string;
};
|
|
15
|
+
|
|
16
|
+
/**
 * Generic AI-powered data extraction from a page or a single element.
 *
 * - With `selector`: waits up to 10s for the element to become visible,
 *   screenshots that element and captures its inner HTML (truncated to
 *   30,000 characters).
 * - Without `selector`: captures the page through a CDP
 *   `Page.captureScreenshot` call (default capture options) and the page
 *   HTML (truncated to 50,000 characters).
 *
 * The screenshot and, when available, the HTML are sent to the LLM, which
 * returns an object matching `schema`.
 *
 * @param options - See {@link ExtractOptions}.
 * @returns The object produced by `llmClient.generateObjectFromMessages`.
 * @throws If the element never becomes visible, the screenshot cannot be
 *   taken, or the LLM call fails. A failed DOM capture is not fatal: it is
 *   logged and the prompt is built without HTML context.
 */
export async function extractFromPage<T extends z.ZodType>(
  options: ExtractOptions<T>,
): Promise<z.infer<T>> {
  const { page, instruction, schema, selector, logger = defaultLogger, llmClient } = options;

  let screenshot: string;
  let domContent: string | undefined;

  if (selector) {
    const element = page.locator(selector);
    await element.waitFor({ state: "visible", timeout: 10_000 });

    const screenshotBuffer = await element.screenshot();
    screenshot = screenshotBuffer.toString("base64");

    try {
      domContent = await element.innerHTML();
      if (domContent.length > 30000) {
        domContent = domContent.slice(0, 30000) + "\n... [truncated]";
      }
    } catch (domError) {
      // Best effort: the screenshot alone is still usable.
      logger.warn("Failed to capture element HTML for extraction", {
        selector,
        domError: domError instanceof Error ? domError.message : String(domError),
      });
      domContent = undefined;
    }
  } else {
    const cdpClient = await page.context().newCDPSession(page);
    try {
      await cdpClient.send("Page.enable");
      const { data } = await cdpClient.send("Page.captureScreenshot", {
        format: "png",
      });
      screenshot = data;
    } finally {
      // Always release the session so repeated calls do not pile up CDP
      // sessions on the page; a detach failure must not mask the real error.
      await cdpClient.detach().catch(() => undefined);
    }

    try {
      const htmlContent = await page.content();
      domContent =
        htmlContent.length > 50000
          ? htmlContent.slice(0, 50000) + "\n... [truncated]"
          : htmlContent;
    } catch (domError) {
      // Best effort: the screenshot alone is still usable.
      logger.warn("Failed to capture page HTML for extraction", {
        domError: domError instanceof Error ? domError.message : String(domError),
      });
      domContent = undefined;
    }
  }

  const prompt = `You are analyzing a screenshot${selector ? " of a specific element" : ""} from a web page to extract structured data.

Instruction: ${instruction}

${domContent ? `Here is the HTML content for additional context:\n<html>\n${domContent}\n</html>` : ""}

Extract the requested information from the screenshot and return it in the specified format. Be precise and only extract what is visible.`;

  const result = await llmClient.generateObjectFromMessages({
    schema,
    messages: [
      {
        role: "user",
        content: [
          { type: "text", text: prompt },
          { type: "image", image: `data:image/png;base64,${screenshot}` },
        ],
      },
    ],
    temperature: 0,
  });

  logger.info("extractFromPage completed", {
    selector,
    // Keep log lines short; the full instruction is already in the prompt.
    instruction: instruction.slice(0, 100),
  });

  return result;
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export { extractFromPage, type ExtractOptions } from "./extract.js";
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
import type { Page } from "playwright";
|
|
2
|
+
import type z from "zod";
|
|
3
|
+
import type { MinimalLogger } from "../../shared/logger/logger.js";
|
|
4
|
+
|
|
5
|
+
/** Describes a single HTTP request issued by `pageRequest`. */
export type RequestConfig = {
  /** Target URL passed to `fetch()` inside the page. */
  url: string;
  /** HTTP method. */
  method?: "GET" | "POST" | "PUT" | "DELETE" | "PATCH";
  /** Request headers. */
  headers?: Record<string, string>;
  /** Request payload: an object to serialize, or a pre-serialized string. */
  body?: Record<string, any> | string;
  /** How to serialize the body. Defaults to "json". */
  bodyType?: "json" | "form";
  /** How to parse the response. Defaults to "json". */
  responseType?: "json" | "text" | "xml";
};
|
|
15
|
+
|
|
16
|
+
/** Optional behaviour switches for `pageRequest`. */
export type PageRequestOptions<T extends z.ZodType | undefined = undefined> = {
  /** Logger that receives the request log entries. */
  logger?: MinimalLogger;
  /** Optional Zod schema to validate the response body. */
  schema?: T;
};
|
|
21
|
+
|
|
22
|
+
/**
 * Result type of `pageRequest`: the schema's inferred type when a Zod schema
 * type is supplied, otherwise `any`.
 */
type PageRequestResult<T extends z.ZodType | undefined> =
  T extends z.ZodType ? z.infer<T> : any;
|
|
25
|
+
|
|
26
|
+
/**
 * Executes a fetch() call inside the browser context via page.evaluate().
 * Provides typed request config, automatic response parsing, optional Zod
 * validation, and logging.
 *
 * Body handling:
 * - `bodyType: "json"` (default): objects are `JSON.stringify`-ed, strings
 *   are sent as-is.
 * - `bodyType: "form"`: objects are URL-encoded, strings are sent as-is.
 * - A matching `Content-Type` header is added only when the caller did not
 *   already supply one (header names are compared case-insensitively).
 *
 * Response handling:
 * - `responseType: "json"` (default) parses the body as JSON; any other
 *   value returns the raw text.
 * - For non-2xx responses the body is parsed leniently, so a non-JSON error
 *   page still surfaces as the `pageRequest failed` error below.
 *
 * @param page - Playwright page whose browser context performs the fetch.
 * @param config - Request description, see {@link RequestConfig}.
 * @param options - Optional logger and Zod schema.
 * @returns The parsed response body, validated by `schema` when provided.
 * @throws Error `pageRequest failed: <method> <url> returned <status>` for
 *   non-2xx responses; a parse error when a successful response is not valid
 *   JSON; whatever `schema.parse` throws on validation failure.
 */
export async function pageRequest<T extends z.ZodType | undefined = undefined>(
  page: Page,
  config: RequestConfig,
  options?: PageRequestOptions<T>,
): Promise<PageRequestResult<T>> {
  const { url, method = "GET", headers = {}, body, bodyType = "json", responseType = "json" } = config;
  const { logger, schema } = options ?? {};

  const startTime = Date.now();

  // Build fetch options to pass into page.evaluate
  const fetchHeaders: Record<string, string> = { ...headers };
  let fetchBody: string | undefined;

  if (body !== undefined) {
    // Respect an explicit Content-Type from the caller. Adding a second one
    // under different casing would make fetch() merge both values.
    const hasContentType = Object.keys(fetchHeaders).some(
      (name) => name.toLowerCase() === "content-type",
    );

    if (bodyType === "form") {
      if (!hasContentType) {
        fetchHeaders["Content-Type"] = "application/x-www-form-urlencoded";
      }
      fetchBody =
        typeof body === "string"
          ? body
          : new URLSearchParams(
              Object.entries(body).map(([k, v]) => [k, String(v)]),
            ).toString();
    } else {
      if (!hasContentType) {
        fetchHeaders["Content-Type"] = "application/json";
      }
      fetchBody = typeof body === "string" ? body : JSON.stringify(body);
    }
  }

  // NOTE: this callback is serialized and run in the page, so it must not
  // close over anything from the surrounding scope.
  const result = await page.evaluate(
    async ({ url, method, headers, body, responseType }) => {
      const res = await fetch(url, {
        method,
        headers,
        body: body ?? undefined,
      });

      const status = res.status;
      const ok = res.ok;
      const text = await res.text();
      let data: any = text;

      if (responseType === "json") {
        if (ok) {
          // Successful responses must be valid JSON; let parse errors surface.
          data = JSON.parse(text);
        } else {
          // Error responses are often HTML or plain text. Keep the raw text
          // so the caller gets the HTTP status error, not a SyntaxError.
          try {
            data = JSON.parse(text);
          } catch {
            data = text;
          }
        }
      }

      return { status, ok, data };
    },
    { url, method, headers: fetchHeaders, body: fetchBody, responseType },
  );

  const duration = Date.now() - startTime;

  if (!result.ok) {
    const errorBody =
      typeof result.data === "string" ? result.data : JSON.stringify(result.data);
    logger?.warn("network:request:error", {
      method,
      url,
      status: result.status,
      duration,
      // Cap the logged body so a large error page cannot flood the logs.
      body: errorBody.slice(0, 500),
    });
    throw new Error(
      `pageRequest failed: ${method} ${url} returned ${result.status}`,
    );
  }

  logger?.info("network:request", {
    method,
    url,
    status: result.status,
    duration,
  });

  if (schema) {
    return schema.parse(result.data) as PageRequestResult<T>;
  }

  return result.data as PageRequestResult<T>;
}
|
|
@@ -0,0 +1,256 @@
|
|
|
1
|
+
import type { Page } from "playwright";
|
|
2
|
+
import { type MinimalLogger, defaultLogger } from "../../shared/logger/logger.js";
|
|
3
|
+
import type { LLMClient } from "../../shared/llm/types.js";
|
|
4
|
+
|
|
5
|
+
/** Browser actions understood by `executeBrowserAction`, tagged by `type`. */
type BrowserAction =
  // Pointer actions at viewport coordinates.
  | { type: "click"; x: number; y: number; button?: string }
  | { type: "double_click"; x: number; y: number }
  | { type: "move"; x: number; y: number }
  | { type: "drag"; path: { x: number; y: number }[] }
  | { type: "scroll"; x: number; y: number; scroll_x: number; scroll_y: number }
  // Keyboard actions.
  | { type: "keypress"; keys: string[] }
  | { type: "type"; text: string }
  // Actions that carry no payload.
  | { type: "wait" }
  | { type: "screenshot" }
  | { type: "done" };
|
|
22
|
+
|
|
23
|
+
/** Returns a promise that resolves (with no value) after `ms` milliseconds. */
function delay(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
|
|
26
|
+
|
|
27
|
+
/**
 * Upper-cased key aliases mapped to the key names passed to
 * `page.keyboard.press`.
 */
const KEY_MAPPINGS: Record<string, string> = {
  // Editing / whitespace
  ENTER: "Enter",
  RETURN: "Enter",
  TAB: "Tab",
  SPACE: " ",
  BACKSPACE: "Backspace",
  DELETE: "Delete",
  ESCAPE: "Escape",
  ESC: "Escape",

  // Navigation
  UP: "ArrowUp",
  DOWN: "ArrowDown",
  LEFT: "ArrowLeft",
  RIGHT: "ArrowRight",
  HOME: "Home",
  END: "End",
  PAGEUP: "PageUp",
  PAGEDOWN: "PageDown",

  // Modifiers
  CTRL: "Control",
  CONTROL: "Control",
  ALT: "Alt",
  SHIFT: "Shift",
  META: "Meta",
  CMD: "Meta",
  COMMAND: "Meta",
};

/**
 * Translates a key alias (case-insensitive) into its mapped name.
 * Keys without an entry in `KEY_MAPPINGS` are returned unchanged.
 */
function mapKeyName(key: string): string {
  const canonical = KEY_MAPPINGS[key.toUpperCase()];
  return canonical ?? key;
}
|
|
56
|
+
|
|
57
|
+
/**
 * Performs a single {@link BrowserAction} on the page using Playwright's
 * mouse and keyboard APIs, logging what was done.
 *
 * Notes on individual actions:
 * - `click`: only "right" and "middle" are forwarded as-is; every other
 *   button value (including "wheel", "back", "forward" and unknown strings)
 *   is clicked with the left button.
 * - `scroll`: moves the mouse to (x, y), then scrolls the window by
 *   (scroll_x, scroll_y).
 * - `keypress`: keys are pressed one after another, each mapped through
 *   `mapKeyName`.
 * - `wait`: pauses for two seconds.
 * - `drag`: needs at least two points; shorter paths are ignored with a
 *   warning.
 * - `screenshot` / `done`: no-ops.
 *
 * @param page - Playwright page to act on.
 * @param action - The action to perform.
 * @param logger - Logger for action messages; defaults to `defaultLogger`.
 */
async function executeBrowserAction(
  page: Page,
  action: BrowserAction,
  logger: MinimalLogger = defaultLogger,
): Promise<void> {
  switch (action.type) {
    case "click": {
      const { x, y, button = "left" } = action;
      // `button` is an arbitrary string (often LLM-produced). Whitelist what
      // Playwright accepts instead of casting unknown values through.
      const playwrightButton: "left" | "right" | "middle" =
        button === "right" || button === "middle" ? button : "left";
      await page.mouse.click(x, y, { button: playwrightButton });
      logger.info(`Clicked at (${x}, ${y}) with ${button} button`);
      break;
    }
    case "double_click": {
      const { x, y } = action;
      await page.mouse.dblclick(x, y);
      logger.info(`Double-clicked at (${x}, ${y})`);
      break;
    }
    case "scroll": {
      const { x, y, scroll_x, scroll_y } = action;
      await page.mouse.move(x, y);
      await page.evaluate(`window.scrollBy(${scroll_x}, ${scroll_y})`);
      logger.info(`Scrolled at (${x}, ${y}) by (${scroll_x}, ${scroll_y})`);
      break;
    }
    case "keypress": {
      for (const key of action.keys) {
        const mapped = mapKeyName(key);
        await page.keyboard.press(mapped);
        logger.info(`Pressed key: ${key} (mapped to ${mapped})`);
      }
      break;
    }
    case "type": {
      await page.keyboard.type(action.text);
      logger.info(`Typed text: ${action.text}`);
      break;
    }
    case "wait": {
      await delay(2000);
      logger.info("Waited 2 seconds");
      break;
    }
    case "screenshot": {
      logger.info("Screenshot action (no-op, taken automatically)");
      break;
    }
    case "drag": {
      const { path } = action;
      const start = path[0];
      const end = path[path.length - 1];
      if (path.length >= 2 && start && end) {
        await page.mouse.move(start.x, start.y);
        await page.mouse.down();
        for (const point of path.slice(1)) {
          await page.mouse.move(point.x, point.y);
        }
        await page.mouse.up();
        logger.info(`Dragged from (${start.x}, ${start.y}) to (${end.x}, ${end.y})`);
      } else {
        logger.warn("Ignoring drag action with fewer than two path points", {
          points: path.length,
        });
      }
      break;
    }
    case "move": {
      const { x, y } = action;
      await page.mouse.move(x, y);
      logger.info(`Moved mouse to (${x}, ${y})`);
      break;
    }
    case "done": {
      break;
    }
    default: {
      // Compile-time exhaustiveness check; at runtime an unexpected action
      // is reported rather than silently dropped.
      const unhandled: never = action;
      logger.warn("Ignoring unsupported browser action", { action: unhandled });
    }
  }
}
|
|
135
|
+
|
|
136
|
+
import { z } from "zod";
|
|
137
|
+
|
|
138
|
+
/**
 * Shape the LLM must return for each recovery step: free-text reasoning plus
 * one action, discriminated on `type`.
 */
const recoveryActionSchema = z.object({
  reasoning: z
    .string()
    .describe("Your reasoning about what you see and what action to take"),
  action: z.discriminatedUnion("type", [
    z.object({ type: z.literal("click"), x: z.number(), y: z.number() }),
    z.object({ type: z.literal("type"), text: z.string() }),
    z.object({ type: z.literal("keypress"), keys: z.array(z.string()) }),
    z.object({
      type: z.literal("scroll"),
      x: z.number(),
      y: z.number(),
      scroll_x: z.number(),
      scroll_y: z.number(),
    }),
    z.object({ type: z.literal("wait") }),
    z.object({ type: z.literal("done") }),
  ]),
});
|
|
171
|
+
|
|
172
|
+
/**
 * Executes a vision-based recovery agent to recover from browser automation failures.
 *
 * Runs up to three steps. Each step sends the current viewport screenshot and
 * the instruction to the LLM, executes the action it returns, waits two
 * seconds for the page to settle, and (unless it was the last step) takes a
 * fresh screenshot. The loop ends early when the LLM answers with the "done"
 * action.
 *
 * @param page - Playwright page to inspect and act on.
 * @param instruction - What the agent should try to achieve.
 * @param logger - Optional logger; `defaultLogger` is used when omitted.
 * @param llmClient - LLM client; when omitted the function returns
 *   immediately without doing anything.
 * @throws Error if the page has no viewport size, or if any screenshot
 *   cannot be taken ("Failed to take screenshot for recovery agent").
 */
export async function executeRecoveryAgent(
  page: Page,
  instruction: string,
  logger?: MinimalLogger,
  llmClient?: LLMClient,
): Promise<void> {
  if (!llmClient) {
    return;
  }
  const log = logger ?? defaultLogger;
  log.info("Executing vision-based recovery agent", { instruction });

  const viewport = page.viewportSize();
  if (!viewport) {
    throw new Error("Viewport size not found");
  }

  // Single screenshot path so the initial and follow-up captures share the
  // same timeout, logging and error.
  const captureViewport = async (): Promise<string> => {
    try {
      const png = await page.screenshot({ fullPage: false, timeout: 10000 });
      return png.toString("base64");
    } catch (screenshotError) {
      log.warn("Failed to take screenshot for recovery agent, skipping", {
        screenshotError:
          screenshotError instanceof Error
            ? screenshotError.message
            : String(screenshotError),
      });
      throw new Error("Failed to take screenshot for recovery agent");
    }
  };

  let screenshot = await captureViewport();

  const maxSteps = 3;
  for (let step = 1; step <= maxSteps; step++) {
    const result = await llmClient.generateObjectFromMessages({
      schema: recoveryActionSchema,
      messages: [
        {
          role: "user",
          content: [
            {
              type: "text",
              text: `You are an expert browser support agent. Your job is to resolve issues when browser automation encounters unexpected website behavior (e.g., popups blocking interaction).

Your task: ${instruction}

Viewport: ${viewport.width}x${viewport.height}px. Complete this in as few steps as possible.
Analyze the screenshot and decide what action to take. If the task is complete or no action is needed, use the "done" action type.`,
            },
            {
              type: "image",
              image: `data:image/png;base64,${screenshot}`,
            },
          ],
        },
      ],
      temperature: 0,
    });

    log.info(`Recovery step ${step}/${maxSteps}`, {
      reasoning: result.reasoning,
      action: result.action,
    });

    if (result.action.type === "done") {
      log.info("Recovery agent completed - no more actions needed");
      break;
    }

    await executeBrowserAction(page, result.action, log);
    // Give the page time to react before it is looked at (or used) again.
    await delay(2000);

    // Take new screenshot for next iteration; pointless after the last step.
    if (step < maxSteps) {
      screenshot = await captureViewport();
    }
  }

  log.info("Recovery agent execution completed");
}
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
import type { Page } from "playwright";
|
|
2
|
+
import { type MinimalLogger, defaultLogger } from "../../shared/logger/logger.js";
|
|
3
|
+
import type { LLMClient } from "../../shared/llm/types.js";
|
|
4
|
+
import { z } from "zod";
|
|
5
|
+
|
|
6
|
+
/**
 * A known, classifiable submission error that `detectSubmissionError` can
 * match against what is shown on the page.
 */
export type KnownSubmissionError = {
  /** Identifier the LLM reports back when this error is matched. */
  id: string;
  /** What the LLM should look for on screen. */
  errorPatterns: string[];
  /** Friendly message returned when this error is matched. */
  userMessage: string;
};
|
|
16
|
+
|
|
17
|
+
/** Result returned by `detectSubmissionError` when a known error matched. */
export type DetectedSubmissionError = {
  /** Always `true`. */
  matched: true;
  /** `id` of the matched `KnownSubmissionError`. */
  errorId: string;
  /** `userMessage` of the matched `KnownSubmissionError`. */
  message: string;
};
|
|
22
|
+
|
|
23
|
+
/** Shape the LLM must return when analysing a page for submission errors. */
const detectSubmissionErrorSchema = z.object({
  hasError: z.boolean().describe("Whether an error is visible on the page"),
  matchedKnownErrorId: z
    .string()
    .nullable()
    .describe("The ID of the matched known error, or null if no match"),
  errorMessage: z
    .string()
    .nullable()
    .describe("The error message visible on screen, or null if no error"),
});
|
|
34
|
+
|
|
35
|
+
/**
 * Uses screenshot + LLM vision to detect if an error occurred during a submission process.
 * Captures a screenshot via CDP, captures the page HTML (truncated to 50,000
 * characters, best effort), sends both to the LLM, and checks the answer
 * against the provided known error patterns.
 *
 * @param page - Playwright page to inspect.
 * @param error - The original error that triggered detection; re-thrown
 *   whenever no known error matches.
 * @param logContext - Context string included in the prompt and used as the
 *   log message.
 * @param llmClient - LLM client used for the analysis.
 * @param knownErrors - Known errors the LLM may match by `id`.
 * @param logger - Optional logger; `defaultLogger` is used when omitted.
 * @returns DetectedSubmissionError if a known error is matched
 * @throws The original error if no known error matches, or if the CDP
 *   screenshot cannot be taken
 */
export async function detectSubmissionError(
  page: Page,
  error: unknown,
  logContext: string,
  llmClient: LLMClient,
  knownErrors: KnownSubmissionError[] = [],
  logger?: MinimalLogger,
): Promise<DetectedSubmissionError> {
  const log = logger ?? defaultLogger;
  // Capture screenshot using CDP to handle unresponsive pages
  let screenshot: string;
  let domSnapshot: string | undefined;

  try {
    const cdpClient = await page.context().newCDPSession(page);
    try {
      await cdpClient.send("Page.enable");
      const { data } = await cdpClient.send("Page.captureScreenshot", {
        format: "png",
      });
      screenshot = data;
    } finally {
      // Release the session so repeated detections do not accumulate CDP
      // sessions; a detach failure must not mask the capture outcome.
      await cdpClient.detach().catch(() => undefined);
    }
  } catch (screenshotError) {
    log.warn(
      "Failed to take screenshot via CDP for error detection, skipping LLM analysis",
      {
        screenshotError:
          screenshotError instanceof Error
            ? screenshotError.message
            : String(screenshotError),
        originalError: error,
      },
    );
    throw error;
  }

  // Capture DOM snapshot for additional context
  try {
    const htmlContent = await page.content();
    domSnapshot =
      htmlContent.length > 50000
        ? htmlContent.slice(0, 50000) + "\n... [truncated]"
        : htmlContent;
  } catch (domError) {
    log.warn("Failed to capture DOM snapshot", {
      domError: domError instanceof Error ? domError.message : String(domError),
    });
  }

  const knownErrorsDescription =
    knownErrors.length > 0
      ? `\nKnown error patterns to look for:\n${knownErrors.map((e, i) => `${i + 1}. ID: "${e.id}" - Patterns: ${e.errorPatterns.join(", ")}`).join("\n")}\n`
      : "";

  const prompt = `You are analyzing a screenshot and DOM of a web page to detect if an error occurred during a browser automation process.

Context: ${logContext}

${knownErrorsDescription}

Analyze the screenshot and DOM snapshot to determine:
1. Is there any error message, warning, or indication of failure visible on the page?
2. If yes, does it match any of the known error patterns listed above?
3. What is the exact error message or description of the problem?

IMPORTANT:
- Look carefully for error alerts, warning banners, error modals, red text, or any indication of failure
- Check the DOM snapshot for error messages that may not be visible in the screenshot
- If you see a known error pattern, use its exact ID in matchedKnownErrorId
- If there's an error but it doesn't match any known pattern, set matchedKnownErrorId to null
- If the page looks normal with no errors, set hasError to false

${domSnapshot ? `<dom_snapshot>\n${domSnapshot}\n</dom_snapshot>` : ""}`;

  const result = await llmClient.generateObjectFromMessages({
    schema: detectSubmissionErrorSchema,
    messages: [
      {
        role: "user",
        content: [
          { type: "text", text: prompt },
          { type: "image", image: `data:image/png;base64,${screenshot}` },
        ],
      },
    ],
    temperature: 0,
  });

  if (!result.hasError) {
    log.info("No error detected by LLM", { result });
  }

  // Check if it matches a known error. The ID comes from the LLM, so it is
  // only trusted when it refers to an entry the caller actually supplied.
  if (result.matchedKnownErrorId) {
    const knownError = knownErrors.find(
      (e) => e.id === result.matchedKnownErrorId,
    );
    if (knownError) {
      log.warn(logContext, {
        error,
        browserError: result.errorMessage,
        knownErrorId: result.matchedKnownErrorId,
      });
      return {
        matched: true,
        errorId: knownError.id,
        message: knownError.userMessage,
      };
    }
  }

  // Log and re-throw for unknown errors
  log.warn(logContext, {
    error,
    browserError: result.errorMessage,
  });
  throw error;
}
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
import type { Page } from "playwright";
|
|
2
|
+
import { type MinimalLogger, defaultLogger } from "../../shared/logger/logger.js";
|
|
3
|
+
import type { LLMClient } from "../../shared/llm/types.js";
|
|
4
|
+
import { executeRecoveryAgent } from "./agent.js";
|
|
5
|
+
|
|
6
|
+
/**
 * Attempts to execute a function, and if it fails, runs popup recovery
 * (if an LLM client is provided) and retries the function once.
 *
 * No recovery is attempted, and the original error is re-thrown, when:
 * - the page reports itself closed, or the error message indicates a closed
 *   target/browser/context;
 * - no `llmClient` was provided;
 * - the recovery agent itself fails (that failure is logged, but the
 *   original action error is the one callers receive).
 *
 * @param page - Playwright page the action runs against.
 * @param fn - The action to attempt; invoked at most twice.
 * @param logger - Optional logger; `defaultLogger` is used when omitted.
 * @param llmClient - Optional LLM client enabling popup recovery.
 * @returns The value returned by `fn` (first or second attempt).
 * @throws The first attempt's error in the cases above, or the second
 *   attempt's error if the retry also fails.
 */
export async function attemptWithRecovery<T>(
  page: Page,
  fn: () => Promise<T>,
  logger?: MinimalLogger,
  llmClient?: LLMClient,
): Promise<T> {
  const log = logger ?? defaultLogger;
  try {
    return await fn();
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);

    // Don't attempt recovery if the browser/page is closed
    const closedByMessage =
      error instanceof Error &&
      (message.includes("Target closed") ||
        message.includes("browser has been closed") ||
        message.includes("context or browser has been closed"));
    if (closedByMessage || page.isClosed()) {
      log.warn("Page/browser has been closed, cannot recover", {
        error: message,
      });
      throw error;
    }

    if (!llmClient) {
      throw error;
    }

    log.info("Action failed, attempting popup recovery", {
      error: message,
    });

    try {
      await executeRecoveryAgent(
        page,
        "Look at the page to see if there is a popup blocking the screen. If so, close the popup.",
        log,
        llmClient,
      );
    } catch (recoveryError) {
      // A broken recovery run must not hide why the action failed.
      log.warn("Popup recovery failed, rethrowing original error", {
        recoveryError:
          recoveryError instanceof Error
            ? recoveryError.message
            : String(recoveryError),
      });
      throw error;
    }

    return await fn();
  }
}
|