libretto 0.6.21 → 0.6.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -1
- package/README.template.md +5 -1
- package/dist/cli/commands/execution.js +8 -1
- package/dist/cli/core/browser.js +8 -3
- package/dist/cli/core/daemon/daemon.js +8 -6
- package/dist/cli/core/providers/kernel.js +107 -29
- package/dist/cli/core/providers/steel.js +10 -1
- package/dist/index.d.ts +3 -2
- package/dist/index.js +15 -1
- package/dist/runtime/recovery/agent.d.ts +50 -2
- package/dist/runtime/recovery/agent.js +159 -45
- package/dist/runtime/recovery/index.d.ts +2 -1
- package/dist/runtime/recovery/index.js +16 -2
- package/dist/runtime/recovery/page-fallbacks.d.ts +45 -0
- package/dist/runtime/recovery/page-fallbacks.js +389 -0
- package/dist/shared/state/index.d.ts +1 -1
- package/dist/shared/state/session-state.d.ts +4 -1
- package/dist/shared/state/session-state.js +2 -1
- package/dist/shared/workflow/workflow.d.ts +19 -6
- package/dist/shared/workflow/workflow.js +38 -9
- package/docs/reference/runtime/page-fallbacks.mdx +85 -0
- package/docs/understand-libretto/error-handling-and-recovery.mdx +45 -0
- package/package.json +4 -12
- package/skills/libretto/SKILL.md +8 -2
- package/skills/libretto/references/code-generation-rules.md +23 -6
- package/skills/libretto-readonly/SKILL.md +1 -1
- package/src/cli/commands/execution.ts +8 -1
- package/src/cli/core/browser.ts +7 -2
- package/src/cli/core/daemon/daemon.ts +9 -4
- package/src/cli/core/daemon/ipc.ts +1 -0
- package/src/cli/core/providers/kernel.ts +153 -29
- package/src/cli/core/providers/steel.ts +11 -1
- package/src/cli/core/providers/types.ts +3 -0
- package/src/index.ts +22 -2
- package/src/runtime/recovery/agent.ts +227 -50
- package/src/runtime/recovery/index.ts +21 -1
- package/src/runtime/recovery/page-fallbacks.ts +527 -0
- package/src/shared/state/index.ts +1 -0
- package/src/shared/state/session-state.ts +2 -0
- package/src/shared/workflow/workflow.ts +90 -20
|
@@ -5,7 +5,7 @@ import {
|
|
|
5
5
|
} from "../../shared/logger/logger.js";
|
|
6
6
|
import { generateObject, type LanguageModel } from "ai";
|
|
7
7
|
|
|
8
|
-
type BrowserAction =
|
|
8
|
+
export type BrowserAction =
|
|
9
9
|
| { type: "click"; x: number; y: number; button?: string }
|
|
10
10
|
| { type: "double_click"; x: number; y: number }
|
|
11
11
|
| {
|
|
@@ -23,6 +23,35 @@ type BrowserAction =
|
|
|
23
23
|
| { type: "move"; x: number; y: number }
|
|
24
24
|
| { type: "done" };
|
|
25
25
|
|
|
26
|
+
/**
 * One recorded iteration of the recovery loop.
 */
export type RecoveryAgentStep = {
  /** Position of this step within the run; the recovery loop counts from 1. */
  step: number;
  /** The model's explanation of what it saw and why it chose the action. */
  reasoning: string;
  /** The browser action recorded for this step. */
  action: BrowserAction;
};
|
|
31
|
+
|
|
32
|
+
/**
 * Overall outcome of a recovery run.
 */
export type RecoveryAgentStatus =
  // No steps were recorded.
  | "skipped"
  // The only recorded steps were "done" actions.
  | "no-action-needed"
  // At least one non-"done" action was recorded and the final step was "done".
  | "action-taken"
  // Steps were recorded but the final one was not "done".
  | "incomplete";
|
|
37
|
+
|
|
38
|
+
/**
 * Summary returned by the recovery agent.
 */
export type RecoveryAgentResult = {
  /** Overall outcome of the run. */
  status: RecoveryAgentStatus;
  /** Every step that was recorded, in execution order. */
  steps: RecoveryAgentStep[];
};
|
|
42
|
+
|
|
43
|
+
/**
 * Pixel size of an image.
 */
type ImageDimensions = {
  /** Image width in pixels. */
  width: number;
  /** Image height in pixels. */
  height: number;
};
|
|
47
|
+
|
|
48
|
+
/**
 * Factors and bounds used to map screenshot pixel coordinates onto the page
 * viewport.
 */
type CoordinateScale = {
  /** Multiplier applied to horizontal coordinates and deltas. */
  scaleX: number;
  /** Multiplier applied to vertical coordinates and deltas. */
  scaleY: number;
  /** Viewport width, used as the horizontal clamping bound. */
  viewportWidth: number;
  /** Viewport height, used as the vertical clamping bound. */
  viewportHeight: number;
};
|
|
54
|
+
|
|
26
55
|
/**
 * Resolves after roughly `ms` milliseconds.
 *
 * @param ms - Time to wait, in milliseconds.
 * @returns A promise that settles (with no value) once the timer fires.
 */
function delay(ms: number): Promise<void> {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
|
|
@@ -57,6 +86,101 @@ function mapKeyName(key: string): string {
|
|
|
57
86
|
return KEY_MAPPINGS[key.toUpperCase()] ?? key;
|
|
58
87
|
}
|
|
59
88
|
|
|
89
|
+
/**
 * Restricts `value` to the inclusive range [`min`, `max`].
 *
 * The lower bound is applied first and the upper bound last, so when `min`
 * is greater than `max` the result is `max`.
 *
 * @param value - The number to restrict.
 * @param min - Lower bound.
 * @param max - Upper bound.
 * @returns The restricted number.
 */
function clamp(value: number, min: number, max: number): number {
  const raisedToMin = Math.max(value, min);
  return Math.min(raisedToMin, max);
}
|
|
92
|
+
|
|
93
|
+
/**
 * Maps a point from screenshot pixel coordinates to viewport coordinates.
 *
 * Each coordinate is multiplied by its scale factor and then clamped to
 * `[0, viewport size - 1]`. The upper bound is floored at 0, so a zero-sized
 * viewport yields 0 rather than a negative bound.
 *
 * @param x - Horizontal coordinate in screenshot pixels.
 * @param y - Vertical coordinate in screenshot pixels.
 * @param scale - Scale factors and viewport bounds to apply.
 * @returns The scaled and clamped point.
 */
function scalePoint(
  x: number,
  y: number,
  scale: CoordinateScale,
): { x: number; y: number } {
  const { scaleX, scaleY, viewportWidth, viewportHeight } = scale;
  const maxX = Math.max(viewportWidth - 1, 0);
  const maxY = Math.max(viewportHeight - 1, 0);
  return {
    x: clamp(x * scaleX, 0, maxX),
    y: clamp(y * scaleY, 0, maxY),
  };
}
|
|
103
|
+
|
|
104
|
+
/**
 * Returns a copy of `action` with its coordinates converted from screenshot
 * pixels to viewport coordinates.
 *
 * Actions that carry no coordinates are returned unchanged (same object).
 *
 * @param action - Action expressed in screenshot pixel coordinates.
 * @param scale - Scale factors and viewport bounds to apply.
 * @returns The action expressed in viewport coordinates.
 */
function scaleBrowserAction(
  action: BrowserAction,
  scale: CoordinateScale,
): BrowserAction {
  switch (action.type) {
    // Pointer actions that carry a single target point.
    case "click":
    case "double_click":
    case "move":
      return { ...action, ...scalePoint(action.x, action.y, scale) };
    case "scroll": {
      const { scaleX, scaleY } = scale;
      return {
        ...action,
        ...scalePoint(action.x, action.y, scale),
        // Scroll deltas are distances, so they are scaled but not clamped.
        scroll_x: action.scroll_x * scaleX,
        scroll_y: action.scroll_y * scaleY,
      };
    }
    case "drag": {
      const path = action.path.map(({ x, y }) => scalePoint(x, y, scale));
      return { ...action, path };
    }
    // Nothing positional to convert.
    case "keypress":
    case "type":
    case "wait":
    case "screenshot":
    case "done":
      return action;
  }
}
|
|
143
|
+
|
|
144
|
+
/**
 * Reads the pixel dimensions of a PNG image from its IHDR header.
 *
 * @param buffer - Raw bytes of the image.
 * @returns The width and height stored in the IHDR chunk.
 * @throws Error if the buffer does not start with the PNG signature, is too
 *   short to contain the IHDR dimensions, does not start with an IHDR chunk,
 *   or declares a zero width or height.
 */
function readPngDimensions(buffer: Buffer): ImageDimensions {
  const pngSignature = "89504e470d0a1a0a";
  if (buffer.subarray(0, 8).toString("hex") !== pngSignature) {
    throw new Error("Recovery screenshot is not a PNG image.");
  }

  // Signature (8) + chunk length (4) + chunk type (4) + width (4) + height (4).
  const minimumHeaderLength = 24;
  if (
    buffer.length < minimumHeaderLength ||
    buffer.toString("latin1", 12, 16) !== "IHDR"
  ) {
    // Fail with a descriptive error instead of an out-of-range read.
    throw new Error(
      "Recovery screenshot has a truncated or malformed PNG header.",
    );
  }

  const width = buffer.readUInt32BE(16);
  const height = buffer.readUInt32BE(20);
  // Zero is not a valid PNG dimension, and callers divide by these values.
  if (width === 0 || height === 0) {
    throw new Error("Recovery screenshot has zero width or height.");
  }

  return { width, height };
}
|
|
155
|
+
|
|
156
|
+
/**
 * Captures the current viewport as a screenshot and works out how to map
 * screenshot pixel coordinates back onto the viewport.
 *
 * @param page - Page to capture.
 * @returns The screenshot bytes, the dimensions read from the image header,
 *   and the scale (viewport size divided by image size, plus the viewport
 *   bounds).
 * @throws Error if the page reports no viewport size. Errors from taking the
 *   screenshot or reading its dimensions also propagate.
 */
async function takeViewportScreenshot(page: Page): Promise<{
  screenshot: Buffer;
  dimensions: ImageDimensions;
  scale: CoordinateScale;
}> {
  const viewportSize = page.viewportSize();
  if (!viewportSize) {
    throw new Error("Viewport size not found");
  }
  const { width: viewportWidth, height: viewportHeight } = viewportSize;

  const screenshot = await page.screenshot({
    fullPage: false,
    scale: "css",
    timeout: 10000,
  });
  const dimensions = readPngDimensions(screenshot);

  // The ratio is computed from the actual image size rather than assumed.
  const scale: CoordinateScale = {
    scaleX: viewportWidth / dimensions.width,
    scaleY: viewportHeight / dimensions.height,
    viewportWidth,
    viewportHeight,
  };

  return { screenshot, dimensions, scale };
}
|
|
183
|
+
|
|
60
184
|
async function executeBrowserAction(
|
|
61
185
|
page: Page,
|
|
62
186
|
action: BrowserAction,
|
|
@@ -144,36 +268,81 @@ const recoveryActionSchema = z.object({
|
|
|
144
268
|
reasoning: z
|
|
145
269
|
.string()
|
|
146
270
|
.describe("Your reasoning about what you see and what action to take"),
|
|
147
|
-
action: z.
|
|
148
|
-
z
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
}),
|
|
168
|
-
z.object({
|
|
169
|
-
type: z.literal("wait"),
|
|
170
|
-
}),
|
|
171
|
-
z.object({
|
|
172
|
-
type: z.literal("done"),
|
|
173
|
-
}),
|
|
174
|
-
]),
|
|
271
|
+
action: z.object({
|
|
272
|
+
type: z
|
|
273
|
+
.enum(["click", "type", "keypress", "scroll", "wait", "done"])
|
|
274
|
+
.describe("The browser action to execute."),
|
|
275
|
+
x: z
|
|
276
|
+
.number()
|
|
277
|
+
.nullable()
|
|
278
|
+
.describe("The screenshot pixel x coordinate for click/scroll."),
|
|
279
|
+
y: z
|
|
280
|
+
.number()
|
|
281
|
+
.nullable()
|
|
282
|
+
.describe("The screenshot pixel y coordinate for click/scroll."),
|
|
283
|
+
text: z.string().nullable().describe("Text for type actions."),
|
|
284
|
+
keys: z
|
|
285
|
+
.array(z.string())
|
|
286
|
+
.nullable()
|
|
287
|
+
.describe("Keys for keypress actions."),
|
|
288
|
+
scroll_x: z.number().nullable().describe("Horizontal scroll delta."),
|
|
289
|
+
scroll_y: z.number().nullable().describe("Vertical scroll delta."),
|
|
290
|
+
}),
|
|
175
291
|
});
|
|
176
292
|
|
|
293
|
+
/**
 * Unwraps a nullable numeric field of a recovery action.
 *
 * @param value - The field value; `null` means the field was not supplied.
 * @param field - Field name, used in the error message.
 * @returns `value` when it is a number.
 * @throws Error naming the missing field when `value` is not a number.
 */
function numberOrThrow(value: number | null, field: string): number {
  if (typeof value !== "number") {
    throw new Error(`Recovery action is missing ${field}.`);
  }
  return value;
}
|
|
297
|
+
|
|
298
|
+
/**
 * Converts the flat, nullable action object produced by the model into a
 * typed `BrowserAction` containing only the fields its action type uses.
 *
 * Missing text becomes an empty string and missing keys become an empty
 * array. Missing coordinates or scroll deltas are treated as errors.
 *
 * @param action - The `action` object parsed against the recovery schema.
 * @returns The equivalent `BrowserAction`.
 * @throws Error if a click or scroll action lacks a required numeric field.
 */
function normalizeRecoveryAction(
  action: z.infer<typeof recoveryActionSchema>["action"],
): BrowserAction {
  const { type, x, y, text, keys, scroll_x, scroll_y } = action;

  // Evaluated lazily so a missing coordinate only fails for actions that
  // need one; x is always validated before y.
  const requirePoint = (): { x: number; y: number } => ({
    x: numberOrThrow(x, "x"),
    y: numberOrThrow(y, "y"),
  });

  switch (type) {
    case "click":
      return { type: "click", ...requirePoint() };
    case "type":
      return { type: "type", text: text ?? "" };
    case "keypress":
      return { type: "keypress", keys: keys ?? [] };
    case "scroll":
      return {
        type: "scroll",
        ...requirePoint(),
        scroll_x: numberOrThrow(scroll_x, "scroll_x"),
        scroll_y: numberOrThrow(scroll_y, "scroll_y"),
      };
    case "wait":
      return { type: "wait" };
    case "done":
      return { type: "done" };
  }
}
|
|
326
|
+
|
|
327
|
+
/**
 * Derives the overall run status from the recorded steps.
 *
 * - no steps: "skipped"
 * - last step is not "done": "incomplete"
 * - last step is "done" and every step is "done": "no-action-needed"
 * - last step is "done" and some earlier step was not: "action-taken"
 *
 * @param steps - Steps recorded during the run, in execution order.
 * @returns The status describing the run.
 */
function getRecoveryStatus(steps: RecoveryAgentStep[]): RecoveryAgentStatus {
  if (steps.length === 0) {
    return "skipped";
  }

  const finalActionType = steps[steps.length - 1]?.action.type;
  if (finalActionType !== "done") {
    return "incomplete";
  }

  const performedAction = steps.some((entry) => entry.action.type !== "done");
  return performedAction ? "action-taken" : "no-action-needed";
}
|
|
341
|
+
|
|
342
|
+
/**
 * Default cap on recovery steps, where one step is a full
 * screenshot -> model decision -> browser action cycle.
 *
 * Three is enough for common popup flows such as close/confirm/done while
 * keeping the cost bounded.
 */
const DEFAULT_RECOVERY_MAX_STEPS = 3;
|
|
345
|
+
|
|
177
346
|
/**
|
|
178
347
|
* Executes a vision-based recovery agent to recover from browser automation failures.
|
|
179
348
|
* Takes a screenshot, sends it to the LLM with the instruction, and executes
|
|
@@ -184,23 +353,17 @@ export async function executeRecoveryAgent(
|
|
|
184
353
|
instruction: string,
|
|
185
354
|
logger?: MinimalLogger,
|
|
186
355
|
model?: LanguageModel,
|
|
187
|
-
|
|
356
|
+
maxSteps = DEFAULT_RECOVERY_MAX_STEPS,
|
|
357
|
+
): Promise<RecoveryAgentResult> {
|
|
188
358
|
if (!model) {
|
|
189
|
-
return;
|
|
359
|
+
return { status: "skipped", steps: [] };
|
|
190
360
|
}
|
|
191
361
|
const log = logger ?? defaultLogger;
|
|
192
362
|
log.info("Executing vision-based recovery agent", { instruction });
|
|
193
363
|
|
|
194
|
-
|
|
195
|
-
if (!viewport) {
|
|
196
|
-
throw new Error("Viewport size not found");
|
|
197
|
-
}
|
|
198
|
-
|
|
199
|
-
let screenshot: string;
|
|
364
|
+
let screenshotState: Awaited<ReturnType<typeof takeViewportScreenshot>>;
|
|
200
365
|
try {
|
|
201
|
-
|
|
202
|
-
await page.screenshot({ fullPage: false, timeout: 10000 })
|
|
203
|
-
).toString("base64");
|
|
366
|
+
screenshotState = await takeViewportScreenshot(page);
|
|
204
367
|
} catch (screenshotError) {
|
|
205
368
|
log.warn("Failed to take screenshot for recovery agent, skipping", {
|
|
206
369
|
screenshotError:
|
|
@@ -211,8 +374,9 @@ export async function executeRecoveryAgent(
|
|
|
211
374
|
throw new Error("Failed to take screenshot for recovery agent");
|
|
212
375
|
}
|
|
213
376
|
|
|
214
|
-
const
|
|
377
|
+
const steps: RecoveryAgentStep[] = [];
|
|
215
378
|
for (let step = 1; step <= maxSteps; step++) {
|
|
379
|
+
const { screenshot, dimensions, scale } = screenshotState;
|
|
216
380
|
const { object: result } = await generateObject({
|
|
217
381
|
model,
|
|
218
382
|
schema: recoveryActionSchema,
|
|
@@ -226,12 +390,12 @@ export async function executeRecoveryAgent(
|
|
|
226
390
|
|
|
227
391
|
Your task: ${instruction}
|
|
228
392
|
|
|
229
|
-
|
|
393
|
+
Screenshot: ${dimensions.width}x${dimensions.height}px. Coordinates must be screenshot pixel coordinates relative to the top-left corner of the screenshot. Complete this in as few steps as possible.
|
|
230
394
|
Analyze the screenshot and decide what action to take. If the task is complete or no action is needed, use the "done" action type.`,
|
|
231
395
|
},
|
|
232
396
|
{
|
|
233
397
|
type: "image",
|
|
234
|
-
image:
|
|
398
|
+
image: screenshot,
|
|
235
399
|
},
|
|
236
400
|
],
|
|
237
401
|
},
|
|
@@ -239,24 +403,37 @@ Analyze the screenshot and decide what action to take. If the task is complete o
|
|
|
239
403
|
temperature: 0,
|
|
240
404
|
});
|
|
241
405
|
|
|
406
|
+
const imageAction = normalizeRecoveryAction(result.action);
|
|
407
|
+
const action = scaleBrowserAction(imageAction, scale);
|
|
242
408
|
log.info(`Recovery step ${step}/${maxSteps}`, {
|
|
243
409
|
reasoning: result.reasoning,
|
|
244
|
-
|
|
410
|
+
imageAction,
|
|
411
|
+
action,
|
|
412
|
+
screenshot: dimensions,
|
|
413
|
+
scale,
|
|
414
|
+
});
|
|
415
|
+
steps.push({
|
|
416
|
+
step,
|
|
417
|
+
reasoning: result.reasoning,
|
|
418
|
+
action,
|
|
245
419
|
});
|
|
246
420
|
|
|
247
|
-
if (
|
|
421
|
+
if (action.type === "done") {
|
|
248
422
|
log.info("Recovery agent completed - no more actions needed");
|
|
249
423
|
break;
|
|
250
424
|
}
|
|
251
425
|
|
|
252
|
-
await executeBrowserAction(page,
|
|
426
|
+
await executeBrowserAction(page, action, log);
|
|
253
427
|
await delay(2000);
|
|
254
428
|
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
);
|
|
429
|
+
if (step < maxSteps) {
|
|
430
|
+
screenshotState = await takeViewportScreenshot(page);
|
|
431
|
+
}
|
|
259
432
|
}
|
|
260
433
|
|
|
261
434
|
log.info("Recovery agent execution completed");
|
|
435
|
+
return {
|
|
436
|
+
status: getRecoveryStatus(steps),
|
|
437
|
+
steps,
|
|
438
|
+
};
|
|
262
439
|
}
|
|
@@ -1,7 +1,27 @@
|
|
|
1
|
-
export {
|
|
1
|
+
export {
|
|
2
|
+
executeRecoveryAgent,
|
|
3
|
+
type BrowserAction,
|
|
4
|
+
type RecoveryAgentResult,
|
|
5
|
+
type RecoveryAgentStep,
|
|
6
|
+
} from "./agent.js";
|
|
2
7
|
export { attemptWithRecovery } from "./recovery.js";
|
|
3
8
|
export {
|
|
4
9
|
detectSubmissionError,
|
|
5
10
|
type KnownSubmissionError,
|
|
6
11
|
type DetectedSubmissionError,
|
|
7
12
|
} from "./errors.js";
|
|
13
|
+
export {
|
|
14
|
+
COMPUTER_USE_RECOVERY_MODELS,
|
|
15
|
+
POPUP_RECOVERY_INSTRUCTION,
|
|
16
|
+
computerUseRecoveryAction,
|
|
17
|
+
createRecoveryPage,
|
|
18
|
+
popupRecoveryAction,
|
|
19
|
+
type ComputerUseRecoveryActionOptions,
|
|
20
|
+
type PopupRecoveryActionOptions,
|
|
21
|
+
type RecoveryActionContext,
|
|
22
|
+
type RecoveryAction,
|
|
23
|
+
type RecoveryActionHandler,
|
|
24
|
+
type RecoveryActionOptions,
|
|
25
|
+
type RecoveryActionResult,
|
|
26
|
+
type RecoveryActionTargetType,
|
|
27
|
+
} from "./page-fallbacks.js";
|