libretto 0.6.21 → 0.6.23

This diff shows the changes between two publicly available versions of the package, exactly as they were released to one of the supported public registries. It is provided for informational purposes only.
Files changed (40)
  1. package/README.md +5 -1
  2. package/README.template.md +5 -1
  3. package/dist/cli/commands/execution.js +8 -1
  4. package/dist/cli/core/browser.js +8 -3
  5. package/dist/cli/core/daemon/daemon.js +8 -6
  6. package/dist/cli/core/providers/kernel.js +107 -29
  7. package/dist/cli/core/providers/steel.js +10 -1
  8. package/dist/index.d.ts +3 -2
  9. package/dist/index.js +15 -1
  10. package/dist/runtime/recovery/agent.d.ts +50 -2
  11. package/dist/runtime/recovery/agent.js +159 -45
  12. package/dist/runtime/recovery/index.d.ts +2 -1
  13. package/dist/runtime/recovery/index.js +16 -2
  14. package/dist/runtime/recovery/page-fallbacks.d.ts +45 -0
  15. package/dist/runtime/recovery/page-fallbacks.js +389 -0
  16. package/dist/shared/state/index.d.ts +1 -1
  17. package/dist/shared/state/session-state.d.ts +4 -1
  18. package/dist/shared/state/session-state.js +2 -1
  19. package/dist/shared/workflow/workflow.d.ts +19 -6
  20. package/dist/shared/workflow/workflow.js +38 -9
  21. package/docs/reference/runtime/page-fallbacks.mdx +85 -0
  22. package/docs/understand-libretto/error-handling-and-recovery.mdx +45 -0
  23. package/package.json +4 -12
  24. package/skills/libretto/SKILL.md +8 -2
  25. package/skills/libretto/references/code-generation-rules.md +23 -6
  26. package/skills/libretto-readonly/SKILL.md +1 -1
  27. package/src/cli/commands/execution.ts +8 -1
  28. package/src/cli/core/browser.ts +7 -2
  29. package/src/cli/core/daemon/daemon.ts +9 -4
  30. package/src/cli/core/daemon/ipc.ts +1 -0
  31. package/src/cli/core/providers/kernel.ts +153 -29
  32. package/src/cli/core/providers/steel.ts +11 -1
  33. package/src/cli/core/providers/types.ts +3 -0
  34. package/src/index.ts +22 -2
  35. package/src/runtime/recovery/agent.ts +227 -50
  36. package/src/runtime/recovery/index.ts +21 -1
  37. package/src/runtime/recovery/page-fallbacks.ts +527 -0
  38. package/src/shared/state/index.ts +1 -0
  39. package/src/shared/state/session-state.ts +2 -0
  40. package/src/shared/workflow/workflow.ts +90 -20
@@ -5,7 +5,7 @@ import {
5
5
  } from "../../shared/logger/logger.js";
6
6
  import { generateObject, type LanguageModel } from "ai";
7
7
 
8
- type BrowserAction =
8
+ export type BrowserAction =
9
9
  | { type: "click"; x: number; y: number; button?: string }
10
10
  | { type: "double_click"; x: number; y: number }
11
11
  | {
@@ -23,6 +23,35 @@ type BrowserAction =
23
23
  | { type: "move"; x: number; y: number }
24
24
  | { type: "done" };
25
25
 
26
+ export type RecoveryAgentStep = {
27
+ step: number;
28
+ reasoning: string;
29
+ action: BrowserAction;
30
+ };
31
+
32
+ export type RecoveryAgentStatus =
33
+ | "skipped"
34
+ | "no-action-needed"
35
+ | "action-taken"
36
+ | "incomplete";
37
+
38
+ export type RecoveryAgentResult = {
39
+ status: RecoveryAgentStatus;
40
+ steps: RecoveryAgentStep[];
41
+ };
42
+
43
+ type ImageDimensions = {
44
+ width: number;
45
+ height: number;
46
+ };
47
+
48
+ type CoordinateScale = {
49
+ scaleX: number;
50
+ scaleY: number;
51
+ viewportWidth: number;
52
+ viewportHeight: number;
53
+ };
54
+
26
55
  function delay(ms: number): Promise<void> {
27
56
  return new Promise((resolve) => setTimeout(resolve, ms));
28
57
  }
@@ -57,6 +86,101 @@ function mapKeyName(key: string): string {
57
86
  return KEY_MAPPINGS[key.toUpperCase()] ?? key;
58
87
  }
59
88
 
89
+ function clamp(value: number, min: number, max: number): number {
90
+ return Math.min(Math.max(value, min), max);
91
+ }
92
+
93
+ function scalePoint(
94
+ x: number,
95
+ y: number,
96
+ scale: CoordinateScale,
97
+ ): { x: number; y: number } {
98
+ return {
99
+ x: clamp(x * scale.scaleX, 0, Math.max(scale.viewportWidth - 1, 0)),
100
+ y: clamp(y * scale.scaleY, 0, Math.max(scale.viewportHeight - 1, 0)),
101
+ };
102
+ }
103
+
104
+ function scaleBrowserAction(
105
+ action: BrowserAction,
106
+ scale: CoordinateScale,
107
+ ): BrowserAction {
108
+ switch (action.type) {
109
+ case "click": {
110
+ const point = scalePoint(action.x, action.y, scale);
111
+ return { ...action, ...point };
112
+ }
113
+ case "double_click": {
114
+ const point = scalePoint(action.x, action.y, scale);
115
+ return { ...action, ...point };
116
+ }
117
+ case "scroll": {
118
+ const point = scalePoint(action.x, action.y, scale);
119
+ return {
120
+ ...action,
121
+ ...point,
122
+ scroll_x: action.scroll_x * scale.scaleX,
123
+ scroll_y: action.scroll_y * scale.scaleY,
124
+ };
125
+ }
126
+ case "drag":
127
+ return {
128
+ ...action,
129
+ path: action.path.map((point) => scalePoint(point.x, point.y, scale)),
130
+ };
131
+ case "move": {
132
+ const point = scalePoint(action.x, action.y, scale);
133
+ return { ...action, ...point };
134
+ }
135
+ case "keypress":
136
+ case "type":
137
+ case "wait":
138
+ case "screenshot":
139
+ case "done":
140
+ return action;
141
+ }
142
+ }
143
+
144
+ function readPngDimensions(buffer: Buffer): ImageDimensions {
145
+ const pngSignature = "89504e470d0a1a0a";
146
+ if (buffer.subarray(0, 8).toString("hex") !== pngSignature) {
147
+ throw new Error("Recovery screenshot is not a PNG image.");
148
+ }
149
+
150
+ return {
151
+ width: buffer.readUInt32BE(16),
152
+ height: buffer.readUInt32BE(20),
153
+ };
154
+ }
155
+
156
+ async function takeViewportScreenshot(page: Page): Promise<{
157
+ screenshot: Buffer;
158
+ dimensions: ImageDimensions;
159
+ scale: CoordinateScale;
160
+ }> {
161
+ const viewport = page.viewportSize();
162
+ if (!viewport) {
163
+ throw new Error("Viewport size not found");
164
+ }
165
+
166
+ const screenshot = await page.screenshot({
167
+ fullPage: false,
168
+ scale: "css",
169
+ timeout: 10000,
170
+ });
171
+ const dimensions = readPngDimensions(screenshot);
172
+ return {
173
+ screenshot,
174
+ dimensions,
175
+ scale: {
176
+ scaleX: viewport.width / dimensions.width,
177
+ scaleY: viewport.height / dimensions.height,
178
+ viewportWidth: viewport.width,
179
+ viewportHeight: viewport.height,
180
+ },
181
+ };
182
+ }
183
+
60
184
  async function executeBrowserAction(
61
185
  page: Page,
62
186
  action: BrowserAction,
@@ -144,36 +268,81 @@ const recoveryActionSchema = z.object({
144
268
  reasoning: z
145
269
  .string()
146
270
  .describe("Your reasoning about what you see and what action to take"),
147
- action: z.discriminatedUnion("type", [
148
- z.object({
149
- type: z.literal("click"),
150
- x: z.number(),
151
- y: z.number(),
152
- }),
153
- z.object({
154
- type: z.literal("type"),
155
- text: z.string(),
156
- }),
157
- z.object({
158
- type: z.literal("keypress"),
159
- keys: z.array(z.string()),
160
- }),
161
- z.object({
162
- type: z.literal("scroll"),
163
- x: z.number(),
164
- y: z.number(),
165
- scroll_x: z.number(),
166
- scroll_y: z.number(),
167
- }),
168
- z.object({
169
- type: z.literal("wait"),
170
- }),
171
- z.object({
172
- type: z.literal("done"),
173
- }),
174
- ]),
271
+ action: z.object({
272
+ type: z
273
+ .enum(["click", "type", "keypress", "scroll", "wait", "done"])
274
+ .describe("The browser action to execute."),
275
+ x: z
276
+ .number()
277
+ .nullable()
278
+ .describe("The screenshot pixel x coordinate for click/scroll."),
279
+ y: z
280
+ .number()
281
+ .nullable()
282
+ .describe("The screenshot pixel y coordinate for click/scroll."),
283
+ text: z.string().nullable().describe("Text for type actions."),
284
+ keys: z
285
+ .array(z.string())
286
+ .nullable()
287
+ .describe("Keys for keypress actions."),
288
+ scroll_x: z.number().nullable().describe("Horizontal scroll delta."),
289
+ scroll_y: z.number().nullable().describe("Vertical scroll delta."),
290
+ }),
175
291
  });
176
292
 
293
+ function numberOrThrow(value: number | null, field: string): number {
294
+ if (typeof value === "number") return value;
295
+ throw new Error(`Recovery action is missing ${field}.`);
296
+ }
297
+
298
+ function normalizeRecoveryAction(
299
+ action: z.infer<typeof recoveryActionSchema>["action"],
300
+ ): BrowserAction {
301
+ switch (action.type) {
302
+ case "click":
303
+ return {
304
+ type: "click",
305
+ x: numberOrThrow(action.x, "x"),
306
+ y: numberOrThrow(action.y, "y"),
307
+ };
308
+ case "type":
309
+ return { type: "type", text: action.text ?? "" };
310
+ case "keypress":
311
+ return { type: "keypress", keys: action.keys ?? [] };
312
+ case "scroll":
313
+ return {
314
+ type: "scroll",
315
+ x: numberOrThrow(action.x, "x"),
316
+ y: numberOrThrow(action.y, "y"),
317
+ scroll_x: numberOrThrow(action.scroll_x, "scroll_x"),
318
+ scroll_y: numberOrThrow(action.scroll_y, "scroll_y"),
319
+ };
320
+ case "wait":
321
+ return { type: "wait" };
322
+ case "done":
323
+ return { type: "done" };
324
+ }
325
+ }
326
+
327
+ function getRecoveryStatus(steps: RecoveryAgentStep[]): RecoveryAgentStatus {
328
+ if (steps.length === 0) {
329
+ return "skipped";
330
+ }
331
+ const actionSteps = steps.filter((step) => step.action.type !== "done");
332
+ const completed = steps.at(-1)?.action.type === "done";
333
+ if (actionSteps.length === 0 && completed) {
334
+ return "no-action-needed";
335
+ }
336
+ if (completed) {
337
+ return "action-taken";
338
+ }
339
+ return "incomplete";
340
+ }
341
+
342
+ // A step is one screenshot -> model decision -> browser action cycle.
343
+ // Three covers common popup flows like close/confirm/done while bounding cost.
344
+ const DEFAULT_RECOVERY_MAX_STEPS = 3;
345
+
177
346
  /**
178
347
  * Executes a vision-based recovery agent to recover from browser automation failures.
179
348
  * Takes a screenshot, sends it to the LLM with the instruction, and executes
@@ -184,23 +353,17 @@ export async function executeRecoveryAgent(
184
353
  instruction: string,
185
354
  logger?: MinimalLogger,
186
355
  model?: LanguageModel,
187
- ): Promise<void> {
356
+ maxSteps = DEFAULT_RECOVERY_MAX_STEPS,
357
+ ): Promise<RecoveryAgentResult> {
188
358
  if (!model) {
189
- return;
359
+ return { status: "skipped", steps: [] };
190
360
  }
191
361
  const log = logger ?? defaultLogger;
192
362
  log.info("Executing vision-based recovery agent", { instruction });
193
363
 
194
- const viewport = page.viewportSize();
195
- if (!viewport) {
196
- throw new Error("Viewport size not found");
197
- }
198
-
199
- let screenshot: string;
364
+ let screenshotState: Awaited<ReturnType<typeof takeViewportScreenshot>>;
200
365
  try {
201
- screenshot = (
202
- await page.screenshot({ fullPage: false, timeout: 10000 })
203
- ).toString("base64");
366
+ screenshotState = await takeViewportScreenshot(page);
204
367
  } catch (screenshotError) {
205
368
  log.warn("Failed to take screenshot for recovery agent, skipping", {
206
369
  screenshotError:
@@ -211,8 +374,9 @@ export async function executeRecoveryAgent(
211
374
  throw new Error("Failed to take screenshot for recovery agent");
212
375
  }
213
376
 
214
- const maxSteps = 3;
377
+ const steps: RecoveryAgentStep[] = [];
215
378
  for (let step = 1; step <= maxSteps; step++) {
379
+ const { screenshot, dimensions, scale } = screenshotState;
216
380
  const { object: result } = await generateObject({
217
381
  model,
218
382
  schema: recoveryActionSchema,
@@ -226,12 +390,12 @@ export async function executeRecoveryAgent(
226
390
 
227
391
  Your task: ${instruction}
228
392
 
229
- Viewport: ${viewport.width}x${viewport.height}px. Complete this in as few steps as possible.
393
+ Screenshot: ${dimensions.width}x${dimensions.height}px. Coordinates must be screenshot pixel coordinates relative to the top-left corner of the screenshot. Complete this in as few steps as possible.
230
394
  Analyze the screenshot and decide what action to take. If the task is complete or no action is needed, use the "done" action type.`,
231
395
  },
232
396
  {
233
397
  type: "image",
234
- image: `data:image/png;base64,${screenshot}`,
398
+ image: screenshot,
235
399
  },
236
400
  ],
237
401
  },
@@ -239,24 +403,37 @@ Analyze the screenshot and decide what action to take. If the task is complete o
239
403
  temperature: 0,
240
404
  });
241
405
 
406
+ const imageAction = normalizeRecoveryAction(result.action);
407
+ const action = scaleBrowserAction(imageAction, scale);
242
408
  log.info(`Recovery step ${step}/${maxSteps}`, {
243
409
  reasoning: result.reasoning,
244
- action: result.action,
410
+ imageAction,
411
+ action,
412
+ screenshot: dimensions,
413
+ scale,
414
+ });
415
+ steps.push({
416
+ step,
417
+ reasoning: result.reasoning,
418
+ action,
245
419
  });
246
420
 
247
- if (result.action.type === "done") {
421
+ if (action.type === "done") {
248
422
  log.info("Recovery agent completed - no more actions needed");
249
423
  break;
250
424
  }
251
425
 
252
- await executeBrowserAction(page, result.action, log);
426
+ await executeBrowserAction(page, action, log);
253
427
  await delay(2000);
254
428
 
255
- // Take new screenshot for next iteration
256
- screenshot = (await page.screenshot({ fullPage: false })).toString(
257
- "base64",
258
- );
429
+ if (step < maxSteps) {
430
+ screenshotState = await takeViewportScreenshot(page);
431
+ }
259
432
  }
260
433
 
261
434
  log.info("Recovery agent execution completed");
435
+ return {
436
+ status: getRecoveryStatus(steps),
437
+ steps,
438
+ };
262
439
  }
@@ -1,7 +1,27 @@
1
- export { executeRecoveryAgent } from "./agent.js";
1
+ export {
2
+ executeRecoveryAgent,
3
+ type BrowserAction,
4
+ type RecoveryAgentResult,
5
+ type RecoveryAgentStep,
6
+ } from "./agent.js";
2
7
  export { attemptWithRecovery } from "./recovery.js";
3
8
  export {
4
9
  detectSubmissionError,
5
10
  type KnownSubmissionError,
6
11
  type DetectedSubmissionError,
7
12
  } from "./errors.js";
13
+ export {
14
+ COMPUTER_USE_RECOVERY_MODELS,
15
+ POPUP_RECOVERY_INSTRUCTION,
16
+ computerUseRecoveryAction,
17
+ createRecoveryPage,
18
+ popupRecoveryAction,
19
+ type ComputerUseRecoveryActionOptions,
20
+ type PopupRecoveryActionOptions,
21
+ type RecoveryActionContext,
22
+ type RecoveryAction,
23
+ type RecoveryActionHandler,
24
+ type RecoveryActionOptions,
25
+ type RecoveryActionResult,
26
+ type RecoveryActionTargetType,
27
+ } from "./page-fallbacks.js";