@onkernel/cua-agent 0.3.2 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,29 @@
1
1
  # Changelog
2
2
 
3
+ ## 0.3.4 - 2026-06-23
4
+
5
+ - Add an opt-in `playwright` option to `CuaAgent` and `CuaAgentHarness` that
6
+ exposes a `playwright_execute` tool, running Playwright/TypeScript against
7
+ the live browser session via the Kernel SDK. Results, stdout, and stderr
8
+ come back as tool content; SDK-reported failures surface as content rather
9
+ than throwing. Adds the `PlaywrightDetails` export.
10
+
11
+ ## 0.3.3 - 2026-06-12
12
+
13
+ - The action translator now consumes the canonical `CuaAction` union with an
14
+ exhaustive switch. Malformed action shapes fail loudly instead of silently
15
+ coercing (previously e.g. a click at 0,0); the documented mouse-button
16
+ coercion to `"left"` is unchanged.
17
+ - `prepareNextTurn` no longer rebuilds the turn context on every turn: it
18
+ keeps stock pi behavior until a user hook returns an update or a mid-run
19
+ model assignment requires a refresh.
20
+ - One translator instance per runtime is shared between the executor tools
21
+ and the provider screenshot capability.
22
+ - The `CuaAgentHarness` README quickstart showcases session-backed turns and
23
+ mid-session model switching; `computerUseExtra` is documented with its
24
+ rationale.
25
+ - Update the `@onkernel/cua-ai` dependency to 0.3.0.
26
+
3
27
  ## 0.3.2 - 2026-06-11
4
28
 
5
29
  - Update the `@onkernel/cua-ai` dependency to 0.2.2.
package/README.md CHANGED
@@ -37,11 +37,17 @@ await agent.prompt("Open news.ycombinator.com and summarize the top story.");
37
37
 
38
38
  ## Quick Start (`CuaAgentHarness`)
39
39
 
40
+ `prompt()` returns the turn's final assistant message, and every turn is
41
+ persisted to the session — later prompts see the full transcript. Runtime
42
+ config like the model can change between turns (or even mid-turn, applying at
43
+ the next provider request):
44
+
40
45
  ```ts
41
46
  import { CuaAgentHarness, InMemorySessionRepo, NodeExecutionEnv } from "@onkernel/cua-agent";
47
+ import type { AssistantMessage } from "@onkernel/cua-ai";
42
48
 
43
49
  const sessionRepo = new InMemorySessionRepo();
44
- const session = await sessionRepo.create({ id: "example" });
50
+ const session = await sessionRepo.create({ id: "research" });
45
51
 
46
52
  const harness = new CuaAgentHarness({
47
53
  browser,
@@ -51,22 +57,26 @@ const harness = new CuaAgentHarness({
51
57
  session,
52
58
  });
53
59
 
54
- const response = await harness.prompt("Open example.com and tell me the current URL.");
55
- const branch = await session.getBranch();
56
- const lastAssistant = [...branch]
57
- .reverse()
58
- .flatMap((entry) =>
59
- entry.type === "message" && entry.message.role === "assistant" ? [entry.message] : [],
60
- )[0];
61
- const assistant = lastAssistant ?? response;
62
- const assistantText = assistant.content
63
- .flatMap((block) => (block.type === "text" ? [block.text] : []))
64
- .join("")
65
- .trim();
66
- console.log("assistant stopReason:", assistant.stopReason);
67
- console.log("assistant text:", assistantText || "(no text)");
60
+ const textOf = (message: AssistantMessage) =>
61
+ message.content.flatMap((block) => (block.type === "text" ? [block.text] : [])).join("").trim();
62
+
63
+ // Turn 1: a session-backed prompt.
64
+ const first = await harness.prompt("Open example.com and describe what you see.");
65
+ console.log(textOf(first));
66
+
67
+ // Swap providers mid-session; CUA tools and the default prompt refresh to match.
68
+ await harness.setModel("anthropic:claude-opus-4-7");
69
+
70
+ // Turn 2 continues the same transcript on the new model.
71
+ const second = await harness.prompt("Open the most relevant link from what you found.");
72
+ console.log(textOf(second));
68
73
  ```
69
74
 
75
+ While a turn is running, `steer()` injects course corrections, `followUp()`
76
+ queues the next instruction, and `subscribe()` streams the underlying agent
77
+ events. `compact()` and session branching are available for long-running
78
+ transcripts — see the pi-agent-core docs for the full harness lifecycle.
79
+
70
80
  Use `CuaAgent` when you want direct pi `Agent` control: raw message state,
71
81
  lifecycle events, custom streaming, and explicit prompt/continue/queue control.
72
82
  Reach for the harness shape when you want an app layer around the loop:
@@ -88,6 +98,8 @@ Both classes mirror pi constructor shapes and behavior, with minimal additions:
88
98
  - CUA model refs (`"provider:model"`) accepted where pi expects a concrete model
89
99
  - `extraTools` to add your own pi tools alongside the built-in browser tools
90
100
  - `computerUseExtra: true` to let the model use a small navigation helper
101
+ - `playwright: true` to let the model run Playwright/TypeScript against the
102
+ live browser session
91
103
 
92
104
  If auth callbacks are omitted, both classes default to CUA env var conventions:
93
105
  - OpenAI: `OPENAI_API_KEY`
@@ -108,9 +120,24 @@ computer-use tools. This is useful when the model needs to call
108
120
  application-specific code, such as looking up a record, writing a database row,
109
121
  or handing off to another service while it also controls the browser.
110
122
 
111
- `computerUseExtra: true` adds the `computer_use_extra` tool. Use it when you
112
- want one compact helper for common browser navigation/read operations:
113
- `goto`, `back`, `forward`, and `url`.
123
+ Not every provider's native computer-use vocabulary includes browser
124
+ navigation — some models can click and type but have no direct way to open a
125
+ URL or go back. `computerUseExtra: true` adds `computer_use_extra`, a
126
+ provider-neutral escape hatch exposing `goto`, `back`, `forward`, and `url`
127
+ so navigation works uniformly regardless of which model is driving.
128
+
129
+ Some steps are awkward as raw pointer/keyboard actions: precise DOM reads,
130
+ form fills, data extraction, or waiting on a specific selector.
131
+ `playwright: true` adds `playwright_execute`, which runs Playwright/TypeScript
132
+ directly against the live browser session. `page`, `context`, and `browser`
133
+ are in scope and the code may `return` a JSON-serializable value. Each call
134
+ runs in a fresh JS context (locals don't persist across calls) but the
135
+ browser session does carry over. No screenshot is returned automatically;
136
+ request one on a follow-up turn when the model needs to see the page.
137
+ Playwright-level failures come back as tool content (so the model can adapt)
138
+ rather than thrown errors. Verified end-to-end
139
+ against Anthropic, Tzafon, and Yutori CUA models; OpenAI and Google are
140
+ unit-tested.
114
141
 
115
142
  ### Model Switching
116
143
 
package/dist/index.d.ts CHANGED
@@ -16,6 +16,7 @@ interface ComputerToolOptions {
16
16
  coordinateSystem?: ComputerToolCoordinateSystem;
17
17
  screenshot?: CuaScreenshotSpec;
18
18
  computerUseExtra?: boolean;
19
+ playwright?: boolean;
19
20
  }
20
21
  interface BatchDetails {
21
22
  statusText: string;
@@ -36,10 +37,34 @@ interface NavigationDetails {
36
37
  statusText: string;
37
38
  url?: string;
38
39
  }
40
+ /**
41
+ * Structured details for a `playwright_execute` tool result. Library
42
+ * consumers can read these directly instead of re-parsing the model-facing
43
+ * tool content blocks.
44
+ *
45
+ * - `success` — whether the Playwright code itself completed without error.
46
+ * A `false` value means the code threw or the SDK reported failure; in
47
+ * that case the failure is also surfaced as tool content for the model.
48
+ * - `statusText` — short human-readable status (success or failure summary).
49
+ * - `result` — present only when the code returned a JSON-serializable value.
50
+ * - `stdout`/`stderr` — raw daemon output, present whenever the daemon
51
+ * reported a non-empty value on that stream (may be whitespace-only).
52
+ * - `error` — present only when `success` is `false`; the error message from
53
+ * the daemon.
54
+ */
55
+ interface PlaywrightDetails {
56
+ success: boolean;
57
+ statusText: string;
58
+ result?: unknown;
59
+ stdout?: string;
60
+ stderr?: string;
61
+ error?: string;
62
+ }
39
63
  type BatchTool = AgentTool<TSchema, BatchDetails>;
40
64
  type NavigationTool = AgentTool<TSchema, NavigationDetails>;
65
+ type PlaywrightTool = AgentTool<TSchema, PlaywrightDetails>;
41
66
  type ActionTool = AgentTool<TSchema, BatchDetails>;
42
- type CuaExecutorTool = BatchTool | NavigationTool | ActionTool;
67
+ type CuaExecutorTool = BatchTool | NavigationTool | PlaywrightTool | ActionTool;
43
68
  declare function createCuaComputerTools(args: ComputerToolOptions): CuaExecutorTool[];
44
69
  //#endregion
45
70
  //#region src/agent.d.ts
@@ -74,7 +99,8 @@ type CuaAgentOptions = Omit<AgentOptions, "initialState"> & {
74
99
  client: Kernel; /** Initial pi state plus a CUA-aware model value. */
75
100
  initialState: CuaAgentInitialState; /** Add your own pi tools alongside the built-in browser tools. */
76
101
  extraTools?: AgentTool[]; /** Expose a helper for browser navigation and URL reads. */
77
- computerUseExtra?: boolean;
102
+ computerUseExtra?: boolean; /** Expose a tool that runs Playwright code against the browser session. */
103
+ playwright?: boolean;
78
104
  };
79
105
  /**
80
106
  * Constructor options for {@link CuaAgentHarness}.
@@ -88,7 +114,8 @@ type CuaAgentHarnessOptions<TSkill extends Skill = Skill, TPromptTemplate extend
88
114
  client: Kernel; /** Model used by the harness. CUA refs are resolved before pi sees the model. */
89
115
  model: CuaRuntimeInput; /** Add your own pi tools alongside the built-in browser tools. */
90
116
  extraTools?: AgentTool[]; /** Expose a helper for browser navigation and URL reads. */
91
- computerUseExtra?: boolean; /** Optional payload hook composed after the provider-specific CUA payload hook. */
117
+ computerUseExtra?: boolean; /** Expose a tool that runs Playwright code against the browser session. */
118
+ playwright?: boolean; /** Optional payload hook composed after the provider-specific CUA payload hook. */
92
119
  onPayload?: SimpleStreamOptions["onPayload"];
93
120
  };
94
121
  /**
@@ -102,7 +129,9 @@ type CuaAgentHarnessOptions<TSkill extends Skill = Skill, TPromptTemplate extend
102
129
  declare class CuaAgent extends Agent {
103
130
  private readonly runtime;
104
131
  private readonly ownsSystemPrompt;
132
+ private runtimeDirty;
105
133
  private stateProxy?;
134
+ private stateProxyTarget?;
106
135
  constructor(options: CuaAgentOptions);
107
136
  /**
108
137
  * Return a state proxy so `agent.state.model = "provider:model"` can behave
@@ -135,4 +164,4 @@ declare class CuaAgentHarness<TSkill extends Skill = Skill, TPromptTemplate exte
135
164
  setActiveTools(toolNames: string[]): Promise<void>;
136
165
  }
137
166
  //#endregion
138
- export { type BatchDetails, type ComputerToolOptions, CuaAgent, CuaAgentHarness, type CuaAgentHarnessOptions, type CuaAgentOptions, type CuaAgentState, type CuaExecutorTool, type KernelBrowser, type NavigationDetails, NodeExecutionEnv, createCuaComputerTools };
167
+ export { type BatchDetails, type ComputerToolOptions, CuaAgent, CuaAgentHarness, type CuaAgentHarnessOptions, type CuaAgentOptions, type CuaAgentState, type CuaExecutorTool, type KernelBrowser, type NavigationDetails, NodeExecutionEnv, type PlaywrightDetails, createCuaComputerTools };
package/dist/index.js CHANGED
@@ -1,6 +1,6 @@
1
1
  import { Agent, AgentHarness } from "@earendil-works/pi-agent-core";
2
2
  import { NodeExecutionEnv } from "@earendil-works/pi-agent-core/node";
3
- import { CUA_NAVIGATION_TOOL_NAME, createCuaNavigationToolDefinition, getCuaEnvApiKey, normalizeGotoUrl, resolveCuaRuntimeSpec, streamSimple } from "@onkernel/cua-ai";
3
+ import { CUA_NAVIGATION_TOOL_NAME, CUA_PLAYWRIGHT_TOOL_NAME, createCuaNavigationToolDefinition, createCuaPlaywrightToolDefinition, getCuaEnvApiKey, normalizeGotoUrl, resolveCuaRuntimeSpec, streamSimple } from "@onkernel/cua-ai";
4
4
  import sharp from "sharp";
5
5
  export * from "@earendil-works/pi-agent-core";
6
6
  //#region src/translator/keys.ts
@@ -156,10 +156,18 @@ var InternalComputerTranslator = class {
156
156
  async currentMousePosition() {
157
157
  const pos = await this.client.browsers.computer.getMousePosition(this.sessionId);
158
158
  return {
159
- x: toInt(pos.x),
160
- y: toInt(pos.y)
159
+ x: Math.trunc(pos.x),
160
+ y: Math.trunc(pos.y)
161
161
  };
162
162
  }
163
+ async executePlaywright(code, timeoutSec) {
164
+ const truncated = timeoutSec !== void 0 ? Math.trunc(timeoutSec) : void 0;
165
+ const timeout = truncated !== void 0 && truncated >= 1 ? Math.min(truncated, PLAYWRIGHT_MAX_TIMEOUT_SEC) : void 0;
166
+ return this.client.browsers.playwright.execute(this.sessionId, {
167
+ code,
168
+ ...timeout !== void 0 ? { timeout_sec: timeout } : {}
169
+ });
170
+ }
163
171
  async executeBatch(actions) {
164
172
  const result = { readResults: [] };
165
173
  const pending = [];
@@ -167,248 +175,210 @@ var InternalComputerTranslator = class {
167
175
  if (pending.length === 0) return;
168
176
  await this.runKernelBatch(pending.splice(0));
169
177
  };
170
- for (let i = 0; i < actions.length; i++) {
171
- const action = actions[i];
172
- const type = typeof action.type === "string" ? action.type : "";
173
- if (type === "screenshot") {
178
+ for (const action of actions) switch (action.type) {
179
+ case "screenshot":
174
180
  await flush();
175
181
  result.readResults.push({
176
182
  type: "screenshot",
177
183
  ...await this.screenshot()
178
184
  });
179
- continue;
180
- }
181
- if (type === "url") {
185
+ break;
186
+ case "url":
182
187
  await flush();
183
188
  result.readResults.push({
184
189
  type: "url",
185
190
  url: await this.currentUrl()
186
191
  });
187
- continue;
188
- }
189
- if (type === "cursor_position") {
192
+ break;
193
+ case "cursor_position":
190
194
  await flush();
191
- const pos = await this.currentMousePosition();
192
195
  result.readResults.push({
193
196
  type: "cursor_position",
194
- ...pos
197
+ ...await this.currentMousePosition()
195
198
  });
196
- continue;
197
- }
198
- if (type === "goto") {
199
- const url = normalizeGotoUrl(action.url) ?? "";
199
+ break;
200
+ case "goto":
200
201
  pending.push(keypress(["Control", "l"]), {
201
202
  type: "type_text",
202
- type_text: { text: url }
203
+ type_text: { text: normalizeGotoUrl(action.url) ?? "" }
203
204
  }, keypress(["Enter"]));
204
- continue;
205
- }
206
- if (type === "back") {
205
+ break;
206
+ case "back":
207
207
  pending.push(keypress(["Alt", "Left"]));
208
- continue;
209
- }
210
- if (type === "forward") {
208
+ break;
209
+ case "forward":
211
210
  pending.push(keypress(["Alt", "Right"]));
212
- continue;
213
- }
214
- pending.push(toSdkAction(type, action, this.coordinateSystem, this.viewport));
211
+ break;
212
+ default:
213
+ pending.push(this.toSdkAction(action));
214
+ break;
215
215
  }
216
216
  await flush();
217
217
  return result;
218
218
  }
219
- async runKernelBatch(actions) {
220
- await this.client.browsers.computer.batch(this.sessionId, { actions });
221
- }
222
- };
223
- function toSdkAction(type, action, coordinateSystem, viewport) {
224
- switch (type) {
225
- case "click": {
226
- const clickHoldKeys = readHoldKeys(action.hold_keys);
227
- const point = toViewportPoint(action, coordinateSystem, viewport);
228
- return {
229
- type: "click_mouse",
230
- click_mouse: {
231
- x: point.x,
232
- y: point.y,
233
- button: clickMouseButtonOr(action.button, "left"),
234
- ...clickHoldKeys.length > 0 ? { hold_keys: clickHoldKeys } : {}
235
- }
236
- };
237
- }
238
- case "double_click": {
239
- const doubleClickHoldKeys = readHoldKeys(action.hold_keys);
240
- const point = toViewportPoint(action, coordinateSystem, viewport);
241
- return {
242
- type: "click_mouse",
243
- click_mouse: {
244
- x: point.x,
245
- y: point.y,
246
- num_clicks: 2,
247
- ...doubleClickHoldKeys.length > 0 ? { hold_keys: doubleClickHoldKeys } : {}
248
- }
249
- };
250
- }
251
- case "mouse_down":
252
- case "mouse_up": {
253
- const mouseHoldKeys = readHoldKeys(action.hold_keys);
254
- const point = toViewportPoint(action, coordinateSystem, viewport);
255
- return {
256
- type: "click_mouse",
257
- click_mouse: {
258
- x: point.x,
259
- y: point.y,
260
- button: clickMouseButtonOr(action.button, "left"),
261
- click_type: type === "mouse_down" ? "down" : "up",
262
- ...mouseHoldKeys.length > 0 ? { hold_keys: mouseHoldKeys } : {}
263
- }
264
- };
219
+ toSdkAction(action) {
220
+ switch (action.type) {
221
+ case "click": return this.clickAction(action, { button: mouseButton(action.button) });
222
+ case "double_click": return this.clickAction(action, { num_clicks: 2 });
223
+ case "mouse_down": return this.clickAction(action, {
224
+ button: mouseButton(action.button),
225
+ click_type: "down"
226
+ });
227
+ case "mouse_up": return this.clickAction(action, {
228
+ button: mouseButton(action.button),
229
+ click_type: "up"
230
+ });
231
+ case "type": return typeText(action);
232
+ case "keypress": return keypress(action.keys, action.duration);
233
+ case "scroll": return this.scrollAction(action);
234
+ case "move": return this.moveAction(action);
235
+ case "drag": return this.dragAction(action);
236
+ case "wait": return waitAction(action);
237
+ default: return unreachable(action);
265
238
  }
266
- case "type": return {
267
- type: "type_text",
268
- type_text: { text: typeof action.text === "string" ? action.text : "" }
239
+ }
240
+ clickAction(action, extra) {
241
+ const point = this.toViewportPoint(action.x, action.y);
242
+ return {
243
+ type: "click_mouse",
244
+ click_mouse: {
245
+ x: point.x,
246
+ y: point.y,
247
+ ...extra,
248
+ ...holdKeys(action.hold_keys)
249
+ }
269
250
  };
270
- case "keypress": return keypress(toStringArray(action.keys), action.duration);
271
- case "scroll": {
272
- const scrollHoldKeys = readHoldKeys(action.hold_keys);
273
- const point = toViewportPoint(action, coordinateSystem, viewport);
274
- return {
275
- type: "scroll",
276
- scroll: {
277
- x: point.x,
278
- y: point.y,
279
- delta_x: toInt(action.scroll_x),
280
- delta_y: toInt(action.scroll_y),
281
- ...scrollHoldKeys.length > 0 ? { hold_keys: scrollHoldKeys } : {}
282
- }
283
- };
284
- }
285
- case "move": {
286
- const moveHoldKeys = readHoldKeys(action.hold_keys);
287
- const point = toViewportPoint(action, coordinateSystem, viewport);
288
- return {
289
- type: "move_mouse",
290
- move_mouse: {
291
- x: point.x,
292
- y: point.y,
293
- ...moveHoldKeys.length > 0 ? { hold_keys: moveHoldKeys } : {}
294
- }
295
- };
296
- }
297
- case "drag": {
298
- const dragHoldKeys = readHoldKeys(action.hold_keys);
299
- return {
300
- type: "drag_mouse",
301
- drag_mouse: {
302
- path: toPath(action.path, coordinateSystem, viewport),
303
- button: dragMouseButtonOr(action.button, "left"),
304
- ...dragHoldKeys.length > 0 ? { hold_keys: dragHoldKeys } : {}
305
- }
306
- };
307
- }
308
- case "wait": return {
309
- type: "sleep",
310
- sleep: { duration_ms: typeof action.ms === "number" ? Math.trunc(action.ms) : 1e3 }
251
+ }
252
+ scrollAction(action) {
253
+ const point = this.toViewportPoint(action.x ?? 0, action.y ?? 0);
254
+ return {
255
+ type: "scroll",
256
+ scroll: {
257
+ x: point.x,
258
+ y: point.y,
259
+ delta_x: Math.trunc(action.scroll_x ?? 0),
260
+ delta_y: Math.trunc(action.scroll_y ?? 0),
261
+ ...holdKeys(action.hold_keys)
262
+ }
311
263
  };
312
- default: throw new Error(`unknown computer action type: ${type}`);
313
264
  }
314
- }
315
- function toInt(value) {
316
- if (typeof value === "number" && Number.isFinite(value)) return Math.trunc(value);
317
- if (typeof value === "string" && value.trim()) {
318
- const n = Number(value);
319
- if (Number.isFinite(n)) return Math.trunc(n);
265
+ moveAction(action) {
266
+ const point = this.toViewportPoint(action.x, action.y);
267
+ return {
268
+ type: "move_mouse",
269
+ move_mouse: {
270
+ x: point.x,
271
+ y: point.y
272
+ }
273
+ };
320
274
  }
321
- return 0;
322
- }
323
- function stringOr(value, fallback) {
324
- return typeof value === "string" && value.length > 0 ? value : fallback;
275
+ dragAction(action) {
276
+ return {
277
+ type: "drag_mouse",
278
+ drag_mouse: {
279
+ path: action.path.map((point) => {
280
+ const transformed = this.toViewportPoint(point.x, point.y);
281
+ return [transformed.x, transformed.y];
282
+ }),
283
+ button: dragButton(action.button),
284
+ ...holdKeys(action.hold_keys)
285
+ }
286
+ };
287
+ }
288
+ toViewportPoint(x, y) {
289
+ if (this.coordinateSystem.type === "pixel") return {
290
+ x: Math.trunc(x),
291
+ y: Math.trunc(y)
292
+ };
293
+ const [min, max] = this.coordinateSystem.range;
294
+ const scale = max - min;
295
+ if (scale <= 0) return {
296
+ x: Math.trunc(x),
297
+ y: Math.trunc(y)
298
+ };
299
+ return {
300
+ x: clamp(Math.round((x - min) / scale * this.viewport.width), 0, this.viewport.width - 1),
301
+ y: clamp(Math.round((y - min) / scale * this.viewport.height), 0, this.viewport.height - 1)
302
+ };
303
+ }
304
+ async runKernelBatch(actions) {
305
+ await this.client.browsers.computer.batch(this.sessionId, { actions });
306
+ }
307
+ };
308
+ const PLAYWRIGHT_MAX_TIMEOUT_SEC = 300;
309
+ const CLICK_BUTTONS = new Set([
310
+ "left",
311
+ "right",
312
+ "middle",
313
+ "back",
314
+ "forward"
315
+ ]);
316
+ const DRAG_BUTTONS = new Set([
317
+ "left",
318
+ "right",
319
+ "middle"
320
+ ]);
321
+ function mouseButton(value) {
322
+ return value !== void 0 && CLICK_BUTTONS.has(value) ? value : "left";
325
323
  }
326
- function clickMouseButtonOr(value, fallback) {
327
- const candidate = stringOr(value, fallback);
328
- if (candidate === "left" || candidate === "right" || candidate === "middle" || candidate === "back" || candidate === "forward") return candidate;
329
- return fallback;
324
+ function dragButton(value) {
325
+ return value !== void 0 && DRAG_BUTTONS.has(value) ? value : "left";
330
326
  }
331
- function dragMouseButtonOr(value, fallback) {
332
- const candidate = stringOr(value, fallback);
333
- if (candidate === "left" || candidate === "right" || candidate === "middle") return candidate;
334
- return fallback;
327
+ function typeText(action) {
328
+ return {
329
+ type: "type_text",
330
+ type_text: { text: action.text }
331
+ };
335
332
  }
336
- function toStringArray(value) {
337
- return Array.isArray(value) ? value.filter((item) => typeof item === "string") : [];
333
+ function waitAction(action) {
334
+ return {
335
+ type: "sleep",
336
+ sleep: { duration_ms: Math.trunc(action.ms ?? 1e3) }
337
+ };
338
338
  }
339
- function readHoldKeys(value) {
340
- return toStringArray(value).map(normalizeKernelKey);
339
+ function holdKeys(keys) {
340
+ if (!keys || keys.length === 0) return {};
341
+ return { hold_keys: keys.map(normalizeKernelKey) };
341
342
  }
342
- function keypress(keys, duration = void 0) {
343
+ function keypress(keys, duration) {
343
344
  const translated = keys.flatMap(normalizeKernelKeyCombo);
344
345
  const pressedKeys = translated.filter((key) => !isKernelModifierKey(key));
345
- const holdKeys = pressedKeys.length > 0 ? translated.filter(isKernelModifierKey) : translated.slice(0, -1);
346
+ const heldKeys = pressedKeys.length > 0 ? translated.filter(isKernelModifierKey) : translated.slice(0, -1);
346
347
  return {
347
348
  type: "press_key",
348
349
  press_key: {
349
350
  keys: pressedKeys.length > 0 ? pressedKeys : translated.slice(-1),
350
- ...holdKeys.length > 0 ? { hold_keys: holdKeys } : {},
351
+ ...heldKeys.length > 0 ? { hold_keys: heldKeys } : {},
351
352
  ...typeof duration === "number" && Number.isFinite(duration) && duration > 0 ? { duration: Math.trunc(duration) } : {}
352
353
  }
353
354
  };
354
355
  }
355
- function toPath(value, coordinateSystem = { type: "pixel" }, viewport = {
356
- width: 1920,
357
- height: 1080
358
- }) {
359
- if (!Array.isArray(value)) return [];
360
- return value.map((point) => toPathPoint(point, coordinateSystem, viewport));
361
- }
362
- function toPathPoint(value, coordinateSystem, viewport) {
363
- if (Array.isArray(value)) {
364
- const point = transformPoint(toInt(value[0]), toInt(value[1]), coordinateSystem, viewport);
365
- return [point.x, point.y];
366
- }
367
- if (value && typeof value === "object") {
368
- const point = value;
369
- const transformed = transformPoint(toInt(point.x), toInt(point.y), coordinateSystem, viewport);
370
- return [transformed.x, transformed.y];
371
- }
372
- return [0, 0];
373
- }
374
- function toViewportPoint(action, coordinateSystem, viewport) {
375
- return transformPoint(toInt(action.x), toInt(action.y), coordinateSystem, viewport);
376
- }
377
- function transformPoint(x, y, coordinateSystem, viewport) {
378
- if (coordinateSystem.type === "pixel") return {
379
- x,
380
- y
381
- };
382
- const [min, max] = coordinateSystem.range;
383
- const scale = max - min;
384
- if (scale <= 0) return {
385
- x,
386
- y
387
- };
388
- return {
389
- x: clamp(Math.round((x - min) / scale * viewport.width), 0, viewport.width - 1),
390
- y: clamp(Math.round((y - min) / scale * viewport.height), 0, viewport.height - 1)
391
- };
392
- }
393
356
  function clamp(value, min, max) {
394
357
  return Math.max(min, Math.min(max, value));
395
358
  }
359
+ function unreachable(action) {
360
+ throw new Error(`unknown computer action type: ${JSON.stringify(action)}`);
361
+ }
396
362
  //#endregion
397
363
  //#region src/tools.ts
398
364
  function createCuaComputerTools(args) {
399
- const translator = new InternalComputerTranslator(args);
400
- return withNavigationTool(args).map((executor) => createExecutorTool(executor, translator));
365
+ return buildCuaComputerTools(args, new InternalComputerTranslator(args));
366
+ }
367
+ /** Build executor tools against an existing translator (internal; not part of the package surface). */
368
+ function buildCuaComputerTools(args, translator) {
369
+ return withExtraTools(args).map((executor) => createExecutorTool(executor, translator));
401
370
  }
402
- function withNavigationTool(args) {
371
+ function withExtraTools(args) {
403
372
  const executors = [...args.toolExecutors];
404
373
  const existing = new Set(executors.map((executor) => executor.definition.name));
405
- if (args.computerUseExtra && !existing.has(CUA_NAVIGATION_TOOL_NAME)) {
406
- const definition = createCuaNavigationToolDefinition();
407
- executors.push({
408
- kind: "navigation",
409
- definition
410
- });
411
- }
374
+ if (args.computerUseExtra && !existing.has(CUA_NAVIGATION_TOOL_NAME)) executors.push({
375
+ kind: "navigation",
376
+ definition: createCuaNavigationToolDefinition()
377
+ });
378
+ if (args.playwright && !existing.has(CUA_PLAYWRIGHT_TOOL_NAME)) executors.push({
379
+ kind: "playwright",
380
+ definition: createCuaPlaywrightToolDefinition()
381
+ });
412
382
  return executors;
413
383
  }
414
384
  function createExecutorTool(executor, translator) {
@@ -422,6 +392,16 @@ function createExecutorTool(executor, translator) {
422
392
  return executeNavigationTool(translator, asNavigationInput(params));
423
393
  }
424
394
  };
395
+ if (isPlaywrightExecutor(executor)) return {
396
+ name: definition.name,
397
+ label: definition.name,
398
+ description: definition.description,
399
+ parameters: definition.parameters,
400
+ executionMode: "sequential",
401
+ async execute(_toolCallId, params) {
402
+ return executePlaywrightTool(translator, asPlaywrightInput(params));
403
+ }
404
+ };
425
405
  return {
426
406
  name: definition.name,
427
407
  label: definition.name,
@@ -436,6 +416,9 @@ function createExecutorTool(executor, translator) {
436
416
  function isNavigationExecutor(executor) {
437
417
  return "kind" in executor && executor.kind === "navigation";
438
418
  }
419
+ function isPlaywrightExecutor(executor) {
420
+ return "kind" in executor && executor.kind === "playwright";
421
+ }
439
422
  async function executeBatchTool(translator, params) {
440
423
  const content = [];
441
424
  const readResults = [];
@@ -502,10 +485,11 @@ async function executeNavigationTool(translator, params) {
502
485
  if (action === "url") {
503
486
  url = await translator.currentUrl();
504
487
  statusText = `Current URL: ${url}`;
505
- } else await translator.executeBatch([{
506
- type: action,
507
- url: params.url
488
+ } else if (action === "goto") await translator.executeBatch([{
489
+ type: "goto",
490
+ url: params.url ?? ""
508
491
  }]);
492
+ else await translator.executeBatch([{ type: action }]);
509
493
  const screenshot = await translator.screenshot();
510
494
  return {
511
495
  content: [{
@@ -526,6 +510,50 @@ async function executeNavigationTool(translator, params) {
526
510
  throw new Error(`${action} failed: ${errorMessage(err)}`, { cause: err });
527
511
  }
528
512
  }
513
+ async function executePlaywrightTool(translator, params) {
514
+ try {
515
+ const execution = await translator.executePlaywright(params.code, params.timeout_sec);
516
+ const content = [];
517
+ if (execution.result !== void 0) content.push({
518
+ type: "text",
519
+ text: `result: ${formatPlaywrightResult(execution.result)}`
520
+ });
521
+ if (execution.stdout?.trim()) content.push({
522
+ type: "text",
523
+ text: `stdout:\n${execution.stdout.trimEnd()}`
524
+ });
525
+ if (execution.stderr?.trim()) content.push({
526
+ type: "text",
527
+ text: `stderr:\n${execution.stderr.trimEnd()}`
528
+ });
529
+ if (!execution.success) content.push({
530
+ type: "text",
531
+ text: `error: ${execution.error ?? "playwright execution reported failure"}`
532
+ });
533
+ const statusText = execution.success ? "Playwright executed successfully." : `Playwright execution failed: ${execution.error ?? "unknown error"}`;
534
+ if (content.length === 0) content.push({
535
+ type: "text",
536
+ text: statusText
537
+ });
538
+ const details = {
539
+ success: execution.success,
540
+ statusText
541
+ };
542
+ if (execution.result !== void 0) details.result = execution.result;
543
+ if (execution.stdout) details.stdout = execution.stdout;
544
+ if (execution.stderr) details.stderr = execution.stderr;
545
+ if (execution.error) details.error = execution.error;
546
+ return {
547
+ content,
548
+ details
549
+ };
550
+ } catch (err) {
551
+ throw new Error(`playwright_execute failed: ${errorMessage(err)}`, { cause: err });
552
+ }
553
+ }
554
+ function formatPlaywrightResult(result) {
555
+ return typeof result === "string" ? result : JSON.stringify(result);
556
+ }
529
557
  function errorMessage(err) {
530
558
  return err instanceof Error ? err.message : String(err);
531
559
  }
@@ -533,6 +561,10 @@ function asNavigationInput(value) {
533
561
  if (value && typeof value === "object" && typeof value.action === "string") return value;
534
562
  throw new Error("invalid computer_use_extra parameters");
535
563
  }
564
+ function asPlaywrightInput(value) {
565
+ if (value && typeof value === "object" && typeof value.code === "string") return value;
566
+ throw new Error("invalid playwright_execute parameters");
567
+ }
536
568
  //#endregion
537
569
  //#region src/agent.ts
538
570
  /**
@@ -561,14 +593,11 @@ var CuaRuntimeController = class {
561
593
  this.translator = this.createTranslator();
562
594
  }
563
595
  tools() {
564
- return [...createCuaComputerTools({
565
- browser: this.options.browser,
566
- client: this.options.client,
596
+ return [...buildCuaComputerTools({
567
597
  toolExecutors: this.runtimeSpec.toolExecutors,
568
- coordinateSystem: this.runtimeSpec.coordinateSystem,
569
- screenshot: this.runtimeSpec.screenshot,
570
- computerUseExtra: this.options.computerUseExtra
571
- }), ...this.options.extraTools ?? []];
598
+ computerUseExtra: this.options.computerUseExtra,
599
+ playwright: this.options.playwright
600
+ }, this.translator), ...this.options.extraTools ?? []];
572
601
  }
573
602
  onPayload() {
574
603
  const runtimeSpec = this.runtimeSpec;
@@ -578,7 +607,11 @@ var CuaRuntimeController = class {
578
607
  }) : void 0, this.options.onPayload);
579
608
  }
580
609
  keepToolNames() {
581
- return [...(this.options.extraTools ?? []).map((tool) => tool.name), ...this.options.computerUseExtra ? [CUA_NAVIGATION_TOOL_NAME] : []];
610
+ return [
611
+ ...(this.options.extraTools ?? []).map((tool) => tool.name),
612
+ ...this.options.computerUseExtra ? [CUA_NAVIGATION_TOOL_NAME] : [],
613
+ ...this.options.playwright ? [CUA_PLAYWRIGHT_TOOL_NAME] : []
614
+ ];
582
615
  }
583
616
  createTranslator() {
584
617
  return new InternalComputerTranslator({
@@ -605,15 +638,18 @@ async function getCuaEnvApiKeyAndHeaders(model) {
605
638
  var CuaAgent = class extends Agent {
606
639
  runtime;
607
640
  ownsSystemPrompt;
641
+ runtimeDirty = false;
608
642
  stateProxy;
643
+ stateProxyTarget;
609
644
  constructor(options) {
610
- const { browser, client, initialState, onPayload, streamFn, prepareNextTurn, extraTools, computerUseExtra, ...agentOptions } = options;
645
+ const { browser, client, initialState, onPayload, streamFn, prepareNextTurn, extraTools, computerUseExtra, playwright, ...agentOptions } = options;
611
646
  const runtime = new CuaRuntimeController({
612
647
  browser,
613
648
  client,
614
649
  model: initialState.model,
615
650
  extraTools,
616
651
  computerUseExtra,
652
+ playwright,
617
653
  onPayload
618
654
  });
619
655
  const wrappedStreamFn = (model, context, streamOptions) => {
@@ -638,13 +674,17 @@ var CuaAgent = class extends Agent {
638
674
  this.runtime = runtime;
639
675
  this.ownsSystemPrompt = initialState.systemPrompt === void 0;
640
676
  /**
641
- * pi calls `prepareNextTurn` between provider requests. Wrapping it lets CUA
642
- * honor any user-provided turn update while also refreshing provider-specific
643
- * defaults if that update changes the model.
677
+ * pi's loop only re-reads model/tools/prompt between provider requests
678
+ * through `prepareNextTurn`. The wrapper stays pass-through (returning
679
+ * `undefined`, i.e. stock pi behavior) until either the user hook returns
680
+ * an update or a mid-run model assignment marks the CUA runtime dirty —
681
+ * only then is a turn update built from current state.
644
682
  */
645
683
  this.prepareNextTurn = async (signal) => {
646
684
  const update = await prepareNextTurn?.(signal);
647
685
  if (update?.model) this.applyRuntime(update.model);
686
+ if (!update && !this.runtimeDirty) return void 0;
687
+ this.runtimeDirty = false;
648
688
  const state = super.state;
649
689
  const context = update?.context ?? {
650
690
  systemPrompt: state.systemPrompt,
@@ -668,17 +708,22 @@ var CuaAgent = class extends Agent {
668
708
  * and payload hooks for the selected provider.
669
709
  */
670
710
  get state() {
671
- if (!this.stateProxy) this.stateProxy = new Proxy(super.state, { set: (target, prop, value, receiver) => {
672
- if (prop === "model") {
673
- this.applyRuntime(value);
674
- return true;
675
- }
676
- return Reflect.set(target, prop, value, receiver);
677
- } });
711
+ const target = super.state;
712
+ if (!this.stateProxy || this.stateProxyTarget !== target) {
713
+ this.stateProxyTarget = target;
714
+ this.stateProxy = new Proxy(target, { set: (proxied, prop, value, receiver) => {
715
+ if (prop === "model") {
716
+ this.applyRuntime(value);
717
+ return true;
718
+ }
719
+ return Reflect.set(proxied, prop, value, receiver);
720
+ } });
721
+ }
678
722
  return this.stateProxy;
679
723
  }
680
724
  applyRuntime(model) {
681
725
  this.runtime.setModel(model);
726
+ this.runtimeDirty = true;
682
727
  const state = super.state;
683
728
  state.model = this.runtime.model;
684
729
  state.tools = this.runtime.tools();
@@ -697,13 +742,14 @@ var CuaAgentHarness = class extends AgentHarness {
697
742
  runtime;
698
743
  requestedActiveToolNames;
699
744
  constructor(options) {
700
- const { browser, client, model, extraTools, computerUseExtra, systemPrompt, getApiKeyAndHeaders, onPayload, activeToolNames, ...harnessOptions } = options;
745
+ const { browser, client, model, extraTools, computerUseExtra, playwright, systemPrompt, getApiKeyAndHeaders, onPayload, activeToolNames, ...harnessOptions } = options;
701
746
  const runtime = new CuaRuntimeController({
702
747
  browser,
703
748
  client,
704
749
  model,
705
750
  extraTools,
706
751
  computerUseExtra,
752
+ playwright,
707
753
  onPayload
708
754
  });
709
755
  const resolvedTools = runtime.tools();
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@onkernel/cua-agent",
3
- "version": "0.3.2",
3
+ "version": "0.3.4",
4
4
  "description": "Kernel browser computer-use Agent and AgentHarness classes built on pi-agent-core",
5
5
  "license": "MIT",
6
6
  "type": "module",
@@ -42,7 +42,7 @@
42
42
  "dependencies": {
43
43
  "@earendil-works/pi-agent-core": "0.79.1",
44
44
  "@earendil-works/pi-ai": "0.79.1",
45
- "@onkernel/cua-ai": "0.2.2",
45
+ "@onkernel/cua-ai": "0.3.1",
46
46
  "@onkernel/sdk": "0.49.0",
47
47
  "sharp": "^0.34.5"
48
48
  },