@onkernel/cua-agent 0.3.2 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +24 -0
- package/README.md +45 -18
- package/dist/index.d.ts +33 -4
- package/dist/index.js +262 -216
- package/package.json +2 -2
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,29 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## 0.3.4 - 2026-06-23
|
|
4
|
+
|
|
5
|
+
- Add an opt-in `playwright` option to `CuaAgent` and `CuaAgentHarness` that
|
|
6
|
+
exposes a `playwright_execute` tool, running Playwright/TypeScript against
|
|
7
|
+
the live browser session via the Kernel SDK. Results, stdout, and stderr
|
|
8
|
+
come back as tool content; SDK-reported failures surface as content rather
|
|
9
|
+
than throwing. Adds the `PlaywrightDetails` export.
|
|
10
|
+
|
|
11
|
+
## 0.3.3 - 2026-06-12
|
|
12
|
+
|
|
13
|
+
- The action translator now consumes the canonical `CuaAction` union with an
|
|
14
|
+
exhaustive switch. Malformed action shapes fail loudly instead of silently
|
|
15
|
+
coercing (previously e.g. a click at 0,0); the documented mouse-button
|
|
16
|
+
coercion to `"left"` is unchanged.
|
|
17
|
+
- `prepareNextTurn` no longer rebuilds the turn context on every turn: it
|
|
18
|
+
keeps stock pi behavior until a user hook returns an update or a mid-run
|
|
19
|
+
model assignment requires a refresh.
|
|
20
|
+
- One translator instance per runtime is shared between the executor tools
|
|
21
|
+
and the provider screenshot capability.
|
|
22
|
+
- The `CuaAgentHarness` README quickstart showcases session-backed turns and
|
|
23
|
+
mid-session model switching; `computerUseExtra` is documented with its
|
|
24
|
+
rationale.
|
|
25
|
+
- Update the `@onkernel/cua-ai` dependency to 0.3.0.
|
|
26
|
+
|
|
3
27
|
## 0.3.2 - 2026-06-11
|
|
4
28
|
|
|
5
29
|
- Update the `@onkernel/cua-ai` dependency to 0.2.2.
|
package/README.md
CHANGED
|
@@ -37,11 +37,17 @@ await agent.prompt("Open news.ycombinator.com and summarize the top story.");
|
|
|
37
37
|
|
|
38
38
|
## Quick Start (`CuaAgentHarness`)
|
|
39
39
|
|
|
40
|
+
`prompt()` returns the turn's final assistant message, and every turn is
|
|
41
|
+
persisted to the session — later prompts see the full transcript. Runtime
|
|
42
|
+
config like the model can change between turns (or even mid-turn, applying at
|
|
43
|
+
the next provider request):
|
|
44
|
+
|
|
40
45
|
```ts
|
|
41
46
|
import { CuaAgentHarness, InMemorySessionRepo, NodeExecutionEnv } from "@onkernel/cua-agent";
|
|
47
|
+
import type { AssistantMessage } from "@onkernel/cua-ai";
|
|
42
48
|
|
|
43
49
|
const sessionRepo = new InMemorySessionRepo();
|
|
44
|
-
const session = await sessionRepo.create({ id: "
|
|
50
|
+
const session = await sessionRepo.create({ id: "research" });
|
|
45
51
|
|
|
46
52
|
const harness = new CuaAgentHarness({
|
|
47
53
|
browser,
|
|
@@ -51,22 +57,26 @@ const harness = new CuaAgentHarness({
|
|
|
51
57
|
session,
|
|
52
58
|
});
|
|
53
59
|
|
|
54
|
-
const
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
console.log(
|
|
67
|
-
console.log("assistant text:", assistantText || "(no text)");
|
|
60
|
+
const textOf = (message: AssistantMessage) =>
|
|
61
|
+
message.content.flatMap((block) => (block.type === "text" ? [block.text] : [])).join("").trim();
|
|
62
|
+
|
|
63
|
+
// Turn 1: a session-backed prompt.
|
|
64
|
+
const first = await harness.prompt("Open example.com and describe what you see.");
|
|
65
|
+
console.log(textOf(first));
|
|
66
|
+
|
|
67
|
+
// Swap providers mid-session; CUA tools and the default prompt refresh to match.
|
|
68
|
+
await harness.setModel("anthropic:claude-opus-4-7");
|
|
69
|
+
|
|
70
|
+
// Turn 2 continues the same transcript on the new model.
|
|
71
|
+
const second = await harness.prompt("Open the most relevant link from what you found.");
|
|
72
|
+
console.log(textOf(second));
|
|
68
73
|
```
|
|
69
74
|
|
|
75
|
+
While a turn is running, `steer()` injects course corrections, `followUp()`
|
|
76
|
+
queues the next instruction, and `subscribe()` streams the underlying agent
|
|
77
|
+
events. `compact()` and session branching are available for long-running
|
|
78
|
+
transcripts — see the pi-agent-core docs for the full harness lifecycle.
|
|
79
|
+
|
|
70
80
|
Use `CuaAgent` when you want direct pi `Agent` control: raw message state,
|
|
71
81
|
lifecycle events, custom streaming, and explicit prompt/continue/queue control.
|
|
72
82
|
Reach for the harness shape when you want an app layer around the loop:
|
|
@@ -88,6 +98,8 @@ Both classes mirror pi constructor shapes and behavior, with minimal additions:
|
|
|
88
98
|
- CUA model refs (`"provider:model"`) accepted where pi expects a concrete model
|
|
89
99
|
- `extraTools` to add your own pi tools alongside the built-in browser tools
|
|
90
100
|
- `computerUseExtra: true` to let the model use a small navigation helper
|
|
101
|
+
- `playwright: true` to let the model run Playwright/TypeScript against the
|
|
102
|
+
live browser session
|
|
91
103
|
|
|
92
104
|
If auth callbacks are omitted, both classes default to CUA env var conventions:
|
|
93
105
|
- OpenAI: `OPENAI_API_KEY`
|
|
@@ -108,9 +120,24 @@ computer-use tools. This is useful when the model needs to call
|
|
|
108
120
|
application-specific code, such as looking up a record, writing a database row,
|
|
109
121
|
or handing off to another service while it also controls the browser.
|
|
110
122
|
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
`
|
|
123
|
+
Not every provider's native computer-use vocabulary includes browser
|
|
124
|
+
navigation — some models can click and type but have no direct way to open a
|
|
125
|
+
URL or go back. `computerUseExtra: true` adds `computer_use_extra`, a
|
|
126
|
+
provider-neutral escape hatch exposing `goto`, `back`, `forward`, and `url`
|
|
127
|
+
so navigation works uniformly regardless of which model is driving.
|
|
128
|
+
|
|
129
|
+
Some steps are awkward as raw pointer/keyboard actions: precise DOM reads,
|
|
130
|
+
form fills, data extraction, or waiting on a specific selector.
|
|
131
|
+
`playwright: true` adds `playwright_execute`, which runs Playwright/TypeScript
|
|
132
|
+
directly against the live browser session. `page`, `context`, and `browser`
|
|
133
|
+
are in scope and the code may `return` a JSON-serializable value. Each call
|
|
134
|
+
runs in a fresh JS context (locals don't persist across calls) but the
|
|
135
|
+
browser session does carry over. No screenshot is returned automatically;
|
|
136
|
+
request one on a follow-up turn when the model needs to see the page.
|
|
137
|
+
Playwright-level failures come back as tool content (so the model can adapt)
|
|
138
|
+
rather than thrown errors. Verified end-to-end
|
|
139
|
+
against Anthropic, Tzafon, and Yutori CUA models; OpenAI and Google are
|
|
140
|
+
unit-tested.
|
|
114
141
|
|
|
115
142
|
### Model Switching
|
|
116
143
|
|
package/dist/index.d.ts
CHANGED
|
@@ -16,6 +16,7 @@ interface ComputerToolOptions {
|
|
|
16
16
|
coordinateSystem?: ComputerToolCoordinateSystem;
|
|
17
17
|
screenshot?: CuaScreenshotSpec;
|
|
18
18
|
computerUseExtra?: boolean;
|
|
19
|
+
playwright?: boolean;
|
|
19
20
|
}
|
|
20
21
|
interface BatchDetails {
|
|
21
22
|
statusText: string;
|
|
@@ -36,10 +37,34 @@ interface NavigationDetails {
|
|
|
36
37
|
statusText: string;
|
|
37
38
|
url?: string;
|
|
38
39
|
}
|
|
40
|
+
/**
|
|
41
|
+
* Structured details for a `playwright_execute` tool result. Library
|
|
42
|
+
* consumers can read these directly instead of re-parsing the model-facing
|
|
43
|
+
* tool content blocks.
|
|
44
|
+
*
|
|
45
|
+
* - `success` — whether the Playwright code itself completed without error.
|
|
46
|
+
* A `false` value means the code threw or the SDK reported failure; in
|
|
47
|
+
* that case the failure is also surfaced as tool content for the model.
|
|
48
|
+
* - `statusText` — short human-readable status (success or failure summary).
|
|
49
|
+
* - `result` — present only when the code returned a JSON-serializable value.
|
|
50
|
+
* - `stdout`/`stderr` — raw daemon output, present whenever the daemon
|
|
51
|
+
* reported a non-empty value on that stream (may be whitespace-only).
|
|
52
|
+
* - `error` — the daemon's error message, copied through whenever the daemon
|
|
53
|
+
* reported one (normally only when `success` is `false`).
|
|
54
|
+
*/
|
|
55
|
+
interface PlaywrightDetails {
|
|
56
|
+
success: boolean;
|
|
57
|
+
statusText: string;
|
|
58
|
+
result?: unknown;
|
|
59
|
+
stdout?: string;
|
|
60
|
+
stderr?: string;
|
|
61
|
+
error?: string;
|
|
62
|
+
}
|
|
39
63
|
type BatchTool = AgentTool<TSchema, BatchDetails>;
|
|
40
64
|
type NavigationTool = AgentTool<TSchema, NavigationDetails>;
|
|
65
|
+
type PlaywrightTool = AgentTool<TSchema, PlaywrightDetails>;
|
|
41
66
|
type ActionTool = AgentTool<TSchema, BatchDetails>;
|
|
42
|
-
type CuaExecutorTool = BatchTool | NavigationTool | ActionTool;
|
|
67
|
+
type CuaExecutorTool = BatchTool | NavigationTool | PlaywrightTool | ActionTool;
|
|
43
68
|
declare function createCuaComputerTools(args: ComputerToolOptions): CuaExecutorTool[];
|
|
44
69
|
//#endregion
|
|
45
70
|
//#region src/agent.d.ts
|
|
@@ -74,7 +99,8 @@ type CuaAgentOptions = Omit<AgentOptions, "initialState"> & {
|
|
|
74
99
|
client: Kernel; /** Initial pi state plus a CUA-aware model value. */
|
|
75
100
|
initialState: CuaAgentInitialState; /** Add your own pi tools alongside the built-in browser tools. */
|
|
76
101
|
extraTools?: AgentTool[]; /** Expose a helper for browser navigation and URL reads. */
|
|
77
|
-
computerUseExtra?: boolean;
|
|
102
|
+
computerUseExtra?: boolean; /** Expose a tool that runs Playwright code against the browser session. */
|
|
103
|
+
playwright?: boolean;
|
|
78
104
|
};
|
|
79
105
|
/**
|
|
80
106
|
* Constructor options for {@link CuaAgentHarness}.
|
|
@@ -88,7 +114,8 @@ type CuaAgentHarnessOptions<TSkill extends Skill = Skill, TPromptTemplate extend
|
|
|
88
114
|
client: Kernel; /** Model used by the harness. CUA refs are resolved before pi sees the model. */
|
|
89
115
|
model: CuaRuntimeInput; /** Add your own pi tools alongside the built-in browser tools. */
|
|
90
116
|
extraTools?: AgentTool[]; /** Expose a helper for browser navigation and URL reads. */
|
|
91
|
-
computerUseExtra?: boolean; /**
|
|
117
|
+
computerUseExtra?: boolean; /** Expose a tool that runs Playwright code against the browser session. */
|
|
118
|
+
playwright?: boolean; /** Optional payload hook composed after the provider-specific CUA payload hook. */
|
|
92
119
|
onPayload?: SimpleStreamOptions["onPayload"];
|
|
93
120
|
};
|
|
94
121
|
/**
|
|
@@ -102,7 +129,9 @@ type CuaAgentHarnessOptions<TSkill extends Skill = Skill, TPromptTemplate extend
|
|
|
102
129
|
declare class CuaAgent extends Agent {
|
|
103
130
|
private readonly runtime;
|
|
104
131
|
private readonly ownsSystemPrompt;
|
|
132
|
+
private runtimeDirty;
|
|
105
133
|
private stateProxy?;
|
|
134
|
+
private stateProxyTarget?;
|
|
106
135
|
constructor(options: CuaAgentOptions);
|
|
107
136
|
/**
|
|
108
137
|
* Return a state proxy so `agent.state.model = "provider:model"` can behave
|
|
@@ -135,4 +164,4 @@ declare class CuaAgentHarness<TSkill extends Skill = Skill, TPromptTemplate exte
|
|
|
135
164
|
setActiveTools(toolNames: string[]): Promise<void>;
|
|
136
165
|
}
|
|
137
166
|
//#endregion
|
|
138
|
-
export { type BatchDetails, type ComputerToolOptions, CuaAgent, CuaAgentHarness, type CuaAgentHarnessOptions, type CuaAgentOptions, type CuaAgentState, type CuaExecutorTool, type KernelBrowser, type NavigationDetails, NodeExecutionEnv, createCuaComputerTools };
|
|
167
|
+
export { type BatchDetails, type ComputerToolOptions, CuaAgent, CuaAgentHarness, type CuaAgentHarnessOptions, type CuaAgentOptions, type CuaAgentState, type CuaExecutorTool, type KernelBrowser, type NavigationDetails, NodeExecutionEnv, type PlaywrightDetails, createCuaComputerTools };
|
package/dist/index.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { Agent, AgentHarness } from "@earendil-works/pi-agent-core";
|
|
2
2
|
import { NodeExecutionEnv } from "@earendil-works/pi-agent-core/node";
|
|
3
|
-
import { CUA_NAVIGATION_TOOL_NAME, createCuaNavigationToolDefinition, getCuaEnvApiKey, normalizeGotoUrl, resolveCuaRuntimeSpec, streamSimple } from "@onkernel/cua-ai";
|
|
3
|
+
import { CUA_NAVIGATION_TOOL_NAME, CUA_PLAYWRIGHT_TOOL_NAME, createCuaNavigationToolDefinition, createCuaPlaywrightToolDefinition, getCuaEnvApiKey, normalizeGotoUrl, resolveCuaRuntimeSpec, streamSimple } from "@onkernel/cua-ai";
|
|
4
4
|
import sharp from "sharp";
|
|
5
5
|
export * from "@earendil-works/pi-agent-core";
|
|
6
6
|
//#region src/translator/keys.ts
|
|
@@ -156,10 +156,18 @@ var InternalComputerTranslator = class {
|
|
|
156
156
|
async currentMousePosition() {
|
|
157
157
|
const pos = await this.client.browsers.computer.getMousePosition(this.sessionId);
|
|
158
158
|
return {
|
|
159
|
-
x:
|
|
160
|
-
y:
|
|
159
|
+
x: Math.trunc(pos.x),
|
|
160
|
+
y: Math.trunc(pos.y)
|
|
161
161
|
};
|
|
162
162
|
}
|
|
163
|
+
async executePlaywright(code, timeoutSec) {
|
|
164
|
+
const truncated = timeoutSec !== void 0 ? Math.trunc(timeoutSec) : void 0;
|
|
165
|
+
const timeout = truncated !== void 0 && truncated >= 1 ? Math.min(truncated, PLAYWRIGHT_MAX_TIMEOUT_SEC) : void 0;
|
|
166
|
+
return this.client.browsers.playwright.execute(this.sessionId, {
|
|
167
|
+
code,
|
|
168
|
+
...timeout !== void 0 ? { timeout_sec: timeout } : {}
|
|
169
|
+
});
|
|
170
|
+
}
|
|
163
171
|
async executeBatch(actions) {
|
|
164
172
|
const result = { readResults: [] };
|
|
165
173
|
const pending = [];
|
|
@@ -167,248 +175,210 @@ var InternalComputerTranslator = class {
|
|
|
167
175
|
if (pending.length === 0) return;
|
|
168
176
|
await this.runKernelBatch(pending.splice(0));
|
|
169
177
|
};
|
|
170
|
-
for (
|
|
171
|
-
|
|
172
|
-
const type = typeof action.type === "string" ? action.type : "";
|
|
173
|
-
if (type === "screenshot") {
|
|
178
|
+
for (const action of actions) switch (action.type) {
|
|
179
|
+
case "screenshot":
|
|
174
180
|
await flush();
|
|
175
181
|
result.readResults.push({
|
|
176
182
|
type: "screenshot",
|
|
177
183
|
...await this.screenshot()
|
|
178
184
|
});
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
if (type === "url") {
|
|
185
|
+
break;
|
|
186
|
+
case "url":
|
|
182
187
|
await flush();
|
|
183
188
|
result.readResults.push({
|
|
184
189
|
type: "url",
|
|
185
190
|
url: await this.currentUrl()
|
|
186
191
|
});
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
if (type === "cursor_position") {
|
|
192
|
+
break;
|
|
193
|
+
case "cursor_position":
|
|
190
194
|
await flush();
|
|
191
|
-
const pos = await this.currentMousePosition();
|
|
192
195
|
result.readResults.push({
|
|
193
196
|
type: "cursor_position",
|
|
194
|
-
...
|
|
197
|
+
...await this.currentMousePosition()
|
|
195
198
|
});
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
if (type === "goto") {
|
|
199
|
-
const url = normalizeGotoUrl(action.url) ?? "";
|
|
199
|
+
break;
|
|
200
|
+
case "goto":
|
|
200
201
|
pending.push(keypress(["Control", "l"]), {
|
|
201
202
|
type: "type_text",
|
|
202
|
-
type_text: { text: url }
|
|
203
|
+
type_text: { text: normalizeGotoUrl(action.url) ?? "" }
|
|
203
204
|
}, keypress(["Enter"]));
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
if (type === "back") {
|
|
205
|
+
break;
|
|
206
|
+
case "back":
|
|
207
207
|
pending.push(keypress(["Alt", "Left"]));
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
if (type === "forward") {
|
|
208
|
+
break;
|
|
209
|
+
case "forward":
|
|
211
210
|
pending.push(keypress(["Alt", "Right"]));
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
211
|
+
break;
|
|
212
|
+
default:
|
|
213
|
+
pending.push(this.toSdkAction(action));
|
|
214
|
+
break;
|
|
215
215
|
}
|
|
216
216
|
await flush();
|
|
217
217
|
return result;
|
|
218
218
|
}
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
};
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
case "double_click": {
|
|
239
|
-
const doubleClickHoldKeys = readHoldKeys(action.hold_keys);
|
|
240
|
-
const point = toViewportPoint(action, coordinateSystem, viewport);
|
|
241
|
-
return {
|
|
242
|
-
type: "click_mouse",
|
|
243
|
-
click_mouse: {
|
|
244
|
-
x: point.x,
|
|
245
|
-
y: point.y,
|
|
246
|
-
num_clicks: 2,
|
|
247
|
-
...doubleClickHoldKeys.length > 0 ? { hold_keys: doubleClickHoldKeys } : {}
|
|
248
|
-
}
|
|
249
|
-
};
|
|
250
|
-
}
|
|
251
|
-
case "mouse_down":
|
|
252
|
-
case "mouse_up": {
|
|
253
|
-
const mouseHoldKeys = readHoldKeys(action.hold_keys);
|
|
254
|
-
const point = toViewportPoint(action, coordinateSystem, viewport);
|
|
255
|
-
return {
|
|
256
|
-
type: "click_mouse",
|
|
257
|
-
click_mouse: {
|
|
258
|
-
x: point.x,
|
|
259
|
-
y: point.y,
|
|
260
|
-
button: clickMouseButtonOr(action.button, "left"),
|
|
261
|
-
click_type: type === "mouse_down" ? "down" : "up",
|
|
262
|
-
...mouseHoldKeys.length > 0 ? { hold_keys: mouseHoldKeys } : {}
|
|
263
|
-
}
|
|
264
|
-
};
|
|
219
|
+
toSdkAction(action) {
|
|
220
|
+
switch (action.type) {
|
|
221
|
+
case "click": return this.clickAction(action, { button: mouseButton(action.button) });
|
|
222
|
+
case "double_click": return this.clickAction(action, { num_clicks: 2 });
|
|
223
|
+
case "mouse_down": return this.clickAction(action, {
|
|
224
|
+
button: mouseButton(action.button),
|
|
225
|
+
click_type: "down"
|
|
226
|
+
});
|
|
227
|
+
case "mouse_up": return this.clickAction(action, {
|
|
228
|
+
button: mouseButton(action.button),
|
|
229
|
+
click_type: "up"
|
|
230
|
+
});
|
|
231
|
+
case "type": return typeText(action);
|
|
232
|
+
case "keypress": return keypress(action.keys, action.duration);
|
|
233
|
+
case "scroll": return this.scrollAction(action);
|
|
234
|
+
case "move": return this.moveAction(action);
|
|
235
|
+
case "drag": return this.dragAction(action);
|
|
236
|
+
case "wait": return waitAction(action);
|
|
237
|
+
default: return unreachable(action);
|
|
265
238
|
}
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
239
|
+
}
|
|
240
|
+
clickAction(action, extra) {
|
|
241
|
+
const point = this.toViewportPoint(action.x, action.y);
|
|
242
|
+
return {
|
|
243
|
+
type: "click_mouse",
|
|
244
|
+
click_mouse: {
|
|
245
|
+
x: point.x,
|
|
246
|
+
y: point.y,
|
|
247
|
+
...extra,
|
|
248
|
+
...holdKeys(action.hold_keys)
|
|
249
|
+
}
|
|
269
250
|
};
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
}
|
|
283
|
-
};
|
|
284
|
-
}
|
|
285
|
-
case "move": {
|
|
286
|
-
const moveHoldKeys = readHoldKeys(action.hold_keys);
|
|
287
|
-
const point = toViewportPoint(action, coordinateSystem, viewport);
|
|
288
|
-
return {
|
|
289
|
-
type: "move_mouse",
|
|
290
|
-
move_mouse: {
|
|
291
|
-
x: point.x,
|
|
292
|
-
y: point.y,
|
|
293
|
-
...moveHoldKeys.length > 0 ? { hold_keys: moveHoldKeys } : {}
|
|
294
|
-
}
|
|
295
|
-
};
|
|
296
|
-
}
|
|
297
|
-
case "drag": {
|
|
298
|
-
const dragHoldKeys = readHoldKeys(action.hold_keys);
|
|
299
|
-
return {
|
|
300
|
-
type: "drag_mouse",
|
|
301
|
-
drag_mouse: {
|
|
302
|
-
path: toPath(action.path, coordinateSystem, viewport),
|
|
303
|
-
button: dragMouseButtonOr(action.button, "left"),
|
|
304
|
-
...dragHoldKeys.length > 0 ? { hold_keys: dragHoldKeys } : {}
|
|
305
|
-
}
|
|
306
|
-
};
|
|
307
|
-
}
|
|
308
|
-
case "wait": return {
|
|
309
|
-
type: "sleep",
|
|
310
|
-
sleep: { duration_ms: typeof action.ms === "number" ? Math.trunc(action.ms) : 1e3 }
|
|
251
|
+
}
|
|
252
|
+
scrollAction(action) {
|
|
253
|
+
const point = this.toViewportPoint(action.x ?? 0, action.y ?? 0);
|
|
254
|
+
return {
|
|
255
|
+
type: "scroll",
|
|
256
|
+
scroll: {
|
|
257
|
+
x: point.x,
|
|
258
|
+
y: point.y,
|
|
259
|
+
delta_x: Math.trunc(action.scroll_x ?? 0),
|
|
260
|
+
delta_y: Math.trunc(action.scroll_y ?? 0),
|
|
261
|
+
...holdKeys(action.hold_keys)
|
|
262
|
+
}
|
|
311
263
|
};
|
|
312
|
-
default: throw new Error(`unknown computer action type: ${type}`);
|
|
313
264
|
}
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
265
|
+
moveAction(action) {
|
|
266
|
+
const point = this.toViewportPoint(action.x, action.y);
|
|
267
|
+
return {
|
|
268
|
+
type: "move_mouse",
|
|
269
|
+
move_mouse: {
|
|
270
|
+
x: point.x,
|
|
271
|
+
y: point.y
|
|
272
|
+
}
|
|
273
|
+
};
|
|
320
274
|
}
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
275
|
+
dragAction(action) {
|
|
276
|
+
return {
|
|
277
|
+
type: "drag_mouse",
|
|
278
|
+
drag_mouse: {
|
|
279
|
+
path: action.path.map((point) => {
|
|
280
|
+
const transformed = this.toViewportPoint(point.x, point.y);
|
|
281
|
+
return [transformed.x, transformed.y];
|
|
282
|
+
}),
|
|
283
|
+
button: dragButton(action.button),
|
|
284
|
+
...holdKeys(action.hold_keys)
|
|
285
|
+
}
|
|
286
|
+
};
|
|
287
|
+
}
|
|
288
|
+
toViewportPoint(x, y) {
|
|
289
|
+
if (this.coordinateSystem.type === "pixel") return {
|
|
290
|
+
x: Math.trunc(x),
|
|
291
|
+
y: Math.trunc(y)
|
|
292
|
+
};
|
|
293
|
+
const [min, max] = this.coordinateSystem.range;
|
|
294
|
+
const scale = max - min;
|
|
295
|
+
if (scale <= 0) return {
|
|
296
|
+
x: Math.trunc(x),
|
|
297
|
+
y: Math.trunc(y)
|
|
298
|
+
};
|
|
299
|
+
return {
|
|
300
|
+
x: clamp(Math.round((x - min) / scale * this.viewport.width), 0, this.viewport.width - 1),
|
|
301
|
+
y: clamp(Math.round((y - min) / scale * this.viewport.height), 0, this.viewport.height - 1)
|
|
302
|
+
};
|
|
303
|
+
}
|
|
304
|
+
async runKernelBatch(actions) {
|
|
305
|
+
await this.client.browsers.computer.batch(this.sessionId, { actions });
|
|
306
|
+
}
|
|
307
|
+
};
|
|
308
|
+
const PLAYWRIGHT_MAX_TIMEOUT_SEC = 300;
|
|
309
|
+
const CLICK_BUTTONS = new Set([
|
|
310
|
+
"left",
|
|
311
|
+
"right",
|
|
312
|
+
"middle",
|
|
313
|
+
"back",
|
|
314
|
+
"forward"
|
|
315
|
+
]);
|
|
316
|
+
const DRAG_BUTTONS = new Set([
|
|
317
|
+
"left",
|
|
318
|
+
"right",
|
|
319
|
+
"middle"
|
|
320
|
+
]);
|
|
321
|
+
function mouseButton(value) {
|
|
322
|
+
return value !== void 0 && CLICK_BUTTONS.has(value) ? value : "left";
|
|
325
323
|
}
|
|
326
|
-
function
|
|
327
|
-
|
|
328
|
-
if (candidate === "left" || candidate === "right" || candidate === "middle" || candidate === "back" || candidate === "forward") return candidate;
|
|
329
|
-
return fallback;
|
|
324
|
+
function dragButton(value) {
|
|
325
|
+
return value !== void 0 && DRAG_BUTTONS.has(value) ? value : "left";
|
|
330
326
|
}
|
|
331
|
-
function
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
327
|
+
function typeText(action) {
|
|
328
|
+
return {
|
|
329
|
+
type: "type_text",
|
|
330
|
+
type_text: { text: action.text }
|
|
331
|
+
};
|
|
335
332
|
}
|
|
336
|
-
function
|
|
337
|
-
return
|
|
333
|
+
function waitAction(action) {
|
|
334
|
+
return {
|
|
335
|
+
type: "sleep",
|
|
336
|
+
sleep: { duration_ms: Math.trunc(action.ms ?? 1e3) }
|
|
337
|
+
};
|
|
338
338
|
}
|
|
339
|
-
function
|
|
340
|
-
|
|
339
|
+
function holdKeys(keys) {
|
|
340
|
+
if (!keys || keys.length === 0) return {};
|
|
341
|
+
return { hold_keys: keys.map(normalizeKernelKey) };
|
|
341
342
|
}
|
|
342
|
-
function keypress(keys, duration
|
|
343
|
+
function keypress(keys, duration) {
|
|
343
344
|
const translated = keys.flatMap(normalizeKernelKeyCombo);
|
|
344
345
|
const pressedKeys = translated.filter((key) => !isKernelModifierKey(key));
|
|
345
|
-
const
|
|
346
|
+
const heldKeys = pressedKeys.length > 0 ? translated.filter(isKernelModifierKey) : translated.slice(0, -1);
|
|
346
347
|
return {
|
|
347
348
|
type: "press_key",
|
|
348
349
|
press_key: {
|
|
349
350
|
keys: pressedKeys.length > 0 ? pressedKeys : translated.slice(-1),
|
|
350
|
-
...
|
|
351
|
+
...heldKeys.length > 0 ? { hold_keys: heldKeys } : {},
|
|
351
352
|
...typeof duration === "number" && Number.isFinite(duration) && duration > 0 ? { duration: Math.trunc(duration) } : {}
|
|
352
353
|
}
|
|
353
354
|
};
|
|
354
355
|
}
|
|
355
|
-
function toPath(value, coordinateSystem = { type: "pixel" }, viewport = {
|
|
356
|
-
width: 1920,
|
|
357
|
-
height: 1080
|
|
358
|
-
}) {
|
|
359
|
-
if (!Array.isArray(value)) return [];
|
|
360
|
-
return value.map((point) => toPathPoint(point, coordinateSystem, viewport));
|
|
361
|
-
}
|
|
362
|
-
function toPathPoint(value, coordinateSystem, viewport) {
|
|
363
|
-
if (Array.isArray(value)) {
|
|
364
|
-
const point = transformPoint(toInt(value[0]), toInt(value[1]), coordinateSystem, viewport);
|
|
365
|
-
return [point.x, point.y];
|
|
366
|
-
}
|
|
367
|
-
if (value && typeof value === "object") {
|
|
368
|
-
const point = value;
|
|
369
|
-
const transformed = transformPoint(toInt(point.x), toInt(point.y), coordinateSystem, viewport);
|
|
370
|
-
return [transformed.x, transformed.y];
|
|
371
|
-
}
|
|
372
|
-
return [0, 0];
|
|
373
|
-
}
|
|
374
|
-
function toViewportPoint(action, coordinateSystem, viewport) {
|
|
375
|
-
return transformPoint(toInt(action.x), toInt(action.y), coordinateSystem, viewport);
|
|
376
|
-
}
|
|
377
|
-
function transformPoint(x, y, coordinateSystem, viewport) {
|
|
378
|
-
if (coordinateSystem.type === "pixel") return {
|
|
379
|
-
x,
|
|
380
|
-
y
|
|
381
|
-
};
|
|
382
|
-
const [min, max] = coordinateSystem.range;
|
|
383
|
-
const scale = max - min;
|
|
384
|
-
if (scale <= 0) return {
|
|
385
|
-
x,
|
|
386
|
-
y
|
|
387
|
-
};
|
|
388
|
-
return {
|
|
389
|
-
x: clamp(Math.round((x - min) / scale * viewport.width), 0, viewport.width - 1),
|
|
390
|
-
y: clamp(Math.round((y - min) / scale * viewport.height), 0, viewport.height - 1)
|
|
391
|
-
};
|
|
392
|
-
}
|
|
393
356
|
function clamp(value, min, max) {
|
|
394
357
|
return Math.max(min, Math.min(max, value));
|
|
395
358
|
}
|
|
359
|
+
function unreachable(action) {
|
|
360
|
+
throw new Error(`unknown computer action type: ${JSON.stringify(action)}`);
|
|
361
|
+
}
|
|
396
362
|
//#endregion
|
|
397
363
|
//#region src/tools.ts
|
|
398
364
|
function createCuaComputerTools(args) {
|
|
399
|
-
|
|
400
|
-
|
|
365
|
+
return buildCuaComputerTools(args, new InternalComputerTranslator(args));
|
|
366
|
+
}
|
|
367
|
+
/** Build executor tools against an existing translator (internal; not part of the package surface). */
|
|
368
|
+
function buildCuaComputerTools(args, translator) {
|
|
369
|
+
return withExtraTools(args).map((executor) => createExecutorTool(executor, translator));
|
|
401
370
|
}
|
|
402
|
-
function
|
|
371
|
+
function withExtraTools(args) {
|
|
403
372
|
const executors = [...args.toolExecutors];
|
|
404
373
|
const existing = new Set(executors.map((executor) => executor.definition.name));
|
|
405
|
-
if (args.computerUseExtra && !existing.has(CUA_NAVIGATION_TOOL_NAME)) {
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
374
|
+
if (args.computerUseExtra && !existing.has(CUA_NAVIGATION_TOOL_NAME)) executors.push({
|
|
375
|
+
kind: "navigation",
|
|
376
|
+
definition: createCuaNavigationToolDefinition()
|
|
377
|
+
});
|
|
378
|
+
if (args.playwright && !existing.has(CUA_PLAYWRIGHT_TOOL_NAME)) executors.push({
|
|
379
|
+
kind: "playwright",
|
|
380
|
+
definition: createCuaPlaywrightToolDefinition()
|
|
381
|
+
});
|
|
412
382
|
return executors;
|
|
413
383
|
}
|
|
414
384
|
function createExecutorTool(executor, translator) {
|
|
@@ -422,6 +392,16 @@ function createExecutorTool(executor, translator) {
|
|
|
422
392
|
return executeNavigationTool(translator, asNavigationInput(params));
|
|
423
393
|
}
|
|
424
394
|
};
|
|
395
|
+
if (isPlaywrightExecutor(executor)) return {
|
|
396
|
+
name: definition.name,
|
|
397
|
+
label: definition.name,
|
|
398
|
+
description: definition.description,
|
|
399
|
+
parameters: definition.parameters,
|
|
400
|
+
executionMode: "sequential",
|
|
401
|
+
async execute(_toolCallId, params) {
|
|
402
|
+
return executePlaywrightTool(translator, asPlaywrightInput(params));
|
|
403
|
+
}
|
|
404
|
+
};
|
|
425
405
|
return {
|
|
426
406
|
name: definition.name,
|
|
427
407
|
label: definition.name,
|
|
@@ -436,6 +416,9 @@ function createExecutorTool(executor, translator) {
|
|
|
436
416
|
function isNavigationExecutor(executor) {
|
|
437
417
|
return "kind" in executor && executor.kind === "navigation";
|
|
438
418
|
}
|
|
419
|
+
function isPlaywrightExecutor(executor) {
|
|
420
|
+
return "kind" in executor && executor.kind === "playwright";
|
|
421
|
+
}
|
|
439
422
|
async function executeBatchTool(translator, params) {
|
|
440
423
|
const content = [];
|
|
441
424
|
const readResults = [];
|
|
@@ -502,10 +485,11 @@ async function executeNavigationTool(translator, params) {
|
|
|
502
485
|
if (action === "url") {
|
|
503
486
|
url = await translator.currentUrl();
|
|
504
487
|
statusText = `Current URL: ${url}`;
|
|
505
|
-
} else await translator.executeBatch([{
|
|
506
|
-
type:
|
|
507
|
-
url: params.url
|
|
488
|
+
} else if (action === "goto") await translator.executeBatch([{
|
|
489
|
+
type: "goto",
|
|
490
|
+
url: params.url ?? ""
|
|
508
491
|
}]);
|
|
492
|
+
else await translator.executeBatch([{ type: action }]);
|
|
509
493
|
const screenshot = await translator.screenshot();
|
|
510
494
|
return {
|
|
511
495
|
content: [{
|
|
@@ -526,6 +510,50 @@ async function executeNavigationTool(translator, params) {
|
|
|
526
510
|
throw new Error(`${action} failed: ${errorMessage(err)}`, { cause: err });
|
|
527
511
|
}
|
|
528
512
|
}
|
|
513
|
+
async function executePlaywrightTool(translator, params) {
|
|
514
|
+
try {
|
|
515
|
+
const execution = await translator.executePlaywright(params.code, params.timeout_sec);
|
|
516
|
+
const content = [];
|
|
517
|
+
if (execution.result !== void 0) content.push({
|
|
518
|
+
type: "text",
|
|
519
|
+
text: `result: ${formatPlaywrightResult(execution.result)}`
|
|
520
|
+
});
|
|
521
|
+
if (execution.stdout?.trim()) content.push({
|
|
522
|
+
type: "text",
|
|
523
|
+
text: `stdout:\n${execution.stdout.trimEnd()}`
|
|
524
|
+
});
|
|
525
|
+
if (execution.stderr?.trim()) content.push({
|
|
526
|
+
type: "text",
|
|
527
|
+
text: `stderr:\n${execution.stderr.trimEnd()}`
|
|
528
|
+
});
|
|
529
|
+
if (!execution.success) content.push({
|
|
530
|
+
type: "text",
|
|
531
|
+
text: `error: ${execution.error ?? "playwright execution reported failure"}`
|
|
532
|
+
});
|
|
533
|
+
const statusText = execution.success ? "Playwright executed successfully." : `Playwright execution failed: ${execution.error ?? "unknown error"}`;
|
|
534
|
+
if (content.length === 0) content.push({
|
|
535
|
+
type: "text",
|
|
536
|
+
text: statusText
|
|
537
|
+
});
|
|
538
|
+
const details = {
|
|
539
|
+
success: execution.success,
|
|
540
|
+
statusText
|
|
541
|
+
};
|
|
542
|
+
if (execution.result !== void 0) details.result = execution.result;
|
|
543
|
+
if (execution.stdout) details.stdout = execution.stdout;
|
|
544
|
+
if (execution.stderr) details.stderr = execution.stderr;
|
|
545
|
+
if (execution.error) details.error = execution.error;
|
|
546
|
+
return {
|
|
547
|
+
content,
|
|
548
|
+
details
|
|
549
|
+
};
|
|
550
|
+
} catch (err) {
|
|
551
|
+
throw new Error(`playwright_execute failed: ${errorMessage(err)}`, { cause: err });
|
|
552
|
+
}
|
|
553
|
+
}
|
|
554
|
+
function formatPlaywrightResult(result) {
|
|
555
|
+
return typeof result === "string" ? result : JSON.stringify(result);
|
|
556
|
+
}
|
|
529
557
|
function errorMessage(err) {
|
|
530
558
|
return err instanceof Error ? err.message : String(err);
|
|
531
559
|
}
|
|
@@ -533,6 +561,10 @@ function asNavigationInput(value) {
|
|
|
533
561
|
if (value && typeof value === "object" && typeof value.action === "string") return value;
|
|
534
562
|
throw new Error("invalid computer_use_extra parameters");
|
|
535
563
|
}
|
|
564
|
+
function asPlaywrightInput(value) {
|
|
565
|
+
if (value && typeof value === "object" && typeof value.code === "string") return value;
|
|
566
|
+
throw new Error("invalid playwright_execute parameters");
|
|
567
|
+
}
|
|
536
568
|
//#endregion
|
|
537
569
|
//#region src/agent.ts
|
|
538
570
|
/**
|
|
@@ -561,14 +593,11 @@ var CuaRuntimeController = class {
|
|
|
561
593
|
this.translator = this.createTranslator();
|
|
562
594
|
}
|
|
563
595
|
tools() {
|
|
564
|
-
return [...
|
|
565
|
-
browser: this.options.browser,
|
|
566
|
-
client: this.options.client,
|
|
596
|
+
return [...buildCuaComputerTools({
|
|
567
597
|
toolExecutors: this.runtimeSpec.toolExecutors,
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
}), ...this.options.extraTools ?? []];
|
|
598
|
+
computerUseExtra: this.options.computerUseExtra,
|
|
599
|
+
playwright: this.options.playwright
|
|
600
|
+
}, this.translator), ...this.options.extraTools ?? []];
|
|
572
601
|
}
|
|
573
602
|
onPayload() {
|
|
574
603
|
const runtimeSpec = this.runtimeSpec;
|
|
@@ -578,7 +607,11 @@ var CuaRuntimeController = class {
|
|
|
578
607
|
}) : void 0, this.options.onPayload);
|
|
579
608
|
}
|
|
580
609
|
keepToolNames() {
|
|
581
|
-
return [
|
|
610
|
+
return [
|
|
611
|
+
...(this.options.extraTools ?? []).map((tool) => tool.name),
|
|
612
|
+
...this.options.computerUseExtra ? [CUA_NAVIGATION_TOOL_NAME] : [],
|
|
613
|
+
...this.options.playwright ? [CUA_PLAYWRIGHT_TOOL_NAME] : []
|
|
614
|
+
];
|
|
582
615
|
}
|
|
583
616
|
createTranslator() {
|
|
584
617
|
return new InternalComputerTranslator({
|
|
@@ -605,15 +638,18 @@ async function getCuaEnvApiKeyAndHeaders(model) {
|
|
|
605
638
|
var CuaAgent = class extends Agent {
|
|
606
639
|
runtime;
|
|
607
640
|
ownsSystemPrompt;
|
|
641
|
+
runtimeDirty = false;
|
|
608
642
|
stateProxy;
|
|
643
|
+
stateProxyTarget;
|
|
609
644
|
constructor(options) {
|
|
610
|
-
const { browser, client, initialState, onPayload, streamFn, prepareNextTurn, extraTools, computerUseExtra, ...agentOptions } = options;
|
|
645
|
+
const { browser, client, initialState, onPayload, streamFn, prepareNextTurn, extraTools, computerUseExtra, playwright, ...agentOptions } = options;
|
|
611
646
|
const runtime = new CuaRuntimeController({
|
|
612
647
|
browser,
|
|
613
648
|
client,
|
|
614
649
|
model: initialState.model,
|
|
615
650
|
extraTools,
|
|
616
651
|
computerUseExtra,
|
|
652
|
+
playwright,
|
|
617
653
|
onPayload
|
|
618
654
|
});
|
|
619
655
|
const wrappedStreamFn = (model, context, streamOptions) => {
|
|
@@ -638,13 +674,17 @@ var CuaAgent = class extends Agent {
|
|
|
638
674
|
this.runtime = runtime;
|
|
639
675
|
this.ownsSystemPrompt = initialState.systemPrompt === void 0;
|
|
640
676
|
/**
|
|
641
|
-
* pi
|
|
642
|
-
*
|
|
643
|
-
*
|
|
677
|
+
* pi's loop only re-reads model/tools/prompt between provider requests
|
|
678
|
+
* through `prepareNextTurn`. The wrapper stays pass-through (returning
|
|
679
|
+
* `undefined`, i.e. stock pi behavior) until either the user hook returns
|
|
680
|
+
* an update or a mid-run model assignment marks the CUA runtime dirty —
|
|
681
|
+
* only then is a turn update built from current state.
|
|
644
682
|
*/
|
|
645
683
|
this.prepareNextTurn = async (signal) => {
|
|
646
684
|
const update = await prepareNextTurn?.(signal);
|
|
647
685
|
if (update?.model) this.applyRuntime(update.model);
|
|
686
|
+
if (!update && !this.runtimeDirty) return void 0;
|
|
687
|
+
this.runtimeDirty = false;
|
|
648
688
|
const state = super.state;
|
|
649
689
|
const context = update?.context ?? {
|
|
650
690
|
systemPrompt: state.systemPrompt,
|
|
@@ -668,17 +708,22 @@ var CuaAgent = class extends Agent {
|
|
|
668
708
|
* and payload hooks for the selected provider.
|
|
669
709
|
*/
|
|
670
710
|
get state() {
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
|
|
677
|
-
|
|
711
|
+
const target = super.state;
|
|
712
|
+
if (!this.stateProxy || this.stateProxyTarget !== target) {
|
|
713
|
+
this.stateProxyTarget = target;
|
|
714
|
+
this.stateProxy = new Proxy(target, { set: (proxied, prop, value, receiver) => {
|
|
715
|
+
if (prop === "model") {
|
|
716
|
+
this.applyRuntime(value);
|
|
717
|
+
return true;
|
|
718
|
+
}
|
|
719
|
+
return Reflect.set(proxied, prop, value, receiver);
|
|
720
|
+
} });
|
|
721
|
+
}
|
|
678
722
|
return this.stateProxy;
|
|
679
723
|
}
|
|
680
724
|
applyRuntime(model) {
|
|
681
725
|
this.runtime.setModel(model);
|
|
726
|
+
this.runtimeDirty = true;
|
|
682
727
|
const state = super.state;
|
|
683
728
|
state.model = this.runtime.model;
|
|
684
729
|
state.tools = this.runtime.tools();
|
|
@@ -697,13 +742,14 @@ var CuaAgentHarness = class extends AgentHarness {
|
|
|
697
742
|
runtime;
|
|
698
743
|
requestedActiveToolNames;
|
|
699
744
|
constructor(options) {
|
|
700
|
-
const { browser, client, model, extraTools, computerUseExtra, systemPrompt, getApiKeyAndHeaders, onPayload, activeToolNames, ...harnessOptions } = options;
|
|
745
|
+
const { browser, client, model, extraTools, computerUseExtra, playwright, systemPrompt, getApiKeyAndHeaders, onPayload, activeToolNames, ...harnessOptions } = options;
|
|
701
746
|
const runtime = new CuaRuntimeController({
|
|
702
747
|
browser,
|
|
703
748
|
client,
|
|
704
749
|
model,
|
|
705
750
|
extraTools,
|
|
706
751
|
computerUseExtra,
|
|
752
|
+
playwright,
|
|
707
753
|
onPayload
|
|
708
754
|
});
|
|
709
755
|
const resolvedTools = runtime.tools();
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@onkernel/cua-agent",
|
|
3
|
-
"version": "0.3.2",
|
|
3
|
+
"version": "0.3.4",
|
|
4
4
|
"description": "Kernel browser computer-use Agent and AgentHarness classes built on pi-agent-core",
|
|
5
5
|
"license": "MIT",
|
|
6
6
|
"type": "module",
|
|
@@ -42,7 +42,7 @@
|
|
|
42
42
|
"dependencies": {
|
|
43
43
|
"@earendil-works/pi-agent-core": "0.79.1",
|
|
44
44
|
"@earendil-works/pi-ai": "0.79.1",
|
|
45
|
-
"@onkernel/cua-ai": "0.
|
|
45
|
+
"@onkernel/cua-ai": "0.3.1",
|
|
46
46
|
"@onkernel/sdk": "0.49.0",
|
|
47
47
|
"sharp": "^0.34.5"
|
|
48
48
|
},
|