@onkernel/cua-agent 0.3.2 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +16 -0
- package/README.md +30 -18
- package/dist/index.d.ts +2 -0
- package/dist/index.js +170 -202
- package/package.json +2 -2
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,21 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## 0.3.3 - 2026-06-12
|
|
4
|
+
|
|
5
|
+
- The action translator now consumes the canonical `CuaAction` union with an
|
|
6
|
+
exhaustive switch. Malformed action shapes fail loudly instead of silently
|
|
7
|
+
being coerced (previously, e.g., into a click at 0,0); the documented mouse-button
|
|
8
|
+
coercion to `"left"` is unchanged.
|
|
9
|
+
- `prepareNextTurn` no longer rebuilds the turn context on every turn: it
|
|
10
|
+
keeps stock pi behavior until a user hook returns an update or a mid-run
|
|
11
|
+
model assignment requires a refresh.
|
|
12
|
+
- One translator instance per runtime is shared between the executor tools
|
|
13
|
+
and the provider screenshot capability.
|
|
14
|
+
- The `CuaAgentHarness` README quickstart showcases session-backed turns and
|
|
15
|
+
mid-session model switching; `computerUseExtra` is documented with its
|
|
16
|
+
rationale.
|
|
17
|
+
- Update the `@onkernel/cua-ai` dependency to 0.3.0.
|
|
18
|
+
|
|
3
19
|
## 0.3.2 - 2026-06-11
|
|
4
20
|
|
|
5
21
|
- Update the `@onkernel/cua-ai` dependency to 0.2.2.
|
package/README.md
CHANGED
|
@@ -37,11 +37,17 @@ await agent.prompt("Open news.ycombinator.com and summarize the top story.");
|
|
|
37
37
|
|
|
38
38
|
## Quick Start (`CuaAgentHarness`)
|
|
39
39
|
|
|
40
|
+
`prompt()` returns the turn's final assistant message, and every turn is
|
|
41
|
+
persisted to the session — later prompts see the full transcript. Runtime
|
|
42
|
+
config like the model can change between turns (or even mid-turn, applying at
|
|
43
|
+
the next provider request):
|
|
44
|
+
|
|
40
45
|
```ts
|
|
41
46
|
import { CuaAgentHarness, InMemorySessionRepo, NodeExecutionEnv } from "@onkernel/cua-agent";
|
|
47
|
+
import type { AssistantMessage } from "@onkernel/cua-ai";
|
|
42
48
|
|
|
43
49
|
const sessionRepo = new InMemorySessionRepo();
|
|
44
|
-
const session = await sessionRepo.create({ id: "
|
|
50
|
+
const session = await sessionRepo.create({ id: "research" });
|
|
45
51
|
|
|
46
52
|
const harness = new CuaAgentHarness({
|
|
47
53
|
browser,
|
|
@@ -51,22 +57,26 @@ const harness = new CuaAgentHarness({
|
|
|
51
57
|
session,
|
|
52
58
|
});
|
|
53
59
|
|
|
54
|
-
const
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
console.log(
|
|
67
|
-
console.log("assistant text:", assistantText || "(no text)");
|
|
60
|
+
const textOf = (message: AssistantMessage) =>
|
|
61
|
+
message.content.flatMap((block) => (block.type === "text" ? [block.text] : [])).join("").trim();
|
|
62
|
+
|
|
63
|
+
// Turn 1: a session-backed prompt.
|
|
64
|
+
const first = await harness.prompt("Open example.com and describe what you see.");
|
|
65
|
+
console.log(textOf(first));
|
|
66
|
+
|
|
67
|
+
// Swap providers mid-session; CUA tools and the default prompt refresh to match.
|
|
68
|
+
await harness.setModel("anthropic:claude-opus-4-7");
|
|
69
|
+
|
|
70
|
+
// Turn 2 continues the same transcript on the new model.
|
|
71
|
+
const second = await harness.prompt("Open the most relevant link from what you found.");
|
|
72
|
+
console.log(textOf(second));
|
|
68
73
|
```
|
|
69
74
|
|
|
75
|
+
While a turn is running, `steer()` injects course corrections, `followUp()`
|
|
76
|
+
queues the next instruction, and `subscribe()` streams the underlying agent
|
|
77
|
+
events. `compact()` and session branching are available for long-running
|
|
78
|
+
transcripts — see the pi-agent-core docs for the full harness lifecycle.
|
|
79
|
+
|
|
70
80
|
Use `CuaAgent` when you want direct pi `Agent` control: raw message state,
|
|
71
81
|
lifecycle events, custom streaming, and explicit prompt/continue/queue control.
|
|
72
82
|
Reach for the harness shape when you want an app layer around the loop:
|
|
@@ -108,9 +118,11 @@ computer-use tools. This is useful when the model needs to call
|
|
|
108
118
|
application-specific code, such as looking up a record, writing a database row,
|
|
109
119
|
or handing off to another service while it also controls the browser.
|
|
110
120
|
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
`
|
|
121
|
+
Not every provider's native computer-use vocabulary includes browser
|
|
122
|
+
navigation — some models can click and type but have no direct way to open a
|
|
123
|
+
URL or go back. `computerUseExtra: true` adds `computer_use_extra`, a
|
|
124
|
+
provider-neutral escape hatch exposing `goto`, `back`, `forward`, and `url`
|
|
125
|
+
so navigation works uniformly regardless of which model is driving.
|
|
114
126
|
|
|
115
127
|
### Model Switching
|
|
116
128
|
|
package/dist/index.d.ts
CHANGED
|
@@ -102,7 +102,9 @@ type CuaAgentHarnessOptions<TSkill extends Skill = Skill, TPromptTemplate extend
|
|
|
102
102
|
declare class CuaAgent extends Agent {
|
|
103
103
|
private readonly runtime;
|
|
104
104
|
private readonly ownsSystemPrompt;
|
|
105
|
+
private runtimeDirty;
|
|
105
106
|
private stateProxy?;
|
|
107
|
+
private stateProxyTarget?;
|
|
106
108
|
constructor(options: CuaAgentOptions);
|
|
107
109
|
/**
|
|
108
110
|
* Return a state proxy so `agent.state.model = "provider:model"` can behave
|
package/dist/index.js
CHANGED
|
@@ -156,8 +156,8 @@ var InternalComputerTranslator = class {
|
|
|
156
156
|
async currentMousePosition() {
|
|
157
157
|
const pos = await this.client.browsers.computer.getMousePosition(this.sessionId);
|
|
158
158
|
return {
|
|
159
|
-
x:
|
|
160
|
-
y:
|
|
159
|
+
x: Math.trunc(pos.x),
|
|
160
|
+
y: Math.trunc(pos.y)
|
|
161
161
|
};
|
|
162
162
|
}
|
|
163
163
|
async executeBatch(actions) {
|
|
@@ -167,236 +167,196 @@ var InternalComputerTranslator = class {
|
|
|
167
167
|
if (pending.length === 0) return;
|
|
168
168
|
await this.runKernelBatch(pending.splice(0));
|
|
169
169
|
};
|
|
170
|
-
for (
|
|
171
|
-
|
|
172
|
-
const type = typeof action.type === "string" ? action.type : "";
|
|
173
|
-
if (type === "screenshot") {
|
|
170
|
+
for (const action of actions) switch (action.type) {
|
|
171
|
+
case "screenshot":
|
|
174
172
|
await flush();
|
|
175
173
|
result.readResults.push({
|
|
176
174
|
type: "screenshot",
|
|
177
175
|
...await this.screenshot()
|
|
178
176
|
});
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
if (type === "url") {
|
|
177
|
+
break;
|
|
178
|
+
case "url":
|
|
182
179
|
await flush();
|
|
183
180
|
result.readResults.push({
|
|
184
181
|
type: "url",
|
|
185
182
|
url: await this.currentUrl()
|
|
186
183
|
});
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
if (type === "cursor_position") {
|
|
184
|
+
break;
|
|
185
|
+
case "cursor_position":
|
|
190
186
|
await flush();
|
|
191
|
-
const pos = await this.currentMousePosition();
|
|
192
187
|
result.readResults.push({
|
|
193
188
|
type: "cursor_position",
|
|
194
|
-
...
|
|
189
|
+
...await this.currentMousePosition()
|
|
195
190
|
});
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
if (type === "goto") {
|
|
199
|
-
const url = normalizeGotoUrl(action.url) ?? "";
|
|
191
|
+
break;
|
|
192
|
+
case "goto":
|
|
200
193
|
pending.push(keypress(["Control", "l"]), {
|
|
201
194
|
type: "type_text",
|
|
202
|
-
type_text: { text: url }
|
|
195
|
+
type_text: { text: normalizeGotoUrl(action.url) ?? "" }
|
|
203
196
|
}, keypress(["Enter"]));
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
if (type === "back") {
|
|
197
|
+
break;
|
|
198
|
+
case "back":
|
|
207
199
|
pending.push(keypress(["Alt", "Left"]));
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
if (type === "forward") {
|
|
200
|
+
break;
|
|
201
|
+
case "forward":
|
|
211
202
|
pending.push(keypress(["Alt", "Right"]));
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
203
|
+
break;
|
|
204
|
+
default:
|
|
205
|
+
pending.push(this.toSdkAction(action));
|
|
206
|
+
break;
|
|
215
207
|
}
|
|
216
208
|
await flush();
|
|
217
209
|
return result;
|
|
218
210
|
}
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
};
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
case "double_click": {
|
|
239
|
-
const doubleClickHoldKeys = readHoldKeys(action.hold_keys);
|
|
240
|
-
const point = toViewportPoint(action, coordinateSystem, viewport);
|
|
241
|
-
return {
|
|
242
|
-
type: "click_mouse",
|
|
243
|
-
click_mouse: {
|
|
244
|
-
x: point.x,
|
|
245
|
-
y: point.y,
|
|
246
|
-
num_clicks: 2,
|
|
247
|
-
...doubleClickHoldKeys.length > 0 ? { hold_keys: doubleClickHoldKeys } : {}
|
|
248
|
-
}
|
|
249
|
-
};
|
|
250
|
-
}
|
|
251
|
-
case "mouse_down":
|
|
252
|
-
case "mouse_up": {
|
|
253
|
-
const mouseHoldKeys = readHoldKeys(action.hold_keys);
|
|
254
|
-
const point = toViewportPoint(action, coordinateSystem, viewport);
|
|
255
|
-
return {
|
|
256
|
-
type: "click_mouse",
|
|
257
|
-
click_mouse: {
|
|
258
|
-
x: point.x,
|
|
259
|
-
y: point.y,
|
|
260
|
-
button: clickMouseButtonOr(action.button, "left"),
|
|
261
|
-
click_type: type === "mouse_down" ? "down" : "up",
|
|
262
|
-
...mouseHoldKeys.length > 0 ? { hold_keys: mouseHoldKeys } : {}
|
|
263
|
-
}
|
|
264
|
-
};
|
|
211
|
+
toSdkAction(action) {
|
|
212
|
+
switch (action.type) {
|
|
213
|
+
case "click": return this.clickAction(action, { button: mouseButton(action.button) });
|
|
214
|
+
case "double_click": return this.clickAction(action, { num_clicks: 2 });
|
|
215
|
+
case "mouse_down": return this.clickAction(action, {
|
|
216
|
+
button: mouseButton(action.button),
|
|
217
|
+
click_type: "down"
|
|
218
|
+
});
|
|
219
|
+
case "mouse_up": return this.clickAction(action, {
|
|
220
|
+
button: mouseButton(action.button),
|
|
221
|
+
click_type: "up"
|
|
222
|
+
});
|
|
223
|
+
case "type": return typeText(action);
|
|
224
|
+
case "keypress": return keypress(action.keys, action.duration);
|
|
225
|
+
case "scroll": return this.scrollAction(action);
|
|
226
|
+
case "move": return this.moveAction(action);
|
|
227
|
+
case "drag": return this.dragAction(action);
|
|
228
|
+
case "wait": return waitAction(action);
|
|
229
|
+
default: return unreachable(action);
|
|
265
230
|
}
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
231
|
+
}
|
|
232
|
+
clickAction(action, extra) {
|
|
233
|
+
const point = this.toViewportPoint(action.x, action.y);
|
|
234
|
+
return {
|
|
235
|
+
type: "click_mouse",
|
|
236
|
+
click_mouse: {
|
|
237
|
+
x: point.x,
|
|
238
|
+
y: point.y,
|
|
239
|
+
...extra,
|
|
240
|
+
...holdKeys(action.hold_keys)
|
|
241
|
+
}
|
|
269
242
|
};
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
}
|
|
283
|
-
};
|
|
284
|
-
}
|
|
285
|
-
case "move": {
|
|
286
|
-
const moveHoldKeys = readHoldKeys(action.hold_keys);
|
|
287
|
-
const point = toViewportPoint(action, coordinateSystem, viewport);
|
|
288
|
-
return {
|
|
289
|
-
type: "move_mouse",
|
|
290
|
-
move_mouse: {
|
|
291
|
-
x: point.x,
|
|
292
|
-
y: point.y,
|
|
293
|
-
...moveHoldKeys.length > 0 ? { hold_keys: moveHoldKeys } : {}
|
|
294
|
-
}
|
|
295
|
-
};
|
|
296
|
-
}
|
|
297
|
-
case "drag": {
|
|
298
|
-
const dragHoldKeys = readHoldKeys(action.hold_keys);
|
|
299
|
-
return {
|
|
300
|
-
type: "drag_mouse",
|
|
301
|
-
drag_mouse: {
|
|
302
|
-
path: toPath(action.path, coordinateSystem, viewport),
|
|
303
|
-
button: dragMouseButtonOr(action.button, "left"),
|
|
304
|
-
...dragHoldKeys.length > 0 ? { hold_keys: dragHoldKeys } : {}
|
|
305
|
-
}
|
|
306
|
-
};
|
|
307
|
-
}
|
|
308
|
-
case "wait": return {
|
|
309
|
-
type: "sleep",
|
|
310
|
-
sleep: { duration_ms: typeof action.ms === "number" ? Math.trunc(action.ms) : 1e3 }
|
|
243
|
+
}
|
|
244
|
+
scrollAction(action) {
|
|
245
|
+
const point = this.toViewportPoint(action.x ?? 0, action.y ?? 0);
|
|
246
|
+
return {
|
|
247
|
+
type: "scroll",
|
|
248
|
+
scroll: {
|
|
249
|
+
x: point.x,
|
|
250
|
+
y: point.y,
|
|
251
|
+
delta_x: Math.trunc(action.scroll_x ?? 0),
|
|
252
|
+
delta_y: Math.trunc(action.scroll_y ?? 0),
|
|
253
|
+
...holdKeys(action.hold_keys)
|
|
254
|
+
}
|
|
311
255
|
};
|
|
312
|
-
default: throw new Error(`unknown computer action type: ${type}`);
|
|
313
256
|
}
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
257
|
+
moveAction(action) {
|
|
258
|
+
const point = this.toViewportPoint(action.x, action.y);
|
|
259
|
+
return {
|
|
260
|
+
type: "move_mouse",
|
|
261
|
+
move_mouse: {
|
|
262
|
+
x: point.x,
|
|
263
|
+
y: point.y
|
|
264
|
+
}
|
|
265
|
+
};
|
|
320
266
|
}
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
267
|
+
dragAction(action) {
|
|
268
|
+
return {
|
|
269
|
+
type: "drag_mouse",
|
|
270
|
+
drag_mouse: {
|
|
271
|
+
path: action.path.map((point) => {
|
|
272
|
+
const transformed = this.toViewportPoint(point.x, point.y);
|
|
273
|
+
return [transformed.x, transformed.y];
|
|
274
|
+
}),
|
|
275
|
+
button: dragButton(action.button),
|
|
276
|
+
...holdKeys(action.hold_keys)
|
|
277
|
+
}
|
|
278
|
+
};
|
|
279
|
+
}
|
|
280
|
+
toViewportPoint(x, y) {
|
|
281
|
+
if (this.coordinateSystem.type === "pixel") return {
|
|
282
|
+
x: Math.trunc(x),
|
|
283
|
+
y: Math.trunc(y)
|
|
284
|
+
};
|
|
285
|
+
const [min, max] = this.coordinateSystem.range;
|
|
286
|
+
const scale = max - min;
|
|
287
|
+
if (scale <= 0) return {
|
|
288
|
+
x: Math.trunc(x),
|
|
289
|
+
y: Math.trunc(y)
|
|
290
|
+
};
|
|
291
|
+
return {
|
|
292
|
+
x: clamp(Math.round((x - min) / scale * this.viewport.width), 0, this.viewport.width - 1),
|
|
293
|
+
y: clamp(Math.round((y - min) / scale * this.viewport.height), 0, this.viewport.height - 1)
|
|
294
|
+
};
|
|
295
|
+
}
|
|
296
|
+
async runKernelBatch(actions) {
|
|
297
|
+
await this.client.browsers.computer.batch(this.sessionId, { actions });
|
|
298
|
+
}
|
|
299
|
+
};
|
|
300
|
+
const CLICK_BUTTONS = new Set([
|
|
301
|
+
"left",
|
|
302
|
+
"right",
|
|
303
|
+
"middle",
|
|
304
|
+
"back",
|
|
305
|
+
"forward"
|
|
306
|
+
]);
|
|
307
|
+
const DRAG_BUTTONS = new Set([
|
|
308
|
+
"left",
|
|
309
|
+
"right",
|
|
310
|
+
"middle"
|
|
311
|
+
]);
|
|
312
|
+
function mouseButton(value) {
|
|
313
|
+
return value !== void 0 && CLICK_BUTTONS.has(value) ? value : "left";
|
|
325
314
|
}
|
|
326
|
-
function
|
|
327
|
-
|
|
328
|
-
if (candidate === "left" || candidate === "right" || candidate === "middle" || candidate === "back" || candidate === "forward") return candidate;
|
|
329
|
-
return fallback;
|
|
315
|
+
function dragButton(value) {
|
|
316
|
+
return value !== void 0 && DRAG_BUTTONS.has(value) ? value : "left";
|
|
330
317
|
}
|
|
331
|
-
function
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
318
|
+
function typeText(action) {
|
|
319
|
+
return {
|
|
320
|
+
type: "type_text",
|
|
321
|
+
type_text: { text: action.text }
|
|
322
|
+
};
|
|
335
323
|
}
|
|
336
|
-
function
|
|
337
|
-
return
|
|
324
|
+
function waitAction(action) {
|
|
325
|
+
return {
|
|
326
|
+
type: "sleep",
|
|
327
|
+
sleep: { duration_ms: Math.trunc(action.ms ?? 1e3) }
|
|
328
|
+
};
|
|
338
329
|
}
|
|
339
|
-
function
|
|
340
|
-
|
|
330
|
+
function holdKeys(keys) {
|
|
331
|
+
if (!keys || keys.length === 0) return {};
|
|
332
|
+
return { hold_keys: keys.map(normalizeKernelKey) };
|
|
341
333
|
}
|
|
342
|
-
function keypress(keys, duration
|
|
334
|
+
function keypress(keys, duration) {
|
|
343
335
|
const translated = keys.flatMap(normalizeKernelKeyCombo);
|
|
344
336
|
const pressedKeys = translated.filter((key) => !isKernelModifierKey(key));
|
|
345
|
-
const
|
|
337
|
+
const heldKeys = pressedKeys.length > 0 ? translated.filter(isKernelModifierKey) : translated.slice(0, -1);
|
|
346
338
|
return {
|
|
347
339
|
type: "press_key",
|
|
348
340
|
press_key: {
|
|
349
341
|
keys: pressedKeys.length > 0 ? pressedKeys : translated.slice(-1),
|
|
350
|
-
...
|
|
342
|
+
...heldKeys.length > 0 ? { hold_keys: heldKeys } : {},
|
|
351
343
|
...typeof duration === "number" && Number.isFinite(duration) && duration > 0 ? { duration: Math.trunc(duration) } : {}
|
|
352
344
|
}
|
|
353
345
|
};
|
|
354
346
|
}
|
|
355
|
-
function toPath(value, coordinateSystem = { type: "pixel" }, viewport = {
|
|
356
|
-
width: 1920,
|
|
357
|
-
height: 1080
|
|
358
|
-
}) {
|
|
359
|
-
if (!Array.isArray(value)) return [];
|
|
360
|
-
return value.map((point) => toPathPoint(point, coordinateSystem, viewport));
|
|
361
|
-
}
|
|
362
|
-
function toPathPoint(value, coordinateSystem, viewport) {
|
|
363
|
-
if (Array.isArray(value)) {
|
|
364
|
-
const point = transformPoint(toInt(value[0]), toInt(value[1]), coordinateSystem, viewport);
|
|
365
|
-
return [point.x, point.y];
|
|
366
|
-
}
|
|
367
|
-
if (value && typeof value === "object") {
|
|
368
|
-
const point = value;
|
|
369
|
-
const transformed = transformPoint(toInt(point.x), toInt(point.y), coordinateSystem, viewport);
|
|
370
|
-
return [transformed.x, transformed.y];
|
|
371
|
-
}
|
|
372
|
-
return [0, 0];
|
|
373
|
-
}
|
|
374
|
-
function toViewportPoint(action, coordinateSystem, viewport) {
|
|
375
|
-
return transformPoint(toInt(action.x), toInt(action.y), coordinateSystem, viewport);
|
|
376
|
-
}
|
|
377
|
-
function transformPoint(x, y, coordinateSystem, viewport) {
|
|
378
|
-
if (coordinateSystem.type === "pixel") return {
|
|
379
|
-
x,
|
|
380
|
-
y
|
|
381
|
-
};
|
|
382
|
-
const [min, max] = coordinateSystem.range;
|
|
383
|
-
const scale = max - min;
|
|
384
|
-
if (scale <= 0) return {
|
|
385
|
-
x,
|
|
386
|
-
y
|
|
387
|
-
};
|
|
388
|
-
return {
|
|
389
|
-
x: clamp(Math.round((x - min) / scale * viewport.width), 0, viewport.width - 1),
|
|
390
|
-
y: clamp(Math.round((y - min) / scale * viewport.height), 0, viewport.height - 1)
|
|
391
|
-
};
|
|
392
|
-
}
|
|
393
347
|
function clamp(value, min, max) {
|
|
394
348
|
return Math.max(min, Math.min(max, value));
|
|
395
349
|
}
|
|
350
|
+
function unreachable(action) {
|
|
351
|
+
throw new Error(`unknown computer action type: ${JSON.stringify(action)}`);
|
|
352
|
+
}
|
|
396
353
|
//#endregion
|
|
397
354
|
//#region src/tools.ts
|
|
398
355
|
function createCuaComputerTools(args) {
|
|
399
|
-
|
|
356
|
+
return buildCuaComputerTools(args, new InternalComputerTranslator(args));
|
|
357
|
+
}
|
|
358
|
+
/** Build executor tools against an existing translator (internal; not part of the package surface). */
|
|
359
|
+
function buildCuaComputerTools(args, translator) {
|
|
400
360
|
return withNavigationTool(args).map((executor) => createExecutorTool(executor, translator));
|
|
401
361
|
}
|
|
402
362
|
function withNavigationTool(args) {
|
|
@@ -502,10 +462,11 @@ async function executeNavigationTool(translator, params) {
|
|
|
502
462
|
if (action === "url") {
|
|
503
463
|
url = await translator.currentUrl();
|
|
504
464
|
statusText = `Current URL: ${url}`;
|
|
505
|
-
} else await translator.executeBatch([{
|
|
506
|
-
type:
|
|
507
|
-
url: params.url
|
|
465
|
+
} else if (action === "goto") await translator.executeBatch([{
|
|
466
|
+
type: "goto",
|
|
467
|
+
url: params.url ?? ""
|
|
508
468
|
}]);
|
|
469
|
+
else await translator.executeBatch([{ type: action }]);
|
|
509
470
|
const screenshot = await translator.screenshot();
|
|
510
471
|
return {
|
|
511
472
|
content: [{
|
|
@@ -561,14 +522,10 @@ var CuaRuntimeController = class {
|
|
|
561
522
|
this.translator = this.createTranslator();
|
|
562
523
|
}
|
|
563
524
|
tools() {
|
|
564
|
-
return [...
|
|
565
|
-
browser: this.options.browser,
|
|
566
|
-
client: this.options.client,
|
|
525
|
+
return [...buildCuaComputerTools({
|
|
567
526
|
toolExecutors: this.runtimeSpec.toolExecutors,
|
|
568
|
-
coordinateSystem: this.runtimeSpec.coordinateSystem,
|
|
569
|
-
screenshot: this.runtimeSpec.screenshot,
|
|
570
527
|
computerUseExtra: this.options.computerUseExtra
|
|
571
|
-
}), ...this.options.extraTools ?? []];
|
|
528
|
+
}, this.translator), ...this.options.extraTools ?? []];
|
|
572
529
|
}
|
|
573
530
|
onPayload() {
|
|
574
531
|
const runtimeSpec = this.runtimeSpec;
|
|
@@ -605,7 +562,9 @@ async function getCuaEnvApiKeyAndHeaders(model) {
|
|
|
605
562
|
var CuaAgent = class extends Agent {
|
|
606
563
|
runtime;
|
|
607
564
|
ownsSystemPrompt;
|
|
565
|
+
runtimeDirty = false;
|
|
608
566
|
stateProxy;
|
|
567
|
+
stateProxyTarget;
|
|
609
568
|
constructor(options) {
|
|
610
569
|
const { browser, client, initialState, onPayload, streamFn, prepareNextTurn, extraTools, computerUseExtra, ...agentOptions } = options;
|
|
611
570
|
const runtime = new CuaRuntimeController({
|
|
@@ -638,13 +597,17 @@ var CuaAgent = class extends Agent {
|
|
|
638
597
|
this.runtime = runtime;
|
|
639
598
|
this.ownsSystemPrompt = initialState.systemPrompt === void 0;
|
|
640
599
|
/**
|
|
641
|
-
* pi
|
|
642
|
-
*
|
|
643
|
-
*
|
|
600
|
+
* pi's loop only re-reads model/tools/prompt between provider requests
|
|
601
|
+
* through `prepareNextTurn`. The wrapper stays pass-through (returning
|
|
602
|
+
* `undefined`, i.e. stock pi behavior) until either the user hook returns
|
|
603
|
+
* an update or a mid-run model assignment marks the CUA runtime dirty —
|
|
604
|
+
* only then is a turn update built from current state.
|
|
644
605
|
*/
|
|
645
606
|
this.prepareNextTurn = async (signal) => {
|
|
646
607
|
const update = await prepareNextTurn?.(signal);
|
|
647
608
|
if (update?.model) this.applyRuntime(update.model);
|
|
609
|
+
if (!update && !this.runtimeDirty) return void 0;
|
|
610
|
+
this.runtimeDirty = false;
|
|
648
611
|
const state = super.state;
|
|
649
612
|
const context = update?.context ?? {
|
|
650
613
|
systemPrompt: state.systemPrompt,
|
|
@@ -668,17 +631,22 @@ var CuaAgent = class extends Agent {
|
|
|
668
631
|
* and payload hooks for the selected provider.
|
|
669
632
|
*/
|
|
670
633
|
get state() {
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
|
|
677
|
-
|
|
634
|
+
const target = super.state;
|
|
635
|
+
if (!this.stateProxy || this.stateProxyTarget !== target) {
|
|
636
|
+
this.stateProxyTarget = target;
|
|
637
|
+
this.stateProxy = new Proxy(target, { set: (proxied, prop, value, receiver) => {
|
|
638
|
+
if (prop === "model") {
|
|
639
|
+
this.applyRuntime(value);
|
|
640
|
+
return true;
|
|
641
|
+
}
|
|
642
|
+
return Reflect.set(proxied, prop, value, receiver);
|
|
643
|
+
} });
|
|
644
|
+
}
|
|
678
645
|
return this.stateProxy;
|
|
679
646
|
}
|
|
680
647
|
applyRuntime(model) {
|
|
681
648
|
this.runtime.setModel(model);
|
|
649
|
+
this.runtimeDirty = true;
|
|
682
650
|
const state = super.state;
|
|
683
651
|
state.model = this.runtime.model;
|
|
684
652
|
state.tools = this.runtime.tools();
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@onkernel/cua-agent",
|
|
3
|
-
"version": "0.3.2",
|
|
3
|
+
"version": "0.3.3",
|
|
4
4
|
"description": "Kernel browser computer-use Agent and AgentHarness classes built on pi-agent-core",
|
|
5
5
|
"license": "MIT",
|
|
6
6
|
"type": "module",
|
|
@@ -42,7 +42,7 @@
|
|
|
42
42
|
"dependencies": {
|
|
43
43
|
"@earendil-works/pi-agent-core": "0.79.1",
|
|
44
44
|
"@earendil-works/pi-ai": "0.79.1",
|
|
45
|
-
"@onkernel/cua-ai": "0.2.2",
|
|
45
|
+
"@onkernel/cua-ai": "0.3.0",
|
|
46
46
|
"@onkernel/sdk": "0.49.0",
|
|
47
47
|
"sharp": "^0.34.5"
|
|
48
48
|
},
|