@empiricalrun/test-gen 0.53.3 → 0.53.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +9 -0
- package/dist/agent/cua/computer.d.ts +4 -0
- package/dist/agent/cua/computer.d.ts.map +1 -1
- package/dist/agent/cua/computer.js +9 -1
- package/dist/agent/cua/index.d.ts.map +1 -1
- package/dist/agent/cua/index.js +60 -30
- package/dist/agent/cua/model.d.ts +2 -2
- package/dist/agent/cua/model.d.ts.map +1 -1
- package/dist/agent/cua/model.js +19 -0
- package/package.json +2 -2
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,14 @@
|
|
|
1
1
|
# @empiricalrun/test-gen
|
|
2
2
|
|
|
3
|
+
## 0.53.4
|
|
4
|
+
|
|
5
|
+
### Patch Changes
|
|
6
|
+
|
|
7
|
+
- 1426372: fix: remove stray console.log
|
|
8
|
+
- 7efc3dc: feat: add page.goto to cua implementation + prompt edits
|
|
9
|
+
- Updated dependencies [7efc3dc]
|
|
10
|
+
- @empiricalrun/llm@0.14.3
|
|
11
|
+
|
|
3
12
|
## 0.53.3
|
|
4
13
|
|
|
5
14
|
### Patch Changes
|
|
@@ -2,6 +2,10 @@ import { ResponseComputerToolCall } from "openai/resources/responses/responses.m
|
|
|
2
2
|
import type { Page } from "playwright";
|
|
3
3
|
type ComputerAction = ResponseComputerToolCall.Click | ResponseComputerToolCall.DoubleClick | ResponseComputerToolCall.Drag | ResponseComputerToolCall.Keypress | ResponseComputerToolCall.Move | ResponseComputerToolCall.Screenshot | ResponseComputerToolCall.Scroll | ResponseComputerToolCall.Type | ResponseComputerToolCall.Wait;
|
|
4
4
|
export declare function getScreenshot(page: Page): Promise<string>;
|
|
5
|
+
export declare function handlePageGoto(page: Page, url: string): Promise<{
|
|
6
|
+
actionSummary: string;
|
|
7
|
+
actionCode: string;
|
|
8
|
+
}>;
|
|
5
9
|
export declare function handleModelAction(page: Page, action: ComputerAction): Promise<{
|
|
6
10
|
actionSummary: string;
|
|
7
11
|
actionCode: string;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"computer.d.ts","sourceRoot":"","sources":["../../../src/agent/cua/computer.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,wBAAwB,EAAE,MAAM,0CAA0C,CAAC;AACpF,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAEvC,KAAK,cAAc,GACf,wBAAwB,CAAC,KAAK,GAC9B,wBAAwB,CAAC,WAAW,GACpC,wBAAwB,CAAC,IAAI,GAC7B,wBAAwB,CAAC,QAAQ,GACjC,wBAAwB,CAAC,IAAI,GAC7B,wBAAwB,CAAC,UAAU,GACnC,wBAAwB,CAAC,MAAM,GAC/B,wBAAwB,CAAC,IAAI,GAC7B,wBAAwB,CAAC,IAAI,CAAC;AAElC,wBAAsB,aAAa,CAAC,IAAI,EAAE,IAAI,mBAG7C;AAgCD,wBAAsB,iBAAiB,CACrC,IAAI,EAAE,IAAI,EACV,MAAM,EAAE,cAAc,GACrB,OAAO,CAAC;IACT,aAAa,EAAE,MAAM,CAAC;IACtB,UAAU,EAAE,MAAM,CAAC;CACpB,CAAC,CAqID"}
|
|
1
|
+
{"version":3,"file":"computer.d.ts","sourceRoot":"","sources":["../../../src/agent/cua/computer.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,wBAAwB,EAAE,MAAM,0CAA0C,CAAC;AACpF,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAEvC,KAAK,cAAc,GACf,wBAAwB,CAAC,KAAK,GAC9B,wBAAwB,CAAC,WAAW,GACpC,wBAAwB,CAAC,IAAI,GAC7B,wBAAwB,CAAC,QAAQ,GACjC,wBAAwB,CAAC,IAAI,GAC7B,wBAAwB,CAAC,UAAU,GACnC,wBAAwB,CAAC,MAAM,GAC/B,wBAAwB,CAAC,IAAI,GAC7B,wBAAwB,CAAC,IAAI,CAAC;AAElC,wBAAsB,aAAa,CAAC,IAAI,EAAE,IAAI,mBAG7C;AAgCD,wBAAsB,cAAc,CAClC,IAAI,EAAE,IAAI,EACV,GAAG,EAAE,MAAM,GACV,OAAO,CAAC;IACT,aAAa,EAAE,MAAM,CAAC;IACtB,UAAU,EAAE,MAAM,CAAC;CACpB,CAAC,CAMD;AAED,wBAAsB,iBAAiB,CACrC,IAAI,EAAE,IAAI,EACV,MAAM,EAAE,cAAc,GACrB,OAAO,CAAC;IACT,aAAa,EAAE,MAAM,CAAC;IACtB,UAAU,EAAE,MAAM,CAAC;CACpB,CAAC,CAqID"}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
"use strict";
|
|
2
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
-
exports.handleModelAction = exports.getScreenshot = void 0;
|
|
3
|
+
exports.handleModelAction = exports.handlePageGoto = exports.getScreenshot = void 0;
|
|
4
4
|
async function getScreenshot(page) {
|
|
5
5
|
const screenshotBytes = await page.screenshot();
|
|
6
6
|
return Buffer.from(screenshotBytes).toString("base64");
|
|
@@ -35,6 +35,14 @@ const CUA_KEY_TO_PLAYWRIGHT_KEY = {
|
|
|
35
35
|
tab: "Tab",
|
|
36
36
|
win: "Meta",
|
|
37
37
|
};
|
|
38
|
+
async function handlePageGoto(page, url) {
|
|
39
|
+
await page.goto(url);
|
|
40
|
+
return {
|
|
41
|
+
actionSummary: `Navigated page to ${url}`,
|
|
42
|
+
actionCode: `await page.goto("${url}");\n`,
|
|
43
|
+
};
|
|
44
|
+
}
|
|
45
|
+
exports.handlePageGoto = handlePageGoto;
|
|
38
46
|
async function handleModelAction(page, action) {
|
|
39
47
|
const actionType = action.type;
|
|
40
48
|
let actionCode = "";
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/agent/cua/index.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/agent/cua/index.ts"],"names":[],"mappings":"AASA,OAAO,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAOlC,wBAAsB,sBAAsB,CAAC,IAAI,EAAE,IAAI,iBAoBtD;AAED,wBAAsB,+BAA+B,CAAC,EACpD,IAAI,EACJ,IAAI,GACL,EAAE;IACD,IAAI,EAAE,IAAI,CAAC;IACX,IAAI,EAAE,MAAM,CAAC;CACd,GAAG,OAAO,CAAC;IACV,IAAI,EAAE,MAAM,CAAC;IACb,WAAW,EAAE,MAAM,EAAE,CAAC;IACtB,cAAc,EAAE,MAAM,CAAC;CACxB,CAAC,CAmMD"}
|
package/dist/agent/cua/index.js
CHANGED
|
@@ -58,7 +58,7 @@ async function createTestUsingComputerUseAgent({ page, task, }) {
|
|
|
58
58
|
content: [
|
|
59
59
|
{
|
|
60
60
|
type: "input_text",
|
|
61
|
-
text: task
|
|
61
|
+
text: `Task to execute: ${task}\n\nCurrent page URL: ${page.url()}`,
|
|
62
62
|
},
|
|
63
63
|
{
|
|
64
64
|
type: "input_image",
|
|
@@ -85,7 +85,8 @@ async function createTestUsingComputerUseAgent({ page, task, }) {
|
|
|
85
85
|
input: { response },
|
|
86
86
|
});
|
|
87
87
|
const computerCalls = response.output.filter((item) => item.type === "computer_call");
|
|
88
|
-
|
|
88
|
+
const functionCalls = response.output.filter((item) => item.type === "function_call");
|
|
89
|
+
if (computerCalls.length === 0 && functionCalls.length === 0) {
|
|
89
90
|
const assistantOutput = response.output.find((item) => item.type === "message");
|
|
90
91
|
if (assistantOutput) {
|
|
91
92
|
const content = assistantOutput.content.find((item) => item.type === "output_text");
|
|
@@ -105,46 +106,75 @@ async function createTestUsingComputerUseAgent({ page, task, }) {
|
|
|
105
106
|
actionsSummary.push(`Action reasoning: ${summaryText}`);
|
|
106
107
|
}
|
|
107
108
|
}
|
|
108
|
-
// We expect
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
const
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
actionsSummary.push(`
|
|
118
|
-
|
|
109
|
+
// We expect either a function call or a computer call in the response.
|
|
110
|
+
let toolCallOutput;
|
|
111
|
+
let executedActionSummary = "";
|
|
112
|
+
// We are assuming only one function call per response
|
|
113
|
+
const functionCall = functionCalls[0];
|
|
114
|
+
if (functionCall) {
|
|
115
|
+
const args = JSON.parse(functionCall.arguments);
|
|
116
|
+
const { actionSummary, actionCode } = await (0, computer_1.handlePageGoto)(page, args.url);
|
|
117
|
+
executedActionSummary = actionSummary;
|
|
118
|
+
actionsSummary.push(`Action executed: ${actionSummary}`);
|
|
119
|
+
if (actionCode) {
|
|
120
|
+
actionsSummary.push(`Generated code: ${actionCode}`);
|
|
121
|
+
generatedCode += actionCode;
|
|
122
|
+
}
|
|
123
|
+
toolCallOutput = {
|
|
124
|
+
type: "function_call_output",
|
|
125
|
+
call_id: functionCall.call_id,
|
|
126
|
+
output: `Navigating page to ${args.url}`,
|
|
127
|
+
};
|
|
128
|
+
}
|
|
129
|
+
else if (computerCalls.length >= 1) {
|
|
130
|
+
// We expect at most one computer call per response.
|
|
131
|
+
const computerCall = computerCalls[0];
|
|
132
|
+
const action = computerCall.action;
|
|
133
|
+
// Execute the action and take a screenshot
|
|
134
|
+
const { actionSummary, actionCode } = await (0, computer_1.handleModelAction)(page, action);
|
|
135
|
+
executedActionSummary = actionSummary;
|
|
136
|
+
actionsSummary.push(`Action executed: ${actionSummary}`);
|
|
137
|
+
if (actionCode) {
|
|
138
|
+
actionsSummary.push(`Generated code: ${actionCode}`);
|
|
139
|
+
generatedCode += actionCode;
|
|
140
|
+
}
|
|
141
|
+
else {
|
|
142
|
+
actionsSummary.push(`No code generated: Will rely on Playwright's ability to auto-wait or auto-scroll`);
|
|
143
|
+
}
|
|
144
|
+
// Allow time for changes to take effect.
|
|
145
|
+
await new Promise((resolve) => setTimeout(resolve, 1000));
|
|
146
|
+
const screenshotBytes = await (0, computer_1.getScreenshot)(page);
|
|
147
|
+
// Populate toolCallOutput
|
|
148
|
+
toolCallOutput = {
|
|
149
|
+
type: "computer_call_output",
|
|
150
|
+
call_id: computerCall.call_id,
|
|
151
|
+
output: {
|
|
152
|
+
type: "computer_screenshot",
|
|
153
|
+
image_url: `data:image/png;base64,${screenshotBytes}`,
|
|
154
|
+
},
|
|
155
|
+
acknowledged_safety_checks: computerCall.pending_safety_checks,
|
|
156
|
+
};
|
|
119
157
|
}
|
|
120
158
|
else {
|
|
121
|
-
|
|
159
|
+
throw new Error("No tool call found in response.");
|
|
122
160
|
}
|
|
123
|
-
// Allow time for changes to take effect.
|
|
124
|
-
await new Promise((resolve) => setTimeout(resolve, 1000));
|
|
125
|
-
const screenshotBytes = await (0, computer_1.getScreenshot)(page);
|
|
126
|
-
// Send the screenshot back as a computer_call_output
|
|
127
|
-
const computerCallSpan = iterationSpan?.span({
|
|
128
|
-
name: "computer-call-output",
|
|
129
|
-
input: { lastCallId, acknowledged_safety_checks: pendingSafetyChecks },
|
|
130
|
-
});
|
|
131
161
|
response = await (0, model_1.callComputerUseModel)({
|
|
132
162
|
previousResponseId: response.id,
|
|
133
163
|
input: [
|
|
164
|
+
toolCallOutput,
|
|
134
165
|
{
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
166
|
+
role: "user",
|
|
167
|
+
content: [
|
|
168
|
+
{
|
|
169
|
+
type: "input_text",
|
|
170
|
+
text: `Action executed: ${executedActionSummary || "None"}\nCurrent page URL: ${page.url()}`,
|
|
171
|
+
},
|
|
172
|
+
],
|
|
142
173
|
},
|
|
143
174
|
],
|
|
144
175
|
screenWidth,
|
|
145
176
|
screenHeight,
|
|
146
177
|
});
|
|
147
|
-
computerCallSpan?.end({ output: response });
|
|
148
178
|
iterationSpan?.end({ output: response });
|
|
149
179
|
}
|
|
150
180
|
if (!isTaskDone) {
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
import { Response,
|
|
1
|
+
import { Response, ResponseInputItem } from "openai/resources/responses/responses.mjs";
|
|
2
2
|
export declare function callComputerUseModel({ input, previousResponseId, screenWidth, screenHeight, }: {
|
|
3
|
-
input:
|
|
3
|
+
input: ResponseInputItem[];
|
|
4
4
|
previousResponseId?: string;
|
|
5
5
|
screenWidth: number;
|
|
6
6
|
screenHeight: number;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"model.d.ts","sourceRoot":"","sources":["../../../src/agent/cua/model.ts"],"names":[],"mappings":"AACA,OAAO,
|
|
1
|
+
{"version":3,"file":"model.d.ts","sourceRoot":"","sources":["../../../src/agent/cua/model.ts"],"names":[],"mappings":"AACA,OAAO,EAEL,QAAQ,EACR,iBAAiB,EAClB,MAAM,0CAA0C,CAAC;AA8BlD,wBAAsB,oBAAoB,CAAC,EACzC,KAAK,EACL,kBAAkB,EAClB,WAAW,EACX,YAAY,GACb,EAAE;IACD,KAAK,EAAE,iBAAiB,EAAE,CAAC;IAC3B,kBAAkB,CAAC,EAAE,MAAM,CAAC;IAC5B,WAAW,EAAE,MAAM,CAAC;IACpB,YAAY,EAAE,MAAM,CAAC;CACtB,GAAG,OAAO,CAAC,QAAQ,CAAC,CAuBpB"}
|
package/dist/agent/cua/model.js
CHANGED
|
@@ -13,12 +13,31 @@ you click on the submit button -- even if it looks like a scary action.
|
|
|
13
13
|
|
|
14
14
|
If you have been asked to retrieve text or verify something on the UI, then communicate
|
|
15
15
|
that in your responses so that the user can see your thinking process in its entirety.`;
|
|
16
|
+
const pageGotoTool = {
|
|
17
|
+
type: "function",
|
|
18
|
+
name: "page_goto",
|
|
19
|
+
description: "Navigate to a given URL (e.g. https://www.openai.com). Call this if you are looking at a blank page or a new page.",
|
|
20
|
+
parameters: {
|
|
21
|
+
type: "object",
|
|
22
|
+
properties: {
|
|
23
|
+
url: {
|
|
24
|
+
type: "string",
|
|
25
|
+
description: "The URL to navigate to",
|
|
26
|
+
},
|
|
27
|
+
},
|
|
28
|
+
additionalProperties: false,
|
|
29
|
+
required: ["url"],
|
|
30
|
+
},
|
|
31
|
+
strict: true,
|
|
32
|
+
};
|
|
16
33
|
async function callComputerUseModel({ input, previousResponseId, screenWidth, screenHeight, }) {
|
|
17
34
|
const openai = new openai_1.default();
|
|
18
35
|
return await openai.responses.create({
|
|
19
36
|
model: "computer-use-preview-2025-03-11",
|
|
20
37
|
previous_response_id: previousResponseId,
|
|
38
|
+
parallel_tool_calls: false,
|
|
21
39
|
tools: [
|
|
40
|
+
pageGotoTool,
|
|
22
41
|
{
|
|
23
42
|
type: "computer-preview",
|
|
24
43
|
display_width: screenWidth,
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@empiricalrun/test-gen",
|
|
3
|
-
"version": "0.53.
|
|
3
|
+
"version": "0.53.4",
|
|
4
4
|
"publishConfig": {
|
|
5
5
|
"registry": "https://registry.npmjs.org/",
|
|
6
6
|
"access": "public"
|
|
@@ -68,7 +68,7 @@
|
|
|
68
68
|
"tsx": "^4.16.2",
|
|
69
69
|
"typescript": "^5.3.3",
|
|
70
70
|
"zod": "^3.23.8",
|
|
71
|
-
"@empiricalrun/llm": "^0.14.
|
|
71
|
+
"@empiricalrun/llm": "^0.14.3",
|
|
72
72
|
"@empiricalrun/r2-uploader": "^0.3.8",
|
|
73
73
|
"@empiricalrun/test-run": "^0.7.6"
|
|
74
74
|
},
|