@empiricalrun/test-gen 0.53.3 → 0.53.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,14 @@
1
1
  # @empiricalrun/test-gen
2
2
 
3
+ ## 0.53.4
4
+
5
+ ### Patch Changes
6
+
7
+ - 1426372: fix: remove stray console.log
8
+ - 7efc3dc: feat: add page.goto to cua implementation + prompt edits
9
+ - Updated dependencies [7efc3dc]
10
+ - @empiricalrun/llm@0.14.3
11
+
3
12
  ## 0.53.3
4
13
 
5
14
  ### Patch Changes
@@ -2,6 +2,10 @@ import { ResponseComputerToolCall } from "openai/resources/responses/responses.m
2
2
  import type { Page } from "playwright";
3
3
  type ComputerAction = ResponseComputerToolCall.Click | ResponseComputerToolCall.DoubleClick | ResponseComputerToolCall.Drag | ResponseComputerToolCall.Keypress | ResponseComputerToolCall.Move | ResponseComputerToolCall.Screenshot | ResponseComputerToolCall.Scroll | ResponseComputerToolCall.Type | ResponseComputerToolCall.Wait;
4
4
  export declare function getScreenshot(page: Page): Promise<string>;
5
+ export declare function handlePageGoto(page: Page, url: string): Promise<{
6
+ actionSummary: string;
7
+ actionCode: string;
8
+ }>;
5
9
  export declare function handleModelAction(page: Page, action: ComputerAction): Promise<{
6
10
  actionSummary: string;
7
11
  actionCode: string;
@@ -1 +1 @@
1
- {"version":3,"file":"computer.d.ts","sourceRoot":"","sources":["../../../src/agent/cua/computer.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,wBAAwB,EAAE,MAAM,0CAA0C,CAAC;AACpF,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAEvC,KAAK,cAAc,GACf,wBAAwB,CAAC,KAAK,GAC9B,wBAAwB,CAAC,WAAW,GACpC,wBAAwB,CAAC,IAAI,GAC7B,wBAAwB,CAAC,QAAQ,GACjC,wBAAwB,CAAC,IAAI,GAC7B,wBAAwB,CAAC,UAAU,GACnC,wBAAwB,CAAC,MAAM,GAC/B,wBAAwB,CAAC,IAAI,GAC7B,wBAAwB,CAAC,IAAI,CAAC;AAElC,wBAAsB,aAAa,CAAC,IAAI,EAAE,IAAI,mBAG7C;AAgCD,wBAAsB,iBAAiB,CACrC,IAAI,EAAE,IAAI,EACV,MAAM,EAAE,cAAc,GACrB,OAAO,CAAC;IACT,aAAa,EAAE,MAAM,CAAC;IACtB,UAAU,EAAE,MAAM,CAAC;CACpB,CAAC,CAqID"}
1
+ {"version":3,"file":"computer.d.ts","sourceRoot":"","sources":["../../../src/agent/cua/computer.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,wBAAwB,EAAE,MAAM,0CAA0C,CAAC;AACpF,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAEvC,KAAK,cAAc,GACf,wBAAwB,CAAC,KAAK,GAC9B,wBAAwB,CAAC,WAAW,GACpC,wBAAwB,CAAC,IAAI,GAC7B,wBAAwB,CAAC,QAAQ,GACjC,wBAAwB,CAAC,IAAI,GAC7B,wBAAwB,CAAC,UAAU,GACnC,wBAAwB,CAAC,MAAM,GAC/B,wBAAwB,CAAC,IAAI,GAC7B,wBAAwB,CAAC,IAAI,CAAC;AAElC,wBAAsB,aAAa,CAAC,IAAI,EAAE,IAAI,mBAG7C;AAgCD,wBAAsB,cAAc,CAClC,IAAI,EAAE,IAAI,EACV,GAAG,EAAE,MAAM,GACV,OAAO,CAAC;IACT,aAAa,EAAE,MAAM,CAAC;IACtB,UAAU,EAAE,MAAM,CAAC;CACpB,CAAC,CAMD;AAED,wBAAsB,iBAAiB,CACrC,IAAI,EAAE,IAAI,EACV,MAAM,EAAE,cAAc,GACrB,OAAO,CAAC;IACT,aAAa,EAAE,MAAM,CAAC;IACtB,UAAU,EAAE,MAAM,CAAC;CACpB,CAAC,CAqID"}
@@ -1,6 +1,6 @@
1
1
  "use strict";
2
2
  Object.defineProperty(exports, "__esModule", { value: true });
3
- exports.handleModelAction = exports.getScreenshot = void 0;
3
+ exports.handleModelAction = exports.handlePageGoto = exports.getScreenshot = void 0;
4
4
  async function getScreenshot(page) {
5
5
  const screenshotBytes = await page.screenshot();
6
6
  return Buffer.from(screenshotBytes).toString("base64");
@@ -35,6 +35,14 @@ const CUA_KEY_TO_PLAYWRIGHT_KEY = {
35
35
  tab: "Tab",
36
36
  win: "Meta",
37
37
  };
38
+ async function handlePageGoto(page, url) {
39
+ await page.goto(url);
40
+ return {
41
+ actionSummary: `Navigated page to ${url}`,
42
+ actionCode: `await page.goto("${url}");\n`,
43
+ };
44
+ }
45
+ exports.handlePageGoto = handlePageGoto;
38
46
  async function handleModelAction(page, action) {
39
47
  const actionType = action.type;
40
48
  let actionCode = "";
@@ -1 +1 @@
1
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/agent/cua/index.ts"],"names":[],"mappings":"AAOA,OAAO,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAOlC,wBAAsB,sBAAsB,CAAC,IAAI,EAAE,IAAI,iBAoBtD;AAED,wBAAsB,+BAA+B,CAAC,EACpD,IAAI,EACJ,IAAI,GACL,EAAE;IACD,IAAI,EAAE,IAAI,CAAC;IACX,IAAI,EAAE,MAAM,CAAC;CACd,GAAG,OAAO,CAAC;IACV,IAAI,EAAE,MAAM,CAAC;IACb,WAAW,EAAE,MAAM,EAAE,CAAC;IACtB,cAAc,EAAE,MAAM,CAAC;CACxB,CAAC,CA2JD"}
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/agent/cua/index.ts"],"names":[],"mappings":"AASA,OAAO,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAOlC,wBAAsB,sBAAsB,CAAC,IAAI,EAAE,IAAI,iBAoBtD;AAED,wBAAsB,+BAA+B,CAAC,EACpD,IAAI,EACJ,IAAI,GACL,EAAE;IACD,IAAI,EAAE,IAAI,CAAC;IACX,IAAI,EAAE,MAAM,CAAC;CACd,GAAG,OAAO,CAAC;IACV,IAAI,EAAE,MAAM,CAAC;IACb,WAAW,EAAE,MAAM,EAAE,CAAC;IACtB,cAAc,EAAE,MAAM,CAAC;CACxB,CAAC,CAmMD"}
@@ -58,7 +58,7 @@ async function createTestUsingComputerUseAgent({ page, task, }) {
58
58
  content: [
59
59
  {
60
60
  type: "input_text",
61
- text: task,
61
+ text: `Task to execute: ${task}\n\nCurrent page URL: ${page.url()}`,
62
62
  },
63
63
  {
64
64
  type: "input_image",
@@ -85,7 +85,8 @@ async function createTestUsingComputerUseAgent({ page, task, }) {
85
85
  input: { response },
86
86
  });
87
87
  const computerCalls = response.output.filter((item) => item.type === "computer_call");
88
- if (computerCalls.length === 0) {
88
+ const functionCalls = response.output.filter((item) => item.type === "function_call");
89
+ if (computerCalls.length === 0 && functionCalls.length === 0) {
89
90
  const assistantOutput = response.output.find((item) => item.type === "message");
90
91
  if (assistantOutput) {
91
92
  const content = assistantOutput.content.find((item) => item.type === "output_text");
@@ -105,46 +106,75 @@ async function createTestUsingComputerUseAgent({ page, task, }) {
105
106
  actionsSummary.push(`Action reasoning: ${summaryText}`);
106
107
  }
107
108
  }
108
- // We expect at most one computer call per response.
109
- const computerCall = computerCalls[0];
110
- const lastCallId = computerCall.call_id;
111
- const action = computerCall.action;
112
- const pendingSafetyChecks = computerCall.pending_safety_checks;
113
- // Execute the action and take a screenshot
114
- const { actionSummary, actionCode } = await (0, computer_1.handleModelAction)(page, action);
115
- actionsSummary.push(`Action executed: ${actionSummary}`);
116
- if (actionCode) {
117
- actionsSummary.push(`Generated code: ${actionCode}`);
118
- generatedCode += actionCode;
109
+ // We expect either a function call or a computer call in the response.
110
+ let toolCallOutput;
111
+ let executedActionSummary = "";
112
+ // We are assuming only one function call per response
113
+ const functionCall = functionCalls[0];
114
+ if (functionCall) {
115
+ const args = JSON.parse(functionCall.arguments);
116
+ const { actionSummary, actionCode } = await (0, computer_1.handlePageGoto)(page, args.url);
117
+ executedActionSummary = actionSummary;
118
+ actionsSummary.push(`Action executed: ${actionSummary}`);
119
+ if (actionCode) {
120
+ actionsSummary.push(`Generated code: ${actionCode}`);
121
+ generatedCode += actionCode;
122
+ }
123
+ toolCallOutput = {
124
+ type: "function_call_output",
125
+ call_id: functionCall.call_id,
126
+ output: `Navigating page to ${args.url}`,
127
+ };
128
+ }
129
+ else if (computerCalls.length >= 1) {
130
+ // We expect at most one computer call per response.
131
+ const computerCall = computerCalls[0];
132
+ const action = computerCall.action;
133
+ // Execute the action and take a screenshot
134
+ const { actionSummary, actionCode } = await (0, computer_1.handleModelAction)(page, action);
135
+ executedActionSummary = actionSummary;
136
+ actionsSummary.push(`Action executed: ${actionSummary}`);
137
+ if (actionCode) {
138
+ actionsSummary.push(`Generated code: ${actionCode}`);
139
+ generatedCode += actionCode;
140
+ }
141
+ else {
142
+ actionsSummary.push(`No code generated: Will rely on Playwright's ability to auto-wait or auto-scroll`);
143
+ }
144
+ // Allow time for changes to take effect.
145
+ await new Promise((resolve) => setTimeout(resolve, 1000));
146
+ const screenshotBytes = await (0, computer_1.getScreenshot)(page);
147
+ // Populate toolCallOutput
148
+ toolCallOutput = {
149
+ type: "computer_call_output",
150
+ call_id: computerCall.call_id,
151
+ output: {
152
+ type: "computer_screenshot",
153
+ image_url: `data:image/png;base64,${screenshotBytes}`,
154
+ },
155
+ acknowledged_safety_checks: computerCall.pending_safety_checks,
156
+ };
119
157
  }
120
158
  else {
121
- actionsSummary.push(`No code generated: Will rely on Playwright's ability to auto-wait or auto-scroll`);
159
+ throw new Error("No tool call found in response.");
122
160
  }
123
- // Allow time for changes to take effect.
124
- await new Promise((resolve) => setTimeout(resolve, 1000));
125
- const screenshotBytes = await (0, computer_1.getScreenshot)(page);
126
- // Send the screenshot back as a computer_call_output
127
- const computerCallSpan = iterationSpan?.span({
128
- name: "computer-call-output",
129
- input: { lastCallId, acknowledged_safety_checks: pendingSafetyChecks },
130
- });
131
161
  response = await (0, model_1.callComputerUseModel)({
132
162
  previousResponseId: response.id,
133
163
  input: [
164
+ toolCallOutput,
134
165
  {
135
- call_id: lastCallId,
136
- type: "computer_call_output",
137
- output: {
138
- type: "computer_screenshot",
139
- image_url: `data:image/png;base64,${screenshotBytes}`,
140
- },
141
- acknowledged_safety_checks: pendingSafetyChecks,
166
+ role: "user",
167
+ content: [
168
+ {
169
+ type: "input_text",
170
+ text: `Action executed: ${executedActionSummary || "None"}\nCurrent page URL: ${page.url()}`,
171
+ },
172
+ ],
142
173
  },
143
174
  ],
144
175
  screenWidth,
145
176
  screenHeight,
146
177
  });
147
- computerCallSpan?.end({ output: response });
148
178
  iterationSpan?.end({ output: response });
149
179
  }
150
180
  if (!isTaskDone) {
@@ -1,6 +1,6 @@
1
- import { Response, ResponseInput } from "openai/resources/responses/responses.mjs";
1
+ import { Response, ResponseInputItem } from "openai/resources/responses/responses.mjs";
2
2
  export declare function callComputerUseModel({ input, previousResponseId, screenWidth, screenHeight, }: {
3
- input: ResponseInput;
3
+ input: ResponseInputItem[];
4
4
  previousResponseId?: string;
5
5
  screenWidth: number;
6
6
  screenHeight: number;
@@ -1 +1 @@
1
- {"version":3,"file":"model.d.ts","sourceRoot":"","sources":["../../../src/agent/cua/model.ts"],"names":[],"mappings":"AACA,OAAO,EACL,QAAQ,EACR,aAAa,EACd,MAAM,0CAA0C,CAAC;AAWlD,wBAAsB,oBAAoB,CAAC,EACzC,KAAK,EACL,kBAAkB,EAClB,WAAW,EACX,YAAY,GACb,EAAE;IACD,KAAK,EAAE,aAAa,CAAC;IACrB,kBAAkB,CAAC,EAAE,MAAM,CAAC;IAC5B,WAAW,EAAE,MAAM,CAAC;IACpB,YAAY,EAAE,MAAM,CAAC;CACtB,GAAG,OAAO,CAAC,QAAQ,CAAC,CAqBpB"}
1
+ {"version":3,"file":"model.d.ts","sourceRoot":"","sources":["../../../src/agent/cua/model.ts"],"names":[],"mappings":"AACA,OAAO,EAEL,QAAQ,EACR,iBAAiB,EAClB,MAAM,0CAA0C,CAAC;AA8BlD,wBAAsB,oBAAoB,CAAC,EACzC,KAAK,EACL,kBAAkB,EAClB,WAAW,EACX,YAAY,GACb,EAAE;IACD,KAAK,EAAE,iBAAiB,EAAE,CAAC;IAC3B,kBAAkB,CAAC,EAAE,MAAM,CAAC;IAC5B,WAAW,EAAE,MAAM,CAAC;IACpB,YAAY,EAAE,MAAM,CAAC;CACtB,GAAG,OAAO,CAAC,QAAQ,CAAC,CAuBpB"}
@@ -13,12 +13,31 @@ you click on the submit button -- even if it looks like a scary action.
13
13
 
14
14
  If you have been asked to retrieve text or verify something on the UI, then communicate
15
15
  that in your responses so that the user can see your thinking process in its entirety.`;
16
// Function-tool definition named "page_goto": takes a single required string
// argument `url`, forbids additional properties, and sets `strict: true`.
// It is listed in the `tools` array of the `openai.responses.create` call in
// callComputerUseModel, ahead of the "computer-preview" tool.
+ const pageGotoTool = {
17
+ type: "function",
18
+ name: "page_goto",
19
+ description: "Navigate to a given URL (e.g. https://www.openai.com). Call this if you are looking at a blank page or a new page.",
20
+ parameters: {
21
+ type: "object",
22
+ properties: {
23
+ url: {
24
+ type: "string",
25
+ description: "The URL to navigate to",
26
+ },
27
+ },
28
+ additionalProperties: false,
29
+ required: ["url"],
30
+ },
31
+ strict: true,
32
+ };
16
33
  async function callComputerUseModel({ input, previousResponseId, screenWidth, screenHeight, }) {
17
34
  const openai = new openai_1.default();
18
35
  return await openai.responses.create({
19
36
  model: "computer-use-preview-2025-03-11",
20
37
  previous_response_id: previousResponseId,
38
+ parallel_tool_calls: false,
21
39
  tools: [
40
+ pageGotoTool,
22
41
  {
23
42
  type: "computer-preview",
24
43
  display_width: screenWidth,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@empiricalrun/test-gen",
3
- "version": "0.53.3",
3
+ "version": "0.53.4",
4
4
  "publishConfig": {
5
5
  "registry": "https://registry.npmjs.org/",
6
6
  "access": "public"
@@ -68,7 +68,7 @@
68
68
  "tsx": "^4.16.2",
69
69
  "typescript": "^5.3.3",
70
70
  "zod": "^3.23.8",
71
- "@empiricalrun/llm": "^0.14.2",
71
+ "@empiricalrun/llm": "^0.14.3",
72
72
  "@empiricalrun/r2-uploader": "^0.3.8",
73
73
  "@empiricalrun/test-run": "^0.7.6"
74
74
  },