npm - @empiricalrun/test-gen - Versions diffs - 0.53.3 → 0.53.4 - Mend

@empiricalrun/test-gen 0.53.3 → 0.53.4

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (10)

package/CHANGELOG.md +9 -0
package/dist/agent/cua/computer.d.ts +4 -0
package/dist/agent/cua/computer.d.ts.map +1 -1
package/dist/agent/cua/computer.js +9 -1
package/dist/agent/cua/index.d.ts.map +1 -1
package/dist/agent/cua/index.js +60 -30
package/dist/agent/cua/model.d.ts +2 -2
package/dist/agent/cua/model.d.ts.map +1 -1
package/dist/agent/cua/model.js +19 -0
package/package.json +2 -2

package/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,14 @@
 # @empiricalrun/test-gen
+## 0.53.4
+### Patch Changes
+- 1426372: fix: remove stray console.log
+- 7efc3dc: feat: add page.goto to cua implementation + prompt edits
+- Updated dependencies [7efc3dc]
+  - @empiricalrun/llm@0.14.3
 ## 0.53.3
 ### Patch Changes

package/dist/agent/cua/computer.d.ts CHANGED Viewed

@@ -2,6 +2,10 @@ import { ResponseComputerToolCall } from "openai/resources/responses/responses.m
 import type { Page } from "playwright";
 type ComputerAction = ResponseComputerToolCall.Click | ResponseComputerToolCall.DoubleClick | ResponseComputerToolCall.Drag | ResponseComputerToolCall.Keypress | ResponseComputerToolCall.Move | ResponseComputerToolCall.Screenshot | ResponseComputerToolCall.Scroll | ResponseComputerToolCall.Type | ResponseComputerToolCall.Wait;
 export declare function getScreenshot(page: Page): Promise<string>;
+export declare function handlePageGoto(page: Page, url: string): Promise<{
+    actionSummary: string;
+    actionCode: string;
+}>;
 export declare function handleModelAction(page: Page, action: ComputerAction): Promise<{
     actionSummary: string;
     actionCode: string;

package/dist/agent/cua/computer.d.ts.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"file":"computer.d.ts","sourceRoot":"","sources":["../../../src/agent/cua/computer.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,wBAAwB,EAAE,MAAM,0CAA0C,CAAC;AACpF,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAEvC,KAAK,cAAc,GACf,wBAAwB,CAAC,KAAK,GAC9B,wBAAwB,CAAC,WAAW,GACpC,wBAAwB,CAAC,IAAI,GAC7B,wBAAwB,CAAC,QAAQ,GACjC,wBAAwB,CAAC,IAAI,GAC7B,wBAAwB,CAAC,UAAU,GACnC,wBAAwB,CAAC,MAAM,GAC/B,wBAAwB,CAAC,IAAI,GAC7B,wBAAwB,CAAC,IAAI,CAAC;AAElC,wBAAsB,aAAa,CAAC,IAAI,EAAE,IAAI,mBAG7C;AAgCD,wBAAsB,iBAAiB,CACrC,IAAI,EAAE,IAAI,EACV,MAAM,EAAE,cAAc,GACrB,OAAO,CAAC;IACT,aAAa,EAAE,MAAM,CAAC;IACtB,UAAU,EAAE,MAAM,CAAC;CACpB,CAAC,CAqID"}
1	+ {"version":3,"file":"computer.d.ts","sourceRoot":"","sources":["../../../src/agent/cua/computer.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,wBAAwB,EAAE,MAAM,0CAA0C,CAAC;AACpF,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAEvC,KAAK,cAAc,GACf,wBAAwB,CAAC,KAAK,GAC9B,wBAAwB,CAAC,WAAW,GACpC,wBAAwB,CAAC,IAAI,GAC7B,wBAAwB,CAAC,QAAQ,GACjC,wBAAwB,CAAC,IAAI,GAC7B,wBAAwB,CAAC,UAAU,GACnC,wBAAwB,CAAC,MAAM,GAC/B,wBAAwB,CAAC,IAAI,GAC7B,wBAAwB,CAAC,IAAI,CAAC;AAElC,wBAAsB,aAAa,CAAC,IAAI,EAAE,IAAI,mBAG7C;AAgCD,wBAAsB,cAAc,CAClC,IAAI,EAAE,IAAI,EACV,GAAG,EAAE,MAAM,GACV,OAAO,CAAC;IACT,aAAa,EAAE,MAAM,CAAC;IACtB,UAAU,EAAE,MAAM,CAAC;CACpB,CAAC,CAMD;AAED,wBAAsB,iBAAiB,CACrC,IAAI,EAAE,IAAI,EACV,MAAM,EAAE,cAAc,GACrB,OAAO,CAAC;IACT,aAAa,EAAE,MAAM,CAAC;IACtB,UAAU,EAAE,MAAM,CAAC;CACpB,CAAC,CAqID"}

package/dist/agent/cua/computer.js CHANGED Viewed

@@ -1,6 +1,6 @@
 "use strict";
 Object.defineProperty(exports, "__esModule", { value: true });
-exports.handleModelAction = exports.getScreenshot = void 0;
+exports.handleModelAction = exports.handlePageGoto = exports.getScreenshot = void 0;
 async function getScreenshot(page) {
     const screenshotBytes = await page.screenshot();
     return Buffer.from(screenshotBytes).toString("base64");
@@ -35,6 +35,14 @@ const CUA_KEY_TO_PLAYWRIGHT_KEY = {
     tab: "Tab",
     win: "Meta",
 };
+async function handlePageGoto(page, url) {
+    await page.goto(url);
+    return {
+        actionSummary: `Navigated page to ${url}`,
+        actionCode: `await page.goto("${url}");\n`,
+    };
+}
+exports.handlePageGoto = handlePageGoto;
 async function handleModelAction(page, action) {
     const actionType = action.type;
     let actionCode = "";

package/dist/agent/cua/index.d.ts.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/agent/cua/index.ts"],"names":[],"mappings":"AAOA,OAAO,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAOlC,wBAAsB,sBAAsB,CAAC,IAAI,EAAE,IAAI,iBAoBtD;AAED,wBAAsB,+BAA+B,CAAC,EACpD,IAAI,EACJ,IAAI,GACL,EAAE;IACD,IAAI,EAAE,IAAI,CAAC;IACX,IAAI,EAAE,MAAM,CAAC;CACd,GAAG,OAAO,CAAC;IACV,IAAI,EAAE,MAAM,CAAC;IACb,WAAW,EAAE,MAAM,EAAE,CAAC;IACtB,cAAc,EAAE,MAAM,CAAC;CACxB,CAAC,CA2JD"}
1	+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/agent/cua/index.ts"],"names":[],"mappings":"AASA,OAAO,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAOlC,wBAAsB,sBAAsB,CAAC,IAAI,EAAE,IAAI,iBAoBtD;AAED,wBAAsB,+BAA+B,CAAC,EACpD,IAAI,EACJ,IAAI,GACL,EAAE;IACD,IAAI,EAAE,IAAI,CAAC;IACX,IAAI,EAAE,MAAM,CAAC;CACd,GAAG,OAAO,CAAC;IACV,IAAI,EAAE,MAAM,CAAC;IACb,WAAW,EAAE,MAAM,EAAE,CAAC;IACtB,cAAc,EAAE,MAAM,CAAC;CACxB,CAAC,CAmMD"}

package/dist/agent/cua/index.js CHANGED Viewed

@@ -58,7 +58,7 @@ async function createTestUsingComputerUseAgent({ page, task, }) {
                 content: [
                     {
                         type: "input_text",
-                        text: task,
+                        text: `Task to execute: ${task}\n\nCurrent page URL: ${page.url()}`,
                     },
                     {
                         type: "input_image",
@@ -85,7 +85,8 @@ async function createTestUsingComputerUseAgent({ page, task, }) {
             input: { response },
         });
         const computerCalls = response.output.filter((item) => item.type === "computer_call");
-        if (computerCalls.length === 0) {
+        const functionCalls = response.output.filter((item) => item.type === "function_call");
+        if (computerCalls.length === 0 && functionCalls.length === 0) {
             const assistantOutput = response.output.find((item) => item.type === "message");
             if (assistantOutput) {
                 const content = assistantOutput.content.find((item) => item.type === "output_text");
@@ -105,46 +106,75 @@ async function createTestUsingComputerUseAgent({ page, task, }) {
                 actionsSummary.push(`Action reasoning: ${summaryText}`);
             }
         }
-        // We expect at most one computer call per response.
-        const computerCall = computerCalls[0];
-        const lastCallId = computerCall.call_id;
-        const action = computerCall.action;
-        const pendingSafetyChecks = computerCall.pending_safety_checks;
-        // Execute the action and take a screenshot
-        const { actionSummary, actionCode } = await (0, computer_1.handleModelAction)(page, action);
-        actionsSummary.push(`Action executed: ${actionSummary}`);
-        if (actionCode) {
-            actionsSummary.push(`Generated code: ${actionCode}`);
-            generatedCode += actionCode;
+        // We expect either a function call or a computer call in the response.
+        let toolCallOutput;
+        let executedActionSummary = "";
+        // We are assuming only one function call per response
+        const functionCall = functionCalls[0];
+        if (functionCall) {
+            const args = JSON.parse(functionCall.arguments);
+            const { actionSummary, actionCode } = await (0, computer_1.handlePageGoto)(page, args.url);
+            executedActionSummary = actionSummary;
+            actionsSummary.push(`Action executed: ${actionSummary}`);
+            if (actionCode) {
+                actionsSummary.push(`Generated code: ${actionCode}`);
+                generatedCode += actionCode;
+            }
+            toolCallOutput = {
+                type: "function_call_output",
+                call_id: functionCall.call_id,
+                output: `Navigating page to ${args.url}`,
+            };
+        }
+        else if (computerCalls.length >= 1) {
+            // We expect at most one computer call per response.
+            const computerCall = computerCalls[0];
+            const action = computerCall.action;
+            // Execute the action and take a screenshot
+            const { actionSummary, actionCode } = await (0, computer_1.handleModelAction)(page, action);
+            executedActionSummary = actionSummary;
+            actionsSummary.push(`Action executed: ${actionSummary}`);
+            if (actionCode) {
+                actionsSummary.push(`Generated code: ${actionCode}`);
+                generatedCode += actionCode;
+            }
+            else {
+                actionsSummary.push(`No code generated: Will rely on Playwright's ability to auto-wait or auto-scroll`);
+            }
+            // Allow time for changes to take effect.
+            await new Promise((resolve) => setTimeout(resolve, 1000));
+            const screenshotBytes = await (0, computer_1.getScreenshot)(page);
+            // Populate toolCallOutput
+            toolCallOutput = {
+                type: "computer_call_output",
+                call_id: computerCall.call_id,
+                output: {
+                    type: "computer_screenshot",
+                    image_url: `data:image/png;base64,${screenshotBytes}`,
+                },
+                acknowledged_safety_checks: computerCall.pending_safety_checks,
+            };
         }
         else {
-            actionsSummary.push(`No code generated: Will rely on Playwright's ability to auto-wait or auto-scroll`);
+            throw new Error("No tool call found in response.");
         }
-        // Allow time for changes to take effect.
-        await new Promise((resolve) => setTimeout(resolve, 1000));
-        const screenshotBytes = await (0, computer_1.getScreenshot)(page);
-        // Send the screenshot back as a computer_call_output
-        const computerCallSpan = iterationSpan?.span({
-            name: "computer-call-output",
-            input: { lastCallId, acknowledged_safety_checks: pendingSafetyChecks },
-        });
         response = await (0, model_1.callComputerUseModel)({
             previousResponseId: response.id,
             input: [
+                toolCallOutput,
                 {
-                    call_id: lastCallId,
-                    type: "computer_call_output",
-                    output: {
-                        type: "computer_screenshot",
-                        image_url: `data:image/png;base64,${screenshotBytes}`,
-                    },
-                    acknowledged_safety_checks: pendingSafetyChecks,
+                    role: "user",
+                    content: [
+                        {
+                            type: "input_text",
+                            text: `Action executed: ${executedActionSummary || "None"}\nCurrent page URL: ${page.url()}`,
+                        },
+                    ],
                 },
             ],
             screenWidth,
             screenHeight,
         });
-        computerCallSpan?.end({ output: response });
         iterationSpan?.end({ output: response });
     }
     if (!isTaskDone) {

package/dist/agent/cua/model.d.ts CHANGED Viewed

@@ -1,6 +1,6 @@
-import { Response, ResponseInput } from "openai/resources/responses/responses.mjs";
+import { Response, ResponseInputItem } from "openai/resources/responses/responses.mjs";
 export declare function callComputerUseModel({ input, previousResponseId, screenWidth, screenHeight, }: {
-    input: ResponseInput;
+    input: ResponseInputItem[];
     previousResponseId?: string;
     screenWidth: number;
     screenHeight: number;

package/dist/agent/cua/model.d.ts.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"file":"model.d.ts","sourceRoot":"","sources":["../../../src/agent/cua/model.ts"],"names":[],"mappings":"AACA,OAAO,EACL,QAAQ,EACR,aAAa,EACd,MAAM,0CAA0C,CAAC;AAWlD,wBAAsB,oBAAoB,CAAC,EACzC,KAAK,EACL,kBAAkB,EAClB,WAAW,EACX,YAAY,GACb,EAAE;IACD,KAAK,EAAE,aAAa,CAAC;IACrB,kBAAkB,CAAC,EAAE,MAAM,CAAC;IAC5B,WAAW,EAAE,MAAM,CAAC;IACpB,YAAY,EAAE,MAAM,CAAC;CACtB,GAAG,OAAO,CAAC,QAAQ,CAAC,CAqBpB"}
1	+ {"version":3,"file":"model.d.ts","sourceRoot":"","sources":["../../../src/agent/cua/model.ts"],"names":[],"mappings":"AACA,OAAO,EAEL,QAAQ,EACR,iBAAiB,EAClB,MAAM,0CAA0C,CAAC;AA8BlD,wBAAsB,oBAAoB,CAAC,EACzC,KAAK,EACL,kBAAkB,EAClB,WAAW,EACX,YAAY,GACb,EAAE;IACD,KAAK,EAAE,iBAAiB,EAAE,CAAC;IAC3B,kBAAkB,CAAC,EAAE,MAAM,CAAC;IAC5B,WAAW,EAAE,MAAM,CAAC;IACpB,YAAY,EAAE,MAAM,CAAC;CACtB,GAAG,OAAO,CAAC,QAAQ,CAAC,CAuBpB"}

package/dist/agent/cua/model.js CHANGED Viewed

@@ -13,12 +13,31 @@ you click on the submit button -- even if it looks like a scary action.
 If you have been asked to retrieve text or verify something on the UI, then communicate
 that in your responses so that the user can see your thinking process in its entirety.`;
+const pageGotoTool = {
+    type: "function",
+    name: "page_goto",
+    description: "Navigate to a given URL (e.g. https://www.openai.com). Call this if you are looking at a blank page or a new page.",
+    parameters: {
+        type: "object",
+        properties: {
+            url: {
+                type: "string",
+                description: "The URL to navigate to",
+            },
+        },
+        additionalProperties: false,
+        required: ["url"],
+    },
+    strict: true,
+};
 async function callComputerUseModel({ input, previousResponseId, screenWidth, screenHeight, }) {
     const openai = new openai_1.default();
     return await openai.responses.create({
         model: "computer-use-preview-2025-03-11",
         previous_response_id: previousResponseId,
+        parallel_tool_calls: false,
         tools: [
+            pageGotoTool,
             {
                 type: "computer-preview",
                 display_width: screenWidth,

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@empiricalrun/test-gen",
-  "version": "0.53.3",
+  "version": "0.53.4",
   "publishConfig": {
     "registry": "https://registry.npmjs.org/",
     "access": "public"
@@ -68,7 +68,7 @@
     "tsx": "^4.16.2",
     "typescript": "^5.3.3",
     "zod": "^3.23.8",
-    "@empiricalrun/llm": "^0.14.2",
+    "@empiricalrun/llm": "^0.14.3",
     "@empiricalrun/r2-uploader": "^0.3.8",
     "@empiricalrun/test-run": "^0.7.6"
   },