npm - @empiricalrun/test-gen - Versions diffs - 0.53.3 → 0.53.5 - Mend

@empiricalrun/test-gen 0.53.3 → 0.53.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11)

package/CHANGELOG.md +17 -0
package/dist/agent/cua/computer.d.ts +4 -0
package/dist/agent/cua/computer.d.ts.map +1 -1
package/dist/agent/cua/computer.js +9 -1
package/dist/agent/cua/index.d.ts +3 -1
package/dist/agent/cua/index.d.ts.map +1 -1
package/dist/agent/cua/index.js +70 -52
package/dist/agent/cua/model.d.ts +5 -3
package/dist/agent/cua/model.d.ts.map +1 -1
package/dist/agent/cua/model.js +22 -7
package/package.json +2 -2

package/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,22 @@
 # @empiricalrun/test-gen
+## 0.53.5
+### Patch Changes
+- 9f3cb10: feat: automated tracing for LLM call overlay dismiss
+- Updated dependencies [9f3cb10]
+  - @empiricalrun/llm@0.14.4
+## 0.53.4
+### Patch Changes
+- 1426372: fix: remove stray console.log
+- 7efc3dc: feat: add page.goto to cua implementation + prompt edits
+- Updated dependencies [7efc3dc]
+  - @empiricalrun/llm@0.14.3
 ## 0.53.3
 ### Patch Changes

package/dist/agent/cua/computer.d.ts CHANGED Viewed

@@ -2,6 +2,10 @@ import { ResponseComputerToolCall } from "openai/resources/responses/responses.m
 import type { Page } from "playwright";
 type ComputerAction = ResponseComputerToolCall.Click | ResponseComputerToolCall.DoubleClick | ResponseComputerToolCall.Drag | ResponseComputerToolCall.Keypress | ResponseComputerToolCall.Move | ResponseComputerToolCall.Screenshot | ResponseComputerToolCall.Scroll | ResponseComputerToolCall.Type | ResponseComputerToolCall.Wait;
 export declare function getScreenshot(page: Page): Promise<string>;
+export declare function handlePageGoto(page: Page, url: string): Promise<{
+    actionSummary: string;
+    actionCode: string;
+}>;
 export declare function handleModelAction(page: Page, action: ComputerAction): Promise<{
     actionSummary: string;
     actionCode: string;

package/dist/agent/cua/computer.d.ts.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"file":"computer.d.ts","sourceRoot":"","sources":["../../../src/agent/cua/computer.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,wBAAwB,EAAE,MAAM,0CAA0C,CAAC;AACpF,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAEvC,KAAK,cAAc,GACf,wBAAwB,CAAC,KAAK,GAC9B,wBAAwB,CAAC,WAAW,GACpC,wBAAwB,CAAC,IAAI,GAC7B,wBAAwB,CAAC,QAAQ,GACjC,wBAAwB,CAAC,IAAI,GAC7B,wBAAwB,CAAC,UAAU,GACnC,wBAAwB,CAAC,MAAM,GAC/B,wBAAwB,CAAC,IAAI,GAC7B,wBAAwB,CAAC,IAAI,CAAC;AAElC,wBAAsB,aAAa,CAAC,IAAI,EAAE,IAAI,mBAG7C;AAgCD,wBAAsB,iBAAiB,CACrC,IAAI,EAAE,IAAI,EACV,MAAM,EAAE,cAAc,GACrB,OAAO,CAAC;IACT,aAAa,EAAE,MAAM,CAAC;IACtB,UAAU,EAAE,MAAM,CAAC;CACpB,CAAC,CAqID"}
1	+ {"version":3,"file":"computer.d.ts","sourceRoot":"","sources":["../../../src/agent/cua/computer.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,wBAAwB,EAAE,MAAM,0CAA0C,CAAC;AACpF,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAEvC,KAAK,cAAc,GACf,wBAAwB,CAAC,KAAK,GAC9B,wBAAwB,CAAC,WAAW,GACpC,wBAAwB,CAAC,IAAI,GAC7B,wBAAwB,CAAC,QAAQ,GACjC,wBAAwB,CAAC,IAAI,GAC7B,wBAAwB,CAAC,UAAU,GACnC,wBAAwB,CAAC,MAAM,GAC/B,wBAAwB,CAAC,IAAI,GAC7B,wBAAwB,CAAC,IAAI,CAAC;AAElC,wBAAsB,aAAa,CAAC,IAAI,EAAE,IAAI,mBAG7C;AAgCD,wBAAsB,cAAc,CAClC,IAAI,EAAE,IAAI,EACV,GAAG,EAAE,MAAM,GACV,OAAO,CAAC;IACT,aAAa,EAAE,MAAM,CAAC;IACtB,UAAU,EAAE,MAAM,CAAC;CACpB,CAAC,CAMD;AAED,wBAAsB,iBAAiB,CACrC,IAAI,EAAE,IAAI,EACV,MAAM,EAAE,cAAc,GACrB,OAAO,CAAC;IACT,aAAa,EAAE,MAAM,CAAC;IACtB,UAAU,EAAE,MAAM,CAAC;CACpB,CAAC,CAqID"}

package/dist/agent/cua/computer.js CHANGED Viewed

@@ -1,6 +1,6 @@
 "use strict";
 Object.defineProperty(exports, "__esModule", { value: true });
-exports.handleModelAction = exports.getScreenshot = void 0;
+exports.handleModelAction = exports.handlePageGoto = exports.getScreenshot = void 0;
 async function getScreenshot(page) {
     const screenshotBytes = await page.screenshot();
     return Buffer.from(screenshotBytes).toString("base64");
@@ -35,6 +35,14 @@ const CUA_KEY_TO_PLAYWRIGHT_KEY = {
     tab: "Tab",
     win: "Meta",
 };
+async function handlePageGoto(page, url) {
+    await page.goto(url);
+    return {
+        actionSummary: `Navigated page to ${url}`,
+        actionCode: `await page.goto("${url}");\n`,
+    };
+}
+exports.handlePageGoto = handlePageGoto;
 async function handleModelAction(page, action) {
     const actionType = action.type;
     let actionCode = "";

package/dist/agent/cua/index.d.ts CHANGED Viewed

@@ -1,8 +1,10 @@
+import { TraceClient } from "@empiricalrun/llm";
 import { Page } from "playwright";
 export declare function startPlaywrightCodegen(page: Page): Promise<void>;
-export declare function createTestUsingComputerUseAgent({ page, task, }: {
+export declare function createTestUsingComputerUseAgent({ page, task, trace, }: {
     page: Page;
     task: string;
+    trace?: TraceClient;
 }): Promise<{
     code: string;
     importPaths: string[];

package/dist/agent/cua/index.d.ts.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/agent/cua/index.ts"],"names":[],"mappings":"AAOA,OAAO,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAOlC,wBAAsB,sBAAsB,CAAC,IAAI,EAAE,IAAI,iBAoBtD;AAED,wBAAsB,+BAA+B,CAAC,EACpD,IAAI,EACJ,IAAI,GACL,EAAE;IACD,IAAI,EAAE,IAAI,CAAC;IACX,IAAI,EAAE,MAAM,CAAC;CACd,GAAG,OAAO,CAAC;IACV,IAAI,EAAE,MAAM,CAAC;IACb,WAAW,EAAE,MAAM,EAAE,CAAC;IACtB,cAAc,EAAE,MAAM,CAAC;CACxB,CAAC,CA2JD"}
1	+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/agent/cua/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAiB,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAS/D,OAAO,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAMlC,wBAAsB,sBAAsB,CAAC,IAAI,EAAE,IAAI,iBAoBtD;AAED,wBAAsB,+BAA+B,CAAC,EACpD,IAAI,EACJ,IAAI,EACJ,KAAK,GACN,EAAE;IACD,IAAI,EAAE,IAAI,CAAC;IACX,IAAI,EAAE,MAAM,CAAC;IACb,KAAK,CAAC,EAAE,WAAW,CAAC;CACrB,GAAG,OAAO,CAAC;IACV,IAAI,EAAE,MAAM,CAAC;IACb,WAAW,EAAE,MAAM,EAAE,CAAC;IACtB,cAAc,EAAE,MAAM,CAAC;CACxB,CAAC,CAqLD"}

package/dist/agent/cua/index.js CHANGED Viewed

@@ -5,8 +5,7 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
 Object.defineProperty(exports, "__esModule", { value: true });
 exports.createTestUsingComputerUseAgent = exports.startPlaywrightCodegen = void 0;
 const llm_1 = require("@empiricalrun/llm");
-const crypto_1 = __importDefault(require("crypto"));
-const logger_1 = require("../../bin/logger");
+const openai_1 = __importDefault(require("openai"));
 const utils_1 = require("../browsing/utils");
 const computer_1 = require("./computer");
 const model_1 = require("./model");
@@ -32,25 +31,18 @@ async function startPlaywrightCodegen(page) {
     await page.pause();
 }
 exports.startPlaywrightCodegen = startPlaywrightCodegen;
-async function createTestUsingComputerUseAgent({ page, task, }) {
+async function createTestUsingComputerUseAgent({ page, task, trace, }) {
     await (0, utils_1.injectPwLocatorGenerator)(page);
     const screenshotBytes = await (0, computer_1.getScreenshot)(page);
     const viewport = page.viewportSize();
     let screenWidth = viewport?.width || 1280;
     let screenHeight = viewport?.height || 720;
-    const logger = new logger_1.CustomLogger({ useReporter: false });
-    const trace = llm_1.langfuseInstance?.trace({
-        name: "computer-use-agent",
-        id: crypto_1.default.randomUUID(),
-        input: { task },
-    });
-    if (trace) {
-        const traceUrl = trace.getTraceUrl();
-        logger.log(`Starting computer use agent: ${traceUrl}`);
-    }
-    const span = trace?.span({
-        name: "initial-model-call",
-    });
+    const openAIClient = trace
+        ? (0, llm_1.observeOpenAI)(new openai_1.default(), {
+            generationName: `computer-use-agent`,
+            parent: trace,
+        })
+        : new openai_1.default();
     let response = await (0, model_1.callComputerUseModel)({
         input: [
             {
@@ -58,7 +50,7 @@ async function createTestUsingComputerUseAgent({ page, task, }) {
                 content: [
                     {
                         type: "input_text",
-                        text: task,
+                        text: `Task to execute: ${task}\n\nCurrent page URL: ${page.url()}`,
                     },
                     {
                         type: "input_image",
@@ -70,8 +62,8 @@ async function createTestUsingComputerUseAgent({ page, task, }) {
         ],
         screenWidth,
         screenHeight,
+        openAIClient,
     });
-    span?.end({ output: response });
     let isTaskDone = false;
     let maxIterations = 15;
     let generatedCode = "";
@@ -80,12 +72,9 @@ async function createTestUsingComputerUseAgent({ page, task, }) {
     while (!isTaskDone && iterationIndex < maxIterations) {
         actionsSummary.push(`\n# Agent iteration ${iterationIndex}`);
         iterationIndex++;
-        const iterationSpan = trace?.span({
-            name: `iteration-${iterationIndex}`,
-            input: { response },
-        });
         const computerCalls = response.output.filter((item) => item.type === "computer_call");
-        if (computerCalls.length === 0) {
+        const functionCalls = response.output.filter((item) => item.type === "function_call");
+        if (computerCalls.length === 0 && functionCalls.length === 0) {
             const assistantOutput = response.output.find((item) => item.type === "message");
             if (assistantOutput) {
                 const content = assistantOutput.content.find((item) => item.type === "output_text");
@@ -105,47 +94,76 @@ async function createTestUsingComputerUseAgent({ page, task, }) {
                 actionsSummary.push(`Action reasoning: ${summaryText}`);
             }
         }
-        // We expect at most one computer call per response.
-        const computerCall = computerCalls[0];
-        const lastCallId = computerCall.call_id;
-        const action = computerCall.action;
-        const pendingSafetyChecks = computerCall.pending_safety_checks;
-        // Execute the action and take a screenshot
-        const { actionSummary, actionCode } = await (0, computer_1.handleModelAction)(page, action);
-        actionsSummary.push(`Action executed: ${actionSummary}`);
-        if (actionCode) {
-            actionsSummary.push(`Generated code: ${actionCode}`);
-            generatedCode += actionCode;
+        // We expect either a function call or a computer call in the response.
+        let toolCallOutput;
+        let executedActionSummary = "";
+        // We are assuming only one function call per response
+        const functionCall = functionCalls[0];
+        if (functionCall) {
+            const args = JSON.parse(functionCall.arguments);
+            const { actionSummary, actionCode } = await (0, computer_1.handlePageGoto)(page, args.url);
+            executedActionSummary = actionSummary;
+            actionsSummary.push(`Action executed: ${actionSummary}`);
+            if (actionCode) {
+                actionsSummary.push(`Generated code: ${actionCode}`);
+                generatedCode += actionCode;
+            }
+            toolCallOutput = {
+                type: "function_call_output",
+                call_id: functionCall.call_id,
+                output: `Navigating page to ${args.url}`,
+            };
+        }
+        else if (computerCalls.length >= 1) {
+            // We expect at most one computer call per response.
+            const computerCall = computerCalls[0];
+            const action = computerCall.action;
+            // Execute the action and take a screenshot
+            const { actionSummary, actionCode } = await (0, computer_1.handleModelAction)(page, action);
+            executedActionSummary = actionSummary;
+            actionsSummary.push(`Action executed: ${actionSummary}`);
+            if (actionCode) {
+                actionsSummary.push(`Generated code: ${actionCode}`);
+                generatedCode += actionCode;
+            }
+            else {
+                actionsSummary.push(`No code generated: Will rely on Playwright's ability to auto-wait or auto-scroll`);
+            }
+            // Allow time for changes to take effect.
+            await new Promise((resolve) => setTimeout(resolve, 1000));
+            const screenshotBytes = await (0, computer_1.getScreenshot)(page);
+            // Populate toolCallOutput
+            toolCallOutput = {
+                type: "computer_call_output",
+                call_id: computerCall.call_id,
+                output: {
+                    type: "computer_screenshot",
+                    image_url: `data:image/png;base64,${screenshotBytes}`,
+                },
+                acknowledged_safety_checks: computerCall.pending_safety_checks,
+            };
         }
         else {
-            actionsSummary.push(`No code generated: Will rely on Playwright's ability to auto-wait or auto-scroll`);
+            throw new Error("No tool call found in response.");
         }
-        // Allow time for changes to take effect.
-        await new Promise((resolve) => setTimeout(resolve, 1000));
-        const screenshotBytes = await (0, computer_1.getScreenshot)(page);
-        // Send the screenshot back as a computer_call_output
-        const computerCallSpan = iterationSpan?.span({
-            name: "computer-call-output",
-            input: { lastCallId, acknowledged_safety_checks: pendingSafetyChecks },
-        });
         response = await (0, model_1.callComputerUseModel)({
             previousResponseId: response.id,
             input: [
+                toolCallOutput,
                 {
-                    call_id: lastCallId,
-                    type: "computer_call_output",
-                    output: {
-                        type: "computer_screenshot",
-                        image_url: `data:image/png;base64,${screenshotBytes}`,
-                    },
-                    acknowledged_safety_checks: pendingSafetyChecks,
+                    role: "user",
+                    content: [
+                        {
+                            type: "input_text",
+                            text: `Action executed: ${executedActionSummary || "None"}\nCurrent page URL: ${page.url()}`,
+                        },
+                    ],
                 },
             ],
             screenWidth,
             screenHeight,
+            openAIClient,
         });
-        computerCallSpan?.end({ output: response });
-        iterationSpan?.end({ output: response });
     }
     if (!isTaskDone) {
         actionsSummary.push(`Max iteration limit hit: Task not done after ${maxIterations} iterations`);

package/dist/agent/cua/model.d.ts CHANGED Viewed

@@ -1,8 +1,10 @@
-import { Response, ResponseInput } from "openai/resources/responses/responses.mjs";
-export declare function callComputerUseModel({ input, previousResponseId, screenWidth, screenHeight, }: {
-    input: ResponseInput;
+import OpenAI from "openai";
+import { Response, ResponseInputItem } from "openai/resources/responses/responses.mjs";
+export declare function callComputerUseModel({ input, previousResponseId, screenWidth, screenHeight, openAIClient, }: {
+    input: ResponseInputItem[];
     previousResponseId?: string;
     screenWidth: number;
     screenHeight: number;
+    openAIClient: OpenAI;
 }): Promise<Response>;
 //# sourceMappingURL=model.d.ts.map

package/dist/agent/cua/model.d.ts.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"file":"model.d.ts","sourceRoot":"","sources":["../../../src/agent/cua/model.ts"],"names":[],"mappings":"AACA,OAAO,EACL,QAAQ,EACR,aAAa,EACd,MAAM,0CAA0C,CAAC;AAWlD,wBAAsB,oBAAoB,CAAC,EACzC,KAAK,EACL,kBAAkB,EAClB,WAAW,EACX,YAAY,GACb,EAAE;IACD,KAAK,EAAE,aAAa,CAAC;IACrB,kBAAkB,CAAC,EAAE,MAAM,CAAC;IAC5B,WAAW,EAAE,MAAM,CAAC;IACpB,YAAY,EAAE,MAAM,CAAC;CACtB,GAAG,OAAO,CAAC,QAAQ,CAAC,CAqBpB"}
1	+ {"version":3,"file":"model.d.ts","sourceRoot":"","sources":["../../../src/agent/cua/model.ts"],"names":[],"mappings":"AAAA,OAAO,MAAM,MAAM,QAAQ,CAAC;AAC5B,OAAO,EAEL,QAAQ,EACR,iBAAiB,EAClB,MAAM,0CAA0C,CAAC;AA8BlD,wBAAsB,oBAAoB,CAAC,EACzC,KAAK,EACL,kBAAkB,EAClB,WAAW,EACX,YAAY,EACZ,YAAY,GACb,EAAE;IACD,KAAK,EAAE,iBAAiB,EAAE,CAAC;IAC3B,kBAAkB,CAAC,EAAE,MAAM,CAAC;IAC5B,WAAW,EAAE,MAAM,CAAC;IACpB,YAAY,EAAE,MAAM,CAAC;IACrB,YAAY,EAAE,MAAM,CAAC;CACtB,GAAG,OAAO,CAAC,QAAQ,CAAC,CAuBpB"}

package/dist/agent/cua/model.js CHANGED Viewed

@@ -1,10 +1,6 @@
 "use strict";
-var __importDefault = (this && this.__importDefault) || function (mod) {
-    return (mod && mod.__esModule) ? mod : { "default": mod };
-};
 Object.defineProperty(exports, "__esModule", { value: true });
 exports.callComputerUseModel = void 0;
-const openai_1 = __importDefault(require("openai"));
 const INSTRUCTIONS = `You will be asked to execute some actions in a browser context.
 Don't ask the user for confirmations - just execute the actions.
@@ -13,12 +9,30 @@ you click on the submit button -- even if it looks like a scary action.
 If you have been asked to retrieve text or verify something on the UI, then communicate
 that in your responses so that the user can see your thinking process in its entirety.`;
-async function callComputerUseModel({ input, previousResponseId, screenWidth, screenHeight, }) {
-    const openai = new openai_1.default();
-    return await openai.responses.create({
+const pageGotoTool = {
+    type: "function",
+    name: "page_goto",
+    description: "Navigate to a given URL (e.g. https://www.openai.com). Call this if you are looking at a blank page or a new page.",
+    parameters: {
+        type: "object",
+        properties: {
+            url: {
+                type: "string",
+                description: "The URL to navigate to",
+            },
+        },
+        additionalProperties: false,
+        required: ["url"],
+    },
+    strict: true,
+};
+async function callComputerUseModel({ input, previousResponseId, screenWidth, screenHeight, openAIClient, }) {
+    const response = await openAIClient.responses.create({
         model: "computer-use-preview-2025-03-11",
         previous_response_id: previousResponseId,
+        parallel_tool_calls: false,
         tools: [
+            pageGotoTool,
             {
                 type: "computer-preview",
                 display_width: screenWidth,
@@ -34,5 +48,6 @@ async function callComputerUseModel({ input, previousResponseId, screenWidth, sc
         input,
         truncation: "auto",
     });
+    return response;
 }
 exports.callComputerUseModel = callComputerUseModel;

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@empiricalrun/test-gen",
-  "version": "0.53.3",
+  "version": "0.53.5",
   "publishConfig": {
     "registry": "https://registry.npmjs.org/",
     "access": "public"
@@ -68,7 +68,7 @@
     "tsx": "^4.16.2",
     "typescript": "^5.3.3",
     "zod": "^3.23.8",
-    "@empiricalrun/llm": "^0.14.2",
+    "@empiricalrun/llm": "^0.14.4",
     "@empiricalrun/r2-uploader": "^0.3.8",
     "@empiricalrun/test-run": "^0.7.6"
   },