@empiricalrun/test-gen 0.53.3 → 0.53.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,22 @@
1
1
  # @empiricalrun/test-gen
2
2
 
3
+ ## 0.53.5
4
+
5
+ ### Patch Changes
6
+
7
+ - 9f3cb10: feat: automated tracing for LLM call overlay dismiss
8
+ - Updated dependencies [9f3cb10]
9
+ - @empiricalrun/llm@0.14.4
10
+
11
+ ## 0.53.4
12
+
13
+ ### Patch Changes
14
+
15
+ - 1426372: fix: remove stray console.log
16
+ - 7efc3dc: feat: add page.goto to cua implementation + prompt edits
17
+ - Updated dependencies [7efc3dc]
18
+ - @empiricalrun/llm@0.14.3
19
+
3
20
  ## 0.53.3
4
21
 
5
22
  ### Patch Changes
@@ -2,6 +2,10 @@ import { ResponseComputerToolCall } from "openai/resources/responses/responses.m
2
2
  import type { Page } from "playwright";
3
3
  type ComputerAction = ResponseComputerToolCall.Click | ResponseComputerToolCall.DoubleClick | ResponseComputerToolCall.Drag | ResponseComputerToolCall.Keypress | ResponseComputerToolCall.Move | ResponseComputerToolCall.Screenshot | ResponseComputerToolCall.Scroll | ResponseComputerToolCall.Type | ResponseComputerToolCall.Wait;
4
4
  export declare function getScreenshot(page: Page): Promise<string>;
5
+ export declare function handlePageGoto(page: Page, url: string): Promise<{
6
+ actionSummary: string;
7
+ actionCode: string;
8
+ }>;
5
9
  export declare function handleModelAction(page: Page, action: ComputerAction): Promise<{
6
10
  actionSummary: string;
7
11
  actionCode: string;
@@ -1 +1 @@
1
- {"version":3,"file":"computer.d.ts","sourceRoot":"","sources":["../../../src/agent/cua/computer.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,wBAAwB,EAAE,MAAM,0CAA0C,CAAC;AACpF,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAEvC,KAAK,cAAc,GACf,wBAAwB,CAAC,KAAK,GAC9B,wBAAwB,CAAC,WAAW,GACpC,wBAAwB,CAAC,IAAI,GAC7B,wBAAwB,CAAC,QAAQ,GACjC,wBAAwB,CAAC,IAAI,GAC7B,wBAAwB,CAAC,UAAU,GACnC,wBAAwB,CAAC,MAAM,GAC/B,wBAAwB,CAAC,IAAI,GAC7B,wBAAwB,CAAC,IAAI,CAAC;AAElC,wBAAsB,aAAa,CAAC,IAAI,EAAE,IAAI,mBAG7C;AAgCD,wBAAsB,iBAAiB,CACrC,IAAI,EAAE,IAAI,EACV,MAAM,EAAE,cAAc,GACrB,OAAO,CAAC;IACT,aAAa,EAAE,MAAM,CAAC;IACtB,UAAU,EAAE,MAAM,CAAC;CACpB,CAAC,CAqID"}
1
+ {"version":3,"file":"computer.d.ts","sourceRoot":"","sources":["../../../src/agent/cua/computer.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,wBAAwB,EAAE,MAAM,0CAA0C,CAAC;AACpF,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAEvC,KAAK,cAAc,GACf,wBAAwB,CAAC,KAAK,GAC9B,wBAAwB,CAAC,WAAW,GACpC,wBAAwB,CAAC,IAAI,GAC7B,wBAAwB,CAAC,QAAQ,GACjC,wBAAwB,CAAC,IAAI,GAC7B,wBAAwB,CAAC,UAAU,GACnC,wBAAwB,CAAC,MAAM,GAC/B,wBAAwB,CAAC,IAAI,GAC7B,wBAAwB,CAAC,IAAI,CAAC;AAElC,wBAAsB,aAAa,CAAC,IAAI,EAAE,IAAI,mBAG7C;AAgCD,wBAAsB,cAAc,CAClC,IAAI,EAAE,IAAI,EACV,GAAG,EAAE,MAAM,GACV,OAAO,CAAC;IACT,aAAa,EAAE,MAAM,CAAC;IACtB,UAAU,EAAE,MAAM,CAAC;CACpB,CAAC,CAMD;AAED,wBAAsB,iBAAiB,CACrC,IAAI,EAAE,IAAI,EACV,MAAM,EAAE,cAAc,GACrB,OAAO,CAAC;IACT,aAAa,EAAE,MAAM,CAAC;IACtB,UAAU,EAAE,MAAM,CAAC;CACpB,CAAC,CAqID"}
@@ -1,6 +1,6 @@
1
1
  "use strict";
2
2
  Object.defineProperty(exports, "__esModule", { value: true });
3
- exports.handleModelAction = exports.getScreenshot = void 0;
3
+ exports.handleModelAction = exports.handlePageGoto = exports.getScreenshot = void 0;
4
4
  async function getScreenshot(page) {
5
5
  const screenshotBytes = await page.screenshot();
6
6
  return Buffer.from(screenshotBytes).toString("base64");
@@ -35,6 +35,14 @@ const CUA_KEY_TO_PLAYWRIGHT_KEY = {
35
35
  tab: "Tab",
36
36
  win: "Meta",
37
37
  };
38
+ async function handlePageGoto(page, url) {
39
+ await page.goto(url);
40
+ return {
41
+ actionSummary: `Navigated page to ${url}`,
42
+ actionCode: `await page.goto("${url}");\n`,
43
+ };
44
+ }
45
+ exports.handlePageGoto = handlePageGoto;
38
46
  async function handleModelAction(page, action) {
39
47
  const actionType = action.type;
40
48
  let actionCode = "";
@@ -1,8 +1,10 @@
1
+ import { TraceClient } from "@empiricalrun/llm";
1
2
  import { Page } from "playwright";
2
3
  export declare function startPlaywrightCodegen(page: Page): Promise<void>;
3
- export declare function createTestUsingComputerUseAgent({ page, task, }: {
4
+ export declare function createTestUsingComputerUseAgent({ page, task, trace, }: {
4
5
  page: Page;
5
6
  task: string;
7
+ trace?: TraceClient;
6
8
  }): Promise<{
7
9
  code: string;
8
10
  importPaths: string[];
@@ -1 +1 @@
1
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/agent/cua/index.ts"],"names":[],"mappings":"AAOA,OAAO,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAOlC,wBAAsB,sBAAsB,CAAC,IAAI,EAAE,IAAI,iBAoBtD;AAED,wBAAsB,+BAA+B,CAAC,EACpD,IAAI,EACJ,IAAI,GACL,EAAE;IACD,IAAI,EAAE,IAAI,CAAC;IACX,IAAI,EAAE,MAAM,CAAC;CACd,GAAG,OAAO,CAAC;IACV,IAAI,EAAE,MAAM,CAAC;IACb,WAAW,EAAE,MAAM,EAAE,CAAC;IACtB,cAAc,EAAE,MAAM,CAAC;CACxB,CAAC,CA2JD"}
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/agent/cua/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAiB,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAS/D,OAAO,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAMlC,wBAAsB,sBAAsB,CAAC,IAAI,EAAE,IAAI,iBAoBtD;AAED,wBAAsB,+BAA+B,CAAC,EACpD,IAAI,EACJ,IAAI,EACJ,KAAK,GACN,EAAE;IACD,IAAI,EAAE,IAAI,CAAC;IACX,IAAI,EAAE,MAAM,CAAC;IACb,KAAK,CAAC,EAAE,WAAW,CAAC;CACrB,GAAG,OAAO,CAAC;IACV,IAAI,EAAE,MAAM,CAAC;IACb,WAAW,EAAE,MAAM,EAAE,CAAC;IACtB,cAAc,EAAE,MAAM,CAAC;CACxB,CAAC,CAqLD"}
@@ -5,8 +5,7 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
5
5
  Object.defineProperty(exports, "__esModule", { value: true });
6
6
  exports.createTestUsingComputerUseAgent = exports.startPlaywrightCodegen = void 0;
7
7
  const llm_1 = require("@empiricalrun/llm");
8
- const crypto_1 = __importDefault(require("crypto"));
9
- const logger_1 = require("../../bin/logger");
8
+ const openai_1 = __importDefault(require("openai"));
10
9
  const utils_1 = require("../browsing/utils");
11
10
  const computer_1 = require("./computer");
12
11
  const model_1 = require("./model");
@@ -32,25 +31,18 @@ async function startPlaywrightCodegen(page) {
32
31
  await page.pause();
33
32
  }
34
33
  exports.startPlaywrightCodegen = startPlaywrightCodegen;
35
- async function createTestUsingComputerUseAgent({ page, task, }) {
34
+ async function createTestUsingComputerUseAgent({ page, task, trace, }) {
36
35
  await (0, utils_1.injectPwLocatorGenerator)(page);
37
36
  const screenshotBytes = await (0, computer_1.getScreenshot)(page);
38
37
  const viewport = page.viewportSize();
39
38
  let screenWidth = viewport?.width || 1280;
40
39
  let screenHeight = viewport?.height || 720;
41
- const logger = new logger_1.CustomLogger({ useReporter: false });
42
- const trace = llm_1.langfuseInstance?.trace({
43
- name: "computer-use-agent",
44
- id: crypto_1.default.randomUUID(),
45
- input: { task },
46
- });
47
- if (trace) {
48
- const traceUrl = trace.getTraceUrl();
49
- logger.log(`Starting computer use agent: ${traceUrl}`);
50
- }
51
- const span = trace?.span({
52
- name: "initial-model-call",
53
- });
40
+ const openAIClient = trace
41
+ ? (0, llm_1.observeOpenAI)(new openai_1.default(), {
42
+ generationName: `computer-use-agent`,
43
+ parent: trace,
44
+ })
45
+ : new openai_1.default();
54
46
  let response = await (0, model_1.callComputerUseModel)({
55
47
  input: [
56
48
  {
@@ -58,7 +50,7 @@ async function createTestUsingComputerUseAgent({ page, task, }) {
58
50
  content: [
59
51
  {
60
52
  type: "input_text",
61
- text: task,
53
+ text: `Task to execute: ${task}\n\nCurrent page URL: ${page.url()}`,
62
54
  },
63
55
  {
64
56
  type: "input_image",
@@ -70,8 +62,8 @@ async function createTestUsingComputerUseAgent({ page, task, }) {
70
62
  ],
71
63
  screenWidth,
72
64
  screenHeight,
65
+ openAIClient,
73
66
  });
74
- span?.end({ output: response });
75
67
  let isTaskDone = false;
76
68
  let maxIterations = 15;
77
69
  let generatedCode = "";
@@ -80,12 +72,9 @@ async function createTestUsingComputerUseAgent({ page, task, }) {
80
72
  while (!isTaskDone && iterationIndex < maxIterations) {
81
73
  actionsSummary.push(`\n# Agent iteration ${iterationIndex}`);
82
74
  iterationIndex++;
83
- const iterationSpan = trace?.span({
84
- name: `iteration-${iterationIndex}`,
85
- input: { response },
86
- });
87
75
  const computerCalls = response.output.filter((item) => item.type === "computer_call");
88
- if (computerCalls.length === 0) {
76
+ const functionCalls = response.output.filter((item) => item.type === "function_call");
77
+ if (computerCalls.length === 0 && functionCalls.length === 0) {
89
78
  const assistantOutput = response.output.find((item) => item.type === "message");
90
79
  if (assistantOutput) {
91
80
  const content = assistantOutput.content.find((item) => item.type === "output_text");
@@ -105,47 +94,76 @@ async function createTestUsingComputerUseAgent({ page, task, }) {
105
94
  actionsSummary.push(`Action reasoning: ${summaryText}`);
106
95
  }
107
96
  }
108
- // We expect at most one computer call per response.
109
- const computerCall = computerCalls[0];
110
- const lastCallId = computerCall.call_id;
111
- const action = computerCall.action;
112
- const pendingSafetyChecks = computerCall.pending_safety_checks;
113
- // Execute the action and take a screenshot
114
- const { actionSummary, actionCode } = await (0, computer_1.handleModelAction)(page, action);
115
- actionsSummary.push(`Action executed: ${actionSummary}`);
116
- if (actionCode) {
117
- actionsSummary.push(`Generated code: ${actionCode}`);
118
- generatedCode += actionCode;
97
+ // We expect either a function call or a computer call in the response.
98
+ let toolCallOutput;
99
+ let executedActionSummary = "";
100
+ // We are assuming only one function call per response
101
+ const functionCall = functionCalls[0];
102
+ if (functionCall) {
103
+ const args = JSON.parse(functionCall.arguments);
104
+ const { actionSummary, actionCode } = await (0, computer_1.handlePageGoto)(page, args.url);
105
+ executedActionSummary = actionSummary;
106
+ actionsSummary.push(`Action executed: ${actionSummary}`);
107
+ if (actionCode) {
108
+ actionsSummary.push(`Generated code: ${actionCode}`);
109
+ generatedCode += actionCode;
110
+ }
111
+ toolCallOutput = {
112
+ type: "function_call_output",
113
+ call_id: functionCall.call_id,
114
+ output: `Navigating page to ${args.url}`,
115
+ };
116
+ }
117
+ else if (computerCalls.length >= 1) {
118
+ // We expect at most one computer call per response.
119
+ const computerCall = computerCalls[0];
120
+ const action = computerCall.action;
121
+ // Execute the action and take a screenshot
122
+ const { actionSummary, actionCode } = await (0, computer_1.handleModelAction)(page, action);
123
+ executedActionSummary = actionSummary;
124
+ actionsSummary.push(`Action executed: ${actionSummary}`);
125
+ if (actionCode) {
126
+ actionsSummary.push(`Generated code: ${actionCode}`);
127
+ generatedCode += actionCode;
128
+ }
129
+ else {
130
+ actionsSummary.push(`No code generated: Will rely on Playwright's ability to auto-wait or auto-scroll`);
131
+ }
132
+ // Allow time for changes to take effect.
133
+ await new Promise((resolve) => setTimeout(resolve, 1000));
134
+ const screenshotBytes = await (0, computer_1.getScreenshot)(page);
135
+ // Populate toolCallOutput
136
+ toolCallOutput = {
137
+ type: "computer_call_output",
138
+ call_id: computerCall.call_id,
139
+ output: {
140
+ type: "computer_screenshot",
141
+ image_url: `data:image/png;base64,${screenshotBytes}`,
142
+ },
143
+ acknowledged_safety_checks: computerCall.pending_safety_checks,
144
+ };
119
145
  }
120
146
  else {
121
- actionsSummary.push(`No code generated: Will rely on Playwright's ability to auto-wait or auto-scroll`);
147
+ throw new Error("No tool call found in response.");
122
148
  }
123
- // Allow time for changes to take effect.
124
- await new Promise((resolve) => setTimeout(resolve, 1000));
125
- const screenshotBytes = await (0, computer_1.getScreenshot)(page);
126
- // Send the screenshot back as a computer_call_output
127
- const computerCallSpan = iterationSpan?.span({
128
- name: "computer-call-output",
129
- input: { lastCallId, acknowledged_safety_checks: pendingSafetyChecks },
130
- });
131
149
  response = await (0, model_1.callComputerUseModel)({
132
150
  previousResponseId: response.id,
133
151
  input: [
152
+ toolCallOutput,
134
153
  {
135
- call_id: lastCallId,
136
- type: "computer_call_output",
137
- output: {
138
- type: "computer_screenshot",
139
- image_url: `data:image/png;base64,${screenshotBytes}`,
140
- },
141
- acknowledged_safety_checks: pendingSafetyChecks,
154
+ role: "user",
155
+ content: [
156
+ {
157
+ type: "input_text",
158
+ text: `Action executed: ${executedActionSummary || "None"}\nCurrent page URL: ${page.url()}`,
159
+ },
160
+ ],
142
161
  },
143
162
  ],
144
163
  screenWidth,
145
164
  screenHeight,
165
+ openAIClient,
146
166
  });
147
- computerCallSpan?.end({ output: response });
148
- iterationSpan?.end({ output: response });
149
167
  }
150
168
  if (!isTaskDone) {
151
169
  actionsSummary.push(`Max iteration limit hit: Task not done after ${maxIterations} iterations`);
@@ -1,8 +1,10 @@
1
- import { Response, ResponseInput } from "openai/resources/responses/responses.mjs";
2
- export declare function callComputerUseModel({ input, previousResponseId, screenWidth, screenHeight, }: {
3
- input: ResponseInput;
1
+ import OpenAI from "openai";
2
+ import { Response, ResponseInputItem } from "openai/resources/responses/responses.mjs";
3
+ export declare function callComputerUseModel({ input, previousResponseId, screenWidth, screenHeight, openAIClient, }: {
4
+ input: ResponseInputItem[];
4
5
  previousResponseId?: string;
5
6
  screenWidth: number;
6
7
  screenHeight: number;
8
+ openAIClient: OpenAI;
7
9
  }): Promise<Response>;
8
10
  //# sourceMappingURL=model.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"model.d.ts","sourceRoot":"","sources":["../../../src/agent/cua/model.ts"],"names":[],"mappings":"AACA,OAAO,EACL,QAAQ,EACR,aAAa,EACd,MAAM,0CAA0C,CAAC;AAWlD,wBAAsB,oBAAoB,CAAC,EACzC,KAAK,EACL,kBAAkB,EAClB,WAAW,EACX,YAAY,GACb,EAAE;IACD,KAAK,EAAE,aAAa,CAAC;IACrB,kBAAkB,CAAC,EAAE,MAAM,CAAC;IAC5B,WAAW,EAAE,MAAM,CAAC;IACpB,YAAY,EAAE,MAAM,CAAC;CACtB,GAAG,OAAO,CAAC,QAAQ,CAAC,CAqBpB"}
1
+ {"version":3,"file":"model.d.ts","sourceRoot":"","sources":["../../../src/agent/cua/model.ts"],"names":[],"mappings":"AAAA,OAAO,MAAM,MAAM,QAAQ,CAAC;AAC5B,OAAO,EAEL,QAAQ,EACR,iBAAiB,EAClB,MAAM,0CAA0C,CAAC;AA8BlD,wBAAsB,oBAAoB,CAAC,EACzC,KAAK,EACL,kBAAkB,EAClB,WAAW,EACX,YAAY,EACZ,YAAY,GACb,EAAE;IACD,KAAK,EAAE,iBAAiB,EAAE,CAAC;IAC3B,kBAAkB,CAAC,EAAE,MAAM,CAAC;IAC5B,WAAW,EAAE,MAAM,CAAC;IACpB,YAAY,EAAE,MAAM,CAAC;IACrB,YAAY,EAAE,MAAM,CAAC;CACtB,GAAG,OAAO,CAAC,QAAQ,CAAC,CAuBpB"}
@@ -1,10 +1,6 @@
1
1
  "use strict";
2
- var __importDefault = (this && this.__importDefault) || function (mod) {
3
- return (mod && mod.__esModule) ? mod : { "default": mod };
4
- };
5
2
  Object.defineProperty(exports, "__esModule", { value: true });
6
3
  exports.callComputerUseModel = void 0;
7
- const openai_1 = __importDefault(require("openai"));
8
4
  const INSTRUCTIONS = `You will be asked to execute some actions in a browser context.
9
5
  Don't ask the user for confirmations - just execute the actions.
10
6
 
@@ -13,12 +9,30 @@ you click on the submit button -- even if it looks like a scary action.
13
9
 
14
10
  If you have been asked to retrieve text or verify something on the UI, then communicate
15
11
  that in your responses so that the user can see your thinking process in its entirety.`;
16
- async function callComputerUseModel({ input, previousResponseId, screenWidth, screenHeight, }) {
17
- const openai = new openai_1.default();
18
- return await openai.responses.create({
12
// Function-tool definition named "page_goto". It is listed in the `tools`
// array of the `responses.create` call in callComputerUseModel, next to the
// "computer-preview" tool. The parameter schema accepts exactly one required
// string property, `url`: extra properties are rejected and `strict` is true.
+ const pageGotoTool = {
13
+ type: "function",
14
+ name: "page_goto",
15
+ description: "Navigate to a given URL (e.g. https://www.openai.com). Call this if you are looking at a blank page or a new page.",
16
+ parameters: {
17
+ type: "object",
18
+ properties: {
19
+ url: {
20
+ type: "string",
21
+ description: "The URL to navigate to",
22
+ },
23
+ },
24
+ additionalProperties: false,
25
+ required: ["url"],
26
+ },
27
+ strict: true,
28
+ };
29
+ async function callComputerUseModel({ input, previousResponseId, screenWidth, screenHeight, openAIClient, }) {
30
+ const response = await openAIClient.responses.create({
19
31
  model: "computer-use-preview-2025-03-11",
20
32
  previous_response_id: previousResponseId,
33
+ parallel_tool_calls: false,
21
34
  tools: [
35
+ pageGotoTool,
22
36
  {
23
37
  type: "computer-preview",
24
38
  display_width: screenWidth,
@@ -34,5 +48,6 @@ async function callComputerUseModel({ input, previousResponseId, screenWidth, sc
34
48
  input,
35
49
  truncation: "auto",
36
50
  });
51
+ return response;
37
52
  }
38
53
  exports.callComputerUseModel = callComputerUseModel;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@empiricalrun/test-gen",
3
- "version": "0.53.3",
3
+ "version": "0.53.5",
4
4
  "publishConfig": {
5
5
  "registry": "https://registry.npmjs.org/",
6
6
  "access": "public"
@@ -68,7 +68,7 @@
68
68
  "tsx": "^4.16.2",
69
69
  "typescript": "^5.3.3",
70
70
  "zod": "^3.23.8",
71
- "@empiricalrun/llm": "^0.14.2",
71
+ "@empiricalrun/llm": "^0.14.4",
72
72
  "@empiricalrun/r2-uploader": "^0.3.8",
73
73
  "@empiricalrun/test-run": "^0.7.6"
74
74
  },