@empiricalrun/test-gen 0.53.2 → 0.53.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,21 @@
1
1
  # @empiricalrun/test-gen
2
2
 
3
+ ## 0.53.4
4
+
5
+ ### Patch Changes
6
+
7
+ - 1426372: fix: remove stray console.log
8
+ - 7efc3dc: feat: add page.goto to cua implementation + prompt edits
9
+ - Updated dependencies [7efc3dc]
10
+ - @empiricalrun/llm@0.14.3
11
+
12
+ ## 0.53.3
13
+
14
+ ### Patch Changes
15
+
16
+ - 094b9f7: feat: add tool call for commit and push changes from chat agent
17
+ - cc64ff1: feat: enable browser tool call to pick the right page to interact with
18
+
3
19
  ## 0.53.2
4
20
 
5
21
  ### Patch Changes
@@ -1 +1 @@
1
- {"version":3,"file":"utils.d.ts","sourceRoot":"","sources":["../../../src/agent/browsing/utils.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAChD,OAAO,KAAK,EAAe,QAAQ,EAAE,MAAM,4BAA4B,CAAC;AAIxE,OAAO,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAClC,OAAO,EAAE,oBAAoB,EAAE,MAAM,iBAAiB,CAAC;AAsBvD,wBAAgB,QAAQ,CAAC,GAAG,EAAE,GAAG,GAAG,GAAG,IAAI,MAAM,CAKhD;AAED,wBAAgB,wBAAwB,CAAC,KAAK,EAAE,MAAM,EAAE,UAIvD;AAiFD,wBAAsB,yBAAyB,CAAC,EAC9C,YAAY,EACZ,YAAY,EACZ,cAAc,GACf,EAAE;IACD,YAAY,EAAE,MAAM,CAAC;IACrB,YAAY,EAAE,MAAM,CAAC;IACrB,cAAc,EAAE,MAAM,EAAE,CAAC;CAC1B,iBA0BA;AAED,wBAAsB,cAAc,CAAC,EACnC,YAAY,EACZ,cAAc,EACd,QAAQ,GACT,EAAE;IACD,YAAY,EAAE,MAAM,CAAC;IACrB,cAAc,EAAE,MAAM,EAAE,CAAC;IACzB,QAAQ,EAAE,MAAM,CAAC;CAClB,iBAoBA;AAED,wBAAsB,yBAAyB,CAAC,EAC9C,QAAQ,EACR,QAAQ,EACR,KAAK,GACN,EAAE;IACD,QAAQ,EAAE,QAAQ,CAAC;IACnB,QAAQ,EAAE,MAAM,CAAC;IACjB,KAAK,CAAC,EAAE,WAAW,CAAC;CACrB,GAAG,OAAO,CAAC,MAAM,CAAC,CAyDlB;AAyBD,wBAAsB,wBAAwB,CAAC,IAAI,EAAE,IAAI,iBA2HxD;AAED;;;GAGG;AACH,wBAAsB,oBAAoB,CACxC,OAAO,EAAE,MAAM,GACd,OAAO,CAAC,oBAAoB,CAAC,CAM/B;AAWD,wBAAsB,oBAAoB,CACxC,gBAAgB,EAAE,oBAAoB,GACrC,OAAO,CAAC,MAAM,EAAE,CAAC,CAQnB;AAED;;;;;GAKG;AACH,wBAAsB,iBAAiB,CACrC,YAAY,EAAE,MAAM,EACpB,gBAAgB,EAAE,oBAAoB,EACtC,gBAAgB,GAAE,MAAM,EAAU,GACjC,OAAO,CAAC,MAAM,CAAC,CA+CjB;AAED,qBAAa,eAAe;IACd,OAAO,CAAC,SAAS;gBAAT,SAAS,EAAE,MAAM;IACrC,OAAO,CAAC,aAAa,CAAqB;YAE5B,mBAAmB;YAUnB,gBAAgB;IAsBjB,OAAO;IAoBb,SAAS;CAKjB"}
1
+ {"version":3,"file":"utils.d.ts","sourceRoot":"","sources":["../../../src/agent/browsing/utils.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAChD,OAAO,KAAK,EAAe,QAAQ,EAAE,MAAM,4BAA4B,CAAC;AAIxE,OAAO,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAClC,OAAO,EAAE,oBAAoB,EAAE,MAAM,iBAAiB,CAAC;AAsBvD,wBAAgB,QAAQ,CAAC,GAAG,EAAE,GAAG,GAAG,GAAG,IAAI,MAAM,CAKhD;AAED,wBAAgB,wBAAwB,CAAC,KAAK,EAAE,MAAM,EAAE,UAIvD;AAiFD,wBAAsB,yBAAyB,CAAC,EAC9C,YAAY,EACZ,YAAY,EACZ,cAAc,GACf,EAAE;IACD,YAAY,EAAE,MAAM,CAAC;IACrB,YAAY,EAAE,MAAM,CAAC;IACrB,cAAc,EAAE,MAAM,EAAE,CAAC;CAC1B,iBA2BA;AAED,wBAAsB,cAAc,CAAC,EACnC,YAAY,EACZ,cAAc,EACd,QAAQ,GACT,EAAE;IACD,YAAY,EAAE,MAAM,CAAC;IACrB,cAAc,EAAE,MAAM,EAAE,CAAC;IACzB,QAAQ,EAAE,MAAM,CAAC;CAClB,iBAoBA;AAED,wBAAsB,yBAAyB,CAAC,EAC9C,QAAQ,EACR,QAAQ,EACR,KAAK,GACN,EAAE;IACD,QAAQ,EAAE,QAAQ,CAAC;IACnB,QAAQ,EAAE,MAAM,CAAC;IACjB,KAAK,CAAC,EAAE,WAAW,CAAC;CACrB,GAAG,OAAO,CAAC,MAAM,CAAC,CAyDlB;AAyBD,wBAAsB,wBAAwB,CAAC,IAAI,EAAE,IAAI,iBA2HxD;AAED;;;GAGG;AACH,wBAAsB,oBAAoB,CACxC,OAAO,EAAE,MAAM,GACd,OAAO,CAAC,oBAAoB,CAAC,CAM/B;AAWD,wBAAsB,oBAAoB,CACxC,gBAAgB,EAAE,oBAAoB,GACrC,OAAO,CAAC,MAAM,EAAE,CAAC,CAQnB;AAED;;;;;GAKG;AACH,wBAAsB,iBAAiB,CACrC,YAAY,EAAE,MAAM,EACpB,gBAAgB,EAAE,oBAAoB,EACtC,gBAAgB,GAAE,MAAM,EAAU,GACjC,OAAO,CAAC,MAAM,CAAC,CA+CjB;AAED,qBAAa,eAAe;IACd,OAAO,CAAC,SAAS;gBAAT,SAAS,EAAE,MAAM;IACrC,OAAO,CAAC,aAAa,CAAqB;YAE5B,mBAAmB;YAUnB,gBAAgB;IAsBjB,OAAO;IAoBb,SAAS;CAKjB"}
@@ -90,15 +90,16 @@ async function prepareFileForUpdateScenario({ testCase, specPath, trace, }) {
90
90
  }
91
91
  async function replaceTodoWithCreateTest({ testFilePath, testCaseName, testCaseSuites, }) {
92
92
  // This method is an alternative to prepareFileForUpdateScenario
93
- // TODO: Does not support multiple pages, scoped variables, updates in POM files
93
+ // TODO: Does not support scoped variables and updates in POM files
94
94
  const fileContent = await fs_extra_1.default.readFile(testFilePath, "utf-8");
95
- const todoRegex = /\/\/ TODO\(agent\): (.*)/;
95
+ const todoRegex = /\/\/ TODO\(agent(?:\s+on\s+(\w+))?\):\s*(.*)/;
96
96
  const todoMatch = fileContent.match(todoRegex);
97
97
  if (!todoMatch) {
98
- throw new Error(`No "// TODO(agent):" comment found in file: ${testFilePath}`);
98
+ throw new Error(`No "// TODO(agent):" or "// TODO(agent on pageName):" comment found in file: ${testFilePath}`);
99
99
  }
100
- // TODO: figure out correct page variable name
101
- await fs_extra_1.default.writeFile(testFilePath, fileContent.replace(todoRegex, (_, todoText) => `await createTest("${todoText.replace(/"/g, '\\"')}", page);`));
100
+ const [, pageVarName] = todoMatch;
101
+ const pageVariable = pageVarName || "page"; // Default to "page" if not specified
102
+ await fs_extra_1.default.writeFile(testFilePath, fileContent.replace(todoRegex, (_, __, todoText) => `await createTest("${todoText.replace(/"/g, '\\"')}", ${pageVariable});`));
102
103
  await addImportForCreateTest(testFilePath);
103
104
  await markTestAsOnly({
104
105
  testCaseName,
@@ -1 +1 @@
1
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/agent/chat/index.ts"],"names":[],"mappings":"AAyFA,wBAAsB,SAAS,CAAC,EAC9B,aAA4C,EAC5C,mBAA2B,EAC3B,oBAAoB,GACrB,EAAE;IACD,aAAa,CAAC,EACV,4BAA4B,GAC5B,4BAA4B,GAC5B,8BAA8B,CAAC;IACnC,mBAAmB,CAAC,EAAE,OAAO,CAAC;IAC9B,oBAAoB,CAAC,EAAE,MAAM,CAAC;CAC/B,mBAyFA"}
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/agent/chat/index.ts"],"names":[],"mappings":"AA2FA,wBAAsB,SAAS,CAAC,EAC9B,aAA4C,EAC5C,mBAA2B,EAC3B,oBAAoB,GACrB,EAAE;IACD,aAAa,CAAC,EACV,4BAA4B,GAC5B,4BAA4B,GAC5B,8BAA8B,CAAC;IACnC,mBAAmB,CAAC,EAAE,OAAO,CAAC;IAC9B,oBAAoB,CAAC,EAAE,MAAM,CAAC;CAC/B,mBAyFA"}
@@ -5,6 +5,7 @@ const chat_1 = require("@empiricalrun/llm/chat");
5
5
  const picocolors_1 = require("picocolors");
6
6
  const web_1 = require("../../bin/utils/platform/web");
7
7
  const human_in_the_loop_1 = require("../../human-in-the-loop");
8
+ const commit_and_create_pr_1 = require("../../tools/commit-and-create-pr");
8
9
  const diagnosis_fetcher_1 = require("../../tools/diagnosis-fetcher");
9
10
  const grep_1 = require("../../tools/grep");
10
11
  const test_gen_browser_1 = require("../../tools/test-gen-browser");
@@ -18,6 +19,7 @@ function getTools(selectedModel) {
18
19
  test_run_fetcher_1.fetchTestRunReportTool,
19
20
  diagnosis_fetcher_1.fetchDiagnosisReportTool,
20
21
  test_gen_browser_1.generateTestWithBrowserAgent,
22
+ commit_and_create_pr_1.commitAndPushChangesTool,
21
23
  ];
22
24
  if (selectedModel.startsWith("gemini")) {
23
25
  // Claude will have its own built-in text editor tools
@@ -2,6 +2,10 @@ import { ResponseComputerToolCall } from "openai/resources/responses/responses.m
2
2
  import type { Page } from "playwright";
3
3
  type ComputerAction = ResponseComputerToolCall.Click | ResponseComputerToolCall.DoubleClick | ResponseComputerToolCall.Drag | ResponseComputerToolCall.Keypress | ResponseComputerToolCall.Move | ResponseComputerToolCall.Screenshot | ResponseComputerToolCall.Scroll | ResponseComputerToolCall.Type | ResponseComputerToolCall.Wait;
4
4
  export declare function getScreenshot(page: Page): Promise<string>;
5
+ export declare function handlePageGoto(page: Page, url: string): Promise<{
6
+ actionSummary: string;
7
+ actionCode: string;
8
+ }>;
5
9
  export declare function handleModelAction(page: Page, action: ComputerAction): Promise<{
6
10
  actionSummary: string;
7
11
  actionCode: string;
@@ -1 +1 @@
1
- {"version":3,"file":"computer.d.ts","sourceRoot":"","sources":["../../../src/agent/cua/computer.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,wBAAwB,EAAE,MAAM,0CAA0C,CAAC;AACpF,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAEvC,KAAK,cAAc,GACf,wBAAwB,CAAC,KAAK,GAC9B,wBAAwB,CAAC,WAAW,GACpC,wBAAwB,CAAC,IAAI,GAC7B,wBAAwB,CAAC,QAAQ,GACjC,wBAAwB,CAAC,IAAI,GAC7B,wBAAwB,CAAC,UAAU,GACnC,wBAAwB,CAAC,MAAM,GAC/B,wBAAwB,CAAC,IAAI,GAC7B,wBAAwB,CAAC,IAAI,CAAC;AAElC,wBAAsB,aAAa,CAAC,IAAI,EAAE,IAAI,mBAG7C;AAgCD,wBAAsB,iBAAiB,CACrC,IAAI,EAAE,IAAI,EACV,MAAM,EAAE,cAAc,GACrB,OAAO,CAAC;IACT,aAAa,EAAE,MAAM,CAAC;IACtB,UAAU,EAAE,MAAM,CAAC;CACpB,CAAC,CAqID"}
1
+ {"version":3,"file":"computer.d.ts","sourceRoot":"","sources":["../../../src/agent/cua/computer.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,wBAAwB,EAAE,MAAM,0CAA0C,CAAC;AACpF,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAEvC,KAAK,cAAc,GACf,wBAAwB,CAAC,KAAK,GAC9B,wBAAwB,CAAC,WAAW,GACpC,wBAAwB,CAAC,IAAI,GAC7B,wBAAwB,CAAC,QAAQ,GACjC,wBAAwB,CAAC,IAAI,GAC7B,wBAAwB,CAAC,UAAU,GACnC,wBAAwB,CAAC,MAAM,GAC/B,wBAAwB,CAAC,IAAI,GAC7B,wBAAwB,CAAC,IAAI,CAAC;AAElC,wBAAsB,aAAa,CAAC,IAAI,EAAE,IAAI,mBAG7C;AAgCD,wBAAsB,cAAc,CAClC,IAAI,EAAE,IAAI,EACV,GAAG,EAAE,MAAM,GACV,OAAO,CAAC;IACT,aAAa,EAAE,MAAM,CAAC;IACtB,UAAU,EAAE,MAAM,CAAC;CACpB,CAAC,CAMD;AAED,wBAAsB,iBAAiB,CACrC,IAAI,EAAE,IAAI,EACV,MAAM,EAAE,cAAc,GACrB,OAAO,CAAC;IACT,aAAa,EAAE,MAAM,CAAC;IACtB,UAAU,EAAE,MAAM,CAAC;CACpB,CAAC,CAqID"}
@@ -1,6 +1,6 @@
1
1
  "use strict";
2
2
  Object.defineProperty(exports, "__esModule", { value: true });
3
- exports.handleModelAction = exports.getScreenshot = void 0;
3
+ exports.handleModelAction = exports.handlePageGoto = exports.getScreenshot = void 0;
4
4
  async function getScreenshot(page) {
5
5
  const screenshotBytes = await page.screenshot();
6
6
  return Buffer.from(screenshotBytes).toString("base64");
@@ -35,6 +35,14 @@ const CUA_KEY_TO_PLAYWRIGHT_KEY = {
35
35
  tab: "Tab",
36
36
  win: "Meta",
37
37
  };
38
+ async function handlePageGoto(page, url) {
39
+ await page.goto(url);
40
+ return {
41
+ actionSummary: `Navigated page to ${url}`,
42
+ actionCode: `await page.goto("${url}");\n`,
43
+ };
44
+ }
45
+ exports.handlePageGoto = handlePageGoto;
38
46
  async function handleModelAction(page, action) {
39
47
  const actionType = action.type;
40
48
  let actionCode = "";
@@ -1 +1 @@
1
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/agent/cua/index.ts"],"names":[],"mappings":"AAOA,OAAO,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAOlC,wBAAsB,sBAAsB,CAAC,IAAI,EAAE,IAAI,iBAoBtD;AAED,wBAAsB,+BAA+B,CAAC,EACpD,IAAI,EACJ,IAAI,GACL,EAAE;IACD,IAAI,EAAE,IAAI,CAAC;IACX,IAAI,EAAE,MAAM,CAAC;CACd,GAAG,OAAO,CAAC;IACV,IAAI,EAAE,MAAM,CAAC;IACb,WAAW,EAAE,MAAM,EAAE,CAAC;IACtB,cAAc,EAAE,MAAM,CAAC;CACxB,CAAC,CA2JD"}
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/agent/cua/index.ts"],"names":[],"mappings":"AASA,OAAO,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAOlC,wBAAsB,sBAAsB,CAAC,IAAI,EAAE,IAAI,iBAoBtD;AAED,wBAAsB,+BAA+B,CAAC,EACpD,IAAI,EACJ,IAAI,GACL,EAAE;IACD,IAAI,EAAE,IAAI,CAAC;IACX,IAAI,EAAE,MAAM,CAAC;CACd,GAAG,OAAO,CAAC;IACV,IAAI,EAAE,MAAM,CAAC;IACb,WAAW,EAAE,MAAM,EAAE,CAAC;IACtB,cAAc,EAAE,MAAM,CAAC;CACxB,CAAC,CAmMD"}
@@ -58,7 +58,7 @@ async function createTestUsingComputerUseAgent({ page, task, }) {
58
58
  content: [
59
59
  {
60
60
  type: "input_text",
61
- text: task,
61
+ text: `Task to execute: ${task}\n\nCurrent page URL: ${page.url()}`,
62
62
  },
63
63
  {
64
64
  type: "input_image",
@@ -85,7 +85,8 @@ async function createTestUsingComputerUseAgent({ page, task, }) {
85
85
  input: { response },
86
86
  });
87
87
  const computerCalls = response.output.filter((item) => item.type === "computer_call");
88
- if (computerCalls.length === 0) {
88
+ const functionCalls = response.output.filter((item) => item.type === "function_call");
89
+ if (computerCalls.length === 0 && functionCalls.length === 0) {
89
90
  const assistantOutput = response.output.find((item) => item.type === "message");
90
91
  if (assistantOutput) {
91
92
  const content = assistantOutput.content.find((item) => item.type === "output_text");
@@ -105,46 +106,75 @@ async function createTestUsingComputerUseAgent({ page, task, }) {
105
106
  actionsSummary.push(`Action reasoning: ${summaryText}`);
106
107
  }
107
108
  }
108
- // We expect at most one computer call per response.
109
- const computerCall = computerCalls[0];
110
- const lastCallId = computerCall.call_id;
111
- const action = computerCall.action;
112
- const pendingSafetyChecks = computerCall.pending_safety_checks;
113
- // Execute the action and take a screenshot
114
- const { actionSummary, actionCode } = await (0, computer_1.handleModelAction)(page, action);
115
- actionsSummary.push(`Action executed: ${actionSummary}`);
116
- if (actionCode) {
117
- actionsSummary.push(`Generated code: ${actionCode}`);
118
- generatedCode += actionCode;
109
+ // We expect either a function call or a computer call in the response.
110
+ let toolCallOutput;
111
+ let executedActionSummary = "";
112
+ // We are assuming only one function call per response
113
+ const functionCall = functionCalls[0];
114
+ if (functionCall) {
115
+ const args = JSON.parse(functionCall.arguments);
116
+ const { actionSummary, actionCode } = await (0, computer_1.handlePageGoto)(page, args.url);
117
+ executedActionSummary = actionSummary;
118
+ actionsSummary.push(`Action executed: ${actionSummary}`);
119
+ if (actionCode) {
120
+ actionsSummary.push(`Generated code: ${actionCode}`);
121
+ generatedCode += actionCode;
122
+ }
123
+ toolCallOutput = {
124
+ type: "function_call_output",
125
+ call_id: functionCall.call_id,
126
+ output: `Navigating page to ${args.url}`,
127
+ };
128
+ }
129
+ else if (computerCalls.length >= 1) {
130
+ // We expect at most one computer call per response.
131
+ const computerCall = computerCalls[0];
132
+ const action = computerCall.action;
133
+ // Execute the action and take a screenshot
134
+ const { actionSummary, actionCode } = await (0, computer_1.handleModelAction)(page, action);
135
+ executedActionSummary = actionSummary;
136
+ actionsSummary.push(`Action executed: ${actionSummary}`);
137
+ if (actionCode) {
138
+ actionsSummary.push(`Generated code: ${actionCode}`);
139
+ generatedCode += actionCode;
140
+ }
141
+ else {
142
+ actionsSummary.push(`No code generated: Will rely on Playwright's ability to auto-wait or auto-scroll`);
143
+ }
144
+ // Allow time for changes to take effect.
145
+ await new Promise((resolve) => setTimeout(resolve, 1000));
146
+ const screenshotBytes = await (0, computer_1.getScreenshot)(page);
147
+ // Populate toolCallOutput
148
+ toolCallOutput = {
149
+ type: "computer_call_output",
150
+ call_id: computerCall.call_id,
151
+ output: {
152
+ type: "computer_screenshot",
153
+ image_url: `data:image/png;base64,${screenshotBytes}`,
154
+ },
155
+ acknowledged_safety_checks: computerCall.pending_safety_checks,
156
+ };
119
157
  }
120
158
  else {
121
- actionsSummary.push(`No code generated: Will rely on Playwright's ability to auto-wait or auto-scroll`);
159
+ throw new Error("No tool call found in response.");
122
160
  }
123
- // Allow time for changes to take effect.
124
- await new Promise((resolve) => setTimeout(resolve, 1000));
125
- const screenshotBytes = await (0, computer_1.getScreenshot)(page);
126
- // Send the screenshot back as a computer_call_output
127
- const computerCallSpan = iterationSpan?.span({
128
- name: "computer-call-output",
129
- input: { lastCallId, acknowledged_safety_checks: pendingSafetyChecks },
130
- });
131
161
  response = await (0, model_1.callComputerUseModel)({
132
162
  previousResponseId: response.id,
133
163
  input: [
164
+ toolCallOutput,
134
165
  {
135
- call_id: lastCallId,
136
- type: "computer_call_output",
137
- output: {
138
- type: "computer_screenshot",
139
- image_url: `data:image/png;base64,${screenshotBytes}`,
140
- },
141
- acknowledged_safety_checks: pendingSafetyChecks,
166
+ role: "user",
167
+ content: [
168
+ {
169
+ type: "input_text",
170
+ text: `Action executed: ${executedActionSummary || "None"}\nCurrent page URL: ${page.url()}`,
171
+ },
172
+ ],
142
173
  },
143
174
  ],
144
175
  screenWidth,
145
176
  screenHeight,
146
177
  });
147
- computerCallSpan?.end({ output: response });
148
178
  iterationSpan?.end({ output: response });
149
179
  }
150
180
  if (!isTaskDone) {
@@ -1,6 +1,6 @@
1
- import { Response, ResponseInput } from "openai/resources/responses/responses.mjs";
1
+ import { Response, ResponseInputItem } from "openai/resources/responses/responses.mjs";
2
2
  export declare function callComputerUseModel({ input, previousResponseId, screenWidth, screenHeight, }: {
3
- input: ResponseInput;
3
+ input: ResponseInputItem[];
4
4
  previousResponseId?: string;
5
5
  screenWidth: number;
6
6
  screenHeight: number;
@@ -1 +1 @@
1
- {"version":3,"file":"model.d.ts","sourceRoot":"","sources":["../../../src/agent/cua/model.ts"],"names":[],"mappings":"AACA,OAAO,EACL,QAAQ,EACR,aAAa,EACd,MAAM,0CAA0C,CAAC;AAWlD,wBAAsB,oBAAoB,CAAC,EACzC,KAAK,EACL,kBAAkB,EAClB,WAAW,EACX,YAAY,GACb,EAAE;IACD,KAAK,EAAE,aAAa,CAAC;IACrB,kBAAkB,CAAC,EAAE,MAAM,CAAC;IAC5B,WAAW,EAAE,MAAM,CAAC;IACpB,YAAY,EAAE,MAAM,CAAC;CACtB,GAAG,OAAO,CAAC,QAAQ,CAAC,CAqBpB"}
1
+ {"version":3,"file":"model.d.ts","sourceRoot":"","sources":["../../../src/agent/cua/model.ts"],"names":[],"mappings":"AACA,OAAO,EAEL,QAAQ,EACR,iBAAiB,EAClB,MAAM,0CAA0C,CAAC;AA8BlD,wBAAsB,oBAAoB,CAAC,EACzC,KAAK,EACL,kBAAkB,EAClB,WAAW,EACX,YAAY,GACb,EAAE;IACD,KAAK,EAAE,iBAAiB,EAAE,CAAC;IAC3B,kBAAkB,CAAC,EAAE,MAAM,CAAC;IAC5B,WAAW,EAAE,MAAM,CAAC;IACpB,YAAY,EAAE,MAAM,CAAC;CACtB,GAAG,OAAO,CAAC,QAAQ,CAAC,CAuBpB"}
@@ -13,12 +13,31 @@ you click on the submit button -- even if it looks like a scary action.
13
13
 
14
14
  If you have been asked to retrieve text or verify something on the UI, then communicate
15
15
  that in your responses so that the user can see your thinking process in its entirety.`;
16
+ const pageGotoTool = {
17
+ type: "function",
18
+ name: "page_goto",
19
+ description: "Navigate to a given URL (e.g. https://www.openai.com). Call this if you are looking at a blank page or a new page.",
20
+ parameters: {
21
+ type: "object",
22
+ properties: {
23
+ url: {
24
+ type: "string",
25
+ description: "The URL to navigate to",
26
+ },
27
+ },
28
+ additionalProperties: false,
29
+ required: ["url"],
30
+ },
31
+ strict: true,
32
+ };
16
33
  async function callComputerUseModel({ input, previousResponseId, screenWidth, screenHeight, }) {
17
34
  const openai = new openai_1.default();
18
35
  return await openai.responses.create({
19
36
  model: "computer-use-preview-2025-03-11",
20
37
  previous_response_id: previousResponseId,
38
+ parallel_tool_calls: false,
21
39
  tools: [
40
+ pageGotoTool,
22
41
  {
23
42
  type: "computer-preview",
24
43
  display_width: screenWidth,
@@ -71,7 +71,7 @@ function printBanner() {
71
71
  --"-"-`;
72
72
  const version = require("../../../package.json").version;
73
73
  const logLine1 = `Running test-gen v${version}`;
74
- const logLine2 = `from ${__dirname}`;
74
+ const logLine2 = `from ${__dirname.split("/bin/utils")[0]}`;
75
75
  // Process ASCII art
76
76
  const asciiLines = asciiArtRaw
77
77
  .split("\n")
@@ -0,0 +1,3 @@
1
+ import type { Tool } from "@empiricalrun/llm/chat";
2
+ export declare const commitAndPushChangesTool: Tool;
3
+ //# sourceMappingURL=commit-and-create-pr.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"commit-and-create-pr.d.ts","sourceRoot":"","sources":["../../src/tools/commit-and-create-pr.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,wBAAwB,CAAC;AAyBnD,eAAO,MAAM,wBAAwB,EAAE,IAwFtC,CAAC"}
@@ -0,0 +1,102 @@
1
+ "use strict";
2
+ var __importDefault = (this && this.__importDefault) || function (mod) {
3
+ return (mod && mod.__esModule) ? mod : { "default": mod };
4
+ };
5
+ Object.defineProperty(exports, "__esModule", { value: true });
6
+ exports.commitAndPushChangesTool = void 0;
7
+ const child_process_1 = require("child_process");
8
+ const crypto_1 = __importDefault(require("crypto"));
9
+ const zod_1 = require("zod");
10
+ const utils_1 = require("./utils");
11
+ const GIT_USER_NAME = "empiricalrun[bot]";
12
+ const GIT_USER_EMAIL = "180257021+empiricalrun[bot]@users.noreply.github.com";
13
+ const CommitAndPushChangesSchema = zod_1.z.object({
14
+ commitMessage: zod_1.z
15
+ .string()
16
+ .describe("A short message to use for the commit. Should not be more than 8 words. Should follow conventional commit format."),
17
+ });
18
+ exports.commitAndPushChangesTool = {
19
+ schema: {
20
+ name: "commitAndPushChanges",
21
+ description: `Creates a commit with all modified files and pushes them to the current branch.
22
+ If currently on main branch, creates a new branch with a random name.
23
+ If the current branch already has an open PR, commits and pushes changes to that PR.
24
+ Uses the empiricalrun[bot] credentials for git operations.
25
+ Returns the URL of the created or updated pull request.`,
26
+ parameters: CommitAndPushChangesSchema,
27
+ },
28
+ execute: async (input) => {
29
+ try {
30
+ const { commitMessage } = input;
31
+ const currentBranch = (0, child_process_1.execSync)("git rev-parse --abbrev-ref HEAD")
32
+ .toString()
33
+ .trim();
34
+ let branchName = currentBranch;
35
+ if (currentBranch === "main") {
36
+ // If on main, create a new branch
37
+ const randomId = crypto_1.default.randomUUID().substring(0, 8);
38
+ branchName = `branch-${randomId}`;
39
+ (0, child_process_1.execSync)(`git checkout -b ${branchName}`);
40
+ }
41
+ const modifiedFiles = (0, child_process_1.execSync)("git status --porcelain")
42
+ .toString()
43
+ .split("\n")
44
+ .filter((line) => line && !line.includes(".bak"))
45
+ .map((line) => line.substring(3)); // Remove status prefix
46
+ if (modifiedFiles.length === 0) {
47
+ return {
48
+ isError: true,
49
+ result: "No modified files to commit",
50
+ };
51
+ }
52
+ for (const file of modifiedFiles) {
53
+ (0, child_process_1.execSync)(`git add "${file}"`);
54
+ }
55
+ // Use -c flag to set config just for this commit
56
+ (0, child_process_1.execSync)(`git -c user.name="${GIT_USER_NAME}" -c user.email="${GIT_USER_EMAIL}" commit -m "${commitMessage} [skip ci]"`);
57
+ const repoUrl = (0, child_process_1.execSync)("git config --get remote.origin.url")
58
+ .toString()
59
+ .trim();
60
+ const [owner, repo] = repoUrl
61
+ .replace("https://github.com/", "")
62
+ .replace(".git", "")
63
+ .split("/");
64
+ const existingPRs = (await (0, utils_1.callGitHubProxy)({
65
+ method: "GET",
66
+ url: `https://api.github.com/repos/${owner}/${repo}/pulls`,
67
+ body: {
68
+ head: `${owner}:${branchName}`,
69
+ state: "open",
70
+ },
71
+ }));
72
+ (0, child_process_1.execSync)(`git push origin ${branchName} --set-upstream`);
73
+ const existingPR = existingPRs?.find((pr) => pr.head.ref === branchName);
74
+ if (existingPR) {
75
+ return {
76
+ isError: false,
77
+ result: `Committed and pushed changes to existing PR: ${existingPR.html_url}`,
78
+ };
79
+ }
80
+ const pr = (await (0, utils_1.callGitHubProxy)({
81
+ method: "POST",
82
+ url: `https://api.github.com/repos/${owner}/${repo}/pulls`,
83
+ body: {
84
+ title: commitMessage,
85
+ head: branchName,
86
+ base: "main",
87
+ body: "Created via CommitAndPushChanges tool",
88
+ },
89
+ }));
90
+ return {
91
+ isError: false,
92
+ result: `Committed and pushed changes to new PR: ${pr.html_url}`,
93
+ };
94
+ }
95
+ catch (error) {
96
+ return {
97
+ isError: true,
98
+ result: `Failed to commit and push changes: ${error instanceof Error ? error.message : String(error)}`,
99
+ };
100
+ }
101
+ },
102
+ };
@@ -1 +1 @@
1
- {"version":3,"file":"diagnosis-fetcher.d.ts","sourceRoot":"","sources":["../../src/tools/diagnosis-fetcher.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,wBAAwB,CAAC;AAanD,eAAO,MAAM,wBAAwB,EAAE,IAgFtC,CAAC"}
1
+ {"version":3,"file":"diagnosis-fetcher.d.ts","sourceRoot":"","sources":["../../src/tools/diagnosis-fetcher.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,wBAAwB,CAAC;AAenD,eAAO,MAAM,wBAAwB,EAAE,IAgFtC,CAAC"}
@@ -7,6 +7,7 @@ exports.fetchDiagnosisReportTool = void 0;
7
7
  const promises_1 = __importDefault(require("fs/promises"));
8
8
  const path_1 = __importDefault(require("path"));
9
9
  const zod_1 = require("zod");
10
+ const utils_1 = require("./utils");
10
11
  const DiagnosisSchema = zod_1.z.object({
11
12
  diagnosisUrl: zod_1.z
12
13
  .string()
@@ -23,21 +24,23 @@ exports.fetchDiagnosisReportTool = {
23
24
  // Extract the slug from the URL - it's the part after the last '--'
24
25
  const slug = diagnosisUrl.split("--").pop();
25
26
  if (!slug) {
26
- throw new Error("Invalid diagnosis URL - could not extract slug");
27
+ return {
28
+ isError: true,
29
+ result: "Invalid diagnosis URL - could not extract slug",
30
+ };
31
+ }
32
+ let data = null;
33
+ try {
34
+ data = await (0, utils_1.makeDashboardRequest)({
35
+ path: `/api/diagnosis/${slug}/detailed`,
36
+ });
27
37
  }
28
- const response = await fetch(`https://dash.empirical.run/api/diagnosis/${slug}/detailed`, {
29
- method: "GET",
30
- headers: {
31
- Authorization: "weQPMWKT",
32
- },
33
- });
34
- if (!response.ok) {
38
+ catch (error) {
35
39
  return {
36
- result: `Failed to fetch diagnosis details: ${response.statusText}`,
37
40
  isError: true,
41
+ result: `Failed to fetch diagnosis details: ${error instanceof Error ? error.message : String(error)}`,
38
42
  };
39
43
  }
40
- const data = await response.json();
41
44
  const { test_case, diagnosis } = data.data;
42
45
  const project = diagnosis?.test_project || "unknown";
43
46
  const sourceContext = await promises_1.default.readFile(path_1.default.join("tests", test_case.file_path), "utf-8");
@@ -1 +1 @@
1
- {"version":3,"file":"test-gen-browser.d.ts","sourceRoot":"","sources":["../../src/tools/test-gen-browser.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,wBAAwB,CAAC;AAwEnD,eAAO,MAAM,4BAA4B,EAAE,IA0E1C,CAAC"}
1
+ {"version":3,"file":"test-gen-browser.d.ts","sourceRoot":"","sources":["../../src/tools/test-gen-browser.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,wBAAwB,CAAC;AA8EnD,eAAO,MAAM,4BAA4B,EAAE,IA0E1C,CAAC"}
@@ -29,16 +29,22 @@ and generate Playwright code for that actions. This is a useful tool when the mo
29
29
  locator/selector for an element on the page.
30
30
 
31
31
  IMPORTANT: Before you invoke this tool, you need to ensure that the test code is correctly prepared for this
32
- agent. Preparation involves adding a TODO comment that describes the change that needs to be made. A good
33
- comment calls out the element and browser interactions it must take. The TODO comment also has (agent) next to it, to
34
- clearly label that the change is for the agent to make.
32
+ agent. Preparation involves adding a TODO comment that describes the change that needs to be made, and the page
33
+ variable name where the actions must be performed. The content of the TODO comment calls out the element and browser
34
+ interactions it must take. The TODO comment also has (agent on pageName) next to it, to clearly label that the change
35
+ is for the agent to make on the given page (pageName in this case).
36
+
37
+ To choose the page variable name, go through the test code and find available page variables. If you are replacing some
38
+ existing test code, use the same page variable name as in the existing test code. If you are adding steps to the test,
39
+ use the page variable name that is appropriate for the new steps. The page variable represents the browser page (or tab) that
40
+ the agent is supposed to interact with.
35
41
 
36
42
  For example, this is a good TODO comment:
37
43
 
38
44
  \`\`\`
39
45
  test("Example test code", async ({ page }) => {
40
46
  await page.goto("https://example.com");
41
- // TODO(agent): Click on the login button
47
+ // TODO(agent on page): Click on the login button
42
48
  });
43
49
  \`\`\`
44
50
 
@@ -53,7 +59,7 @@ For example, this is invalid:
53
59
  \`\`\`
54
60
  await extPage
55
61
  .getByTestId("virtuoso-item-list")
56
- // TODO(agent): Click on the STARS button
62
+ // TODO(agent on extPage): Click on the STARS button
57
63
  .getByText("STARS", { exact: true })
58
64
  .click();
59
65
  // This is invalid, because the TODO is in the middle of a multi-line statement
@@ -1 +1 @@
1
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/tools/test-run-fetcher/index.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,wBAAwB,CAAC;AAanD,wBAAgB,0BAA0B,CAAC,QAAQ,EAAE,MAAM,GAAG,MAAM,CAOnE;AAED,eAAO,MAAM,sBAAsB,EAAE,IA4HpC,CAAC"}
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/tools/test-run-fetcher/index.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,wBAAwB,CAAC;AAcnD,wBAAgB,0BAA0B,CAAC,QAAQ,EAAE,MAAM,GAAG,MAAM,CAOnE;AAED,eAAO,MAAM,sBAAsB,EAAE,IAwHpC,CAAC"}
@@ -2,6 +2,7 @@
2
2
  Object.defineProperty(exports, "__esModule", { value: true });
3
3
  exports.fetchTestRunReportTool = exports.extractPathAfterSourceRepo = void 0;
4
4
  const zod_1 = require("zod");
5
+ const utils_1 = require("../utils");
5
6
  const TestRunSchema = zod_1.z.object({
6
7
  testRunUrl: zod_1.z
7
8
  .string()
@@ -32,22 +33,23 @@ exports.fetchTestRunReportTool = {
32
33
  const runId = urlParts.pop(); // Last part is the run ID
33
34
  const repoName = urlParts[urlParts.length - 2]; // Second to last part is the repo name
34
35
  if (!runId || !repoName) {
35
- throw new Error("Invalid test run URL - could not extract run ID or repo name");
36
+ return {
37
+ isError: true,
38
+ result: "Invalid test run URL - could not extract run ID or repo name",
39
+ };
40
+ }
41
+ let data = null;
42
+ try {
43
+ data = await (0, utils_1.makeDashboardRequest)({
44
+ path: `/api/test-runs/${runId}?repo_name=${repoName}`,
45
+ });
36
46
  }
37
- // Make the API call to fetch test run details
38
- const response = await fetch(`https://dash.empirical.run/api/test-runs/${runId}?repo_name=${repoName}`, {
39
- method: "GET",
40
- headers: {
41
- Authorization: "weQPMWKT",
42
- },
43
- });
44
- if (!response.ok) {
47
+ catch (error) {
45
48
  return {
46
- result: `Failed to fetch test run details: ${response.statusText}`,
47
49
  isError: true,
50
+ result: `Failed to fetch test run details: ${error instanceof Error ? error.message : String(error)}`,
48
51
  };
49
52
  }
50
- const data = (await response.json());
51
53
  // To efficiently use input_tokens, we
52
54
  // 1. Truncate stack trace to last 300 characters
53
55
  // 2. Remove request/response headers from network metadata
@@ -0,0 +1,11 @@
1
+ export declare function makeDashboardRequest<T>({ path, method, body, }: {
2
+ path: string;
3
+ method?: string;
4
+ body?: any;
5
+ }): Promise<T>;
6
+ export declare function callGitHubProxy({ method, url, body, }: {
7
+ method: string;
8
+ url: string;
9
+ body?: any;
10
+ }): Promise<unknown>;
11
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/tools/utils/index.ts"],"names":[],"mappings":"AAAA,wBAAsB,oBAAoB,CAAC,CAAC,EAAE,EAC5C,IAAI,EACJ,MAAc,EACd,IAAI,GACL,EAAE;IACD,IAAI,EAAE,MAAM,CAAC;IACb,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,IAAI,CAAC,EAAE,GAAG,CAAC;CACZ,GAAG,OAAO,CAAC,CAAC,CAAC,CAoBb;AAED,wBAAsB,eAAe,CAAC,EACpC,MAAM,EACN,GAAG,EACH,IAAI,GACL,EAAE;IACD,MAAM,EAAE,MAAM,CAAC;IACf,GAAG,EAAE,MAAM,CAAC;IACZ,IAAI,CAAC,EAAE,GAAG,CAAC;CACZ,oBAWA"}
@@ -0,0 +1,36 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.callGitHubProxy = exports.makeDashboardRequest = void 0;
4
+ async function makeDashboardRequest({ path, method = "GET", body, }) {
5
+ const requestHeaders = {
6
+ "Content-Type": "application/json",
7
+ // TODO: Move to env variable for authentication
8
+ Authorization: "weQPMWKT",
9
+ "User-Agent": "empiricalrun/test-gen",
10
+ };
11
+ const baseUrl = "https://dash.empirical.run";
12
+ const response = await fetch(`${baseUrl}${path}`, {
13
+ method,
14
+ headers: requestHeaders,
15
+ ...(body && { body: JSON.stringify(body) }),
16
+ });
17
+ if (!response.ok) {
18
+ const errorBody = await response.text();
19
+ throw new Error(`API request failed for ${method} ${path} (Status: ${response.status}). Body: ${errorBody}`);
20
+ }
21
+ return await response.json();
22
+ }
23
+ exports.makeDashboardRequest = makeDashboardRequest;
24
+ async function callGitHubProxy({ method, url, body, }) {
25
+ const githubApiPath = url.replace("https://api.github.com", "");
26
+ return makeDashboardRequest({
27
+ path: "/api/github/proxy",
28
+ method: "POST",
29
+ body: {
30
+ method,
31
+ url: githubApiPath,
32
+ body,
33
+ },
34
+ });
35
+ }
36
+ exports.callGitHubProxy = callGitHubProxy;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@empiricalrun/test-gen",
3
- "version": "0.53.2",
3
+ "version": "0.53.4",
4
4
  "publishConfig": {
5
5
  "registry": "https://registry.npmjs.org/",
6
6
  "access": "public"
@@ -68,7 +68,7 @@
68
68
  "tsx": "^4.16.2",
69
69
  "typescript": "^5.3.3",
70
70
  "zod": "^3.23.8",
71
- "@empiricalrun/llm": "^0.14.2",
71
+ "@empiricalrun/llm": "^0.14.3",
72
72
  "@empiricalrun/r2-uploader": "^0.3.8",
73
73
  "@empiricalrun/test-run": "^0.7.6"
74
74
  },