@empiricalrun/test-gen 0.46.8 → 0.46.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +14 -0
- package/dist/actions/goto.d.ts.map +1 -1
- package/dist/actions/goto.js +1 -1
- package/dist/agent/browsing/index.d.ts +1 -1
- package/dist/agent/browsing/index.d.ts.map +1 -1
- package/dist/agent/browsing/index.js +12 -16
- package/dist/agent/master/browser-tests/skills.spec.js +4 -2
- package/dist/agent/master/execute-browser-action.d.ts +5 -0
- package/dist/agent/master/execute-browser-action.d.ts.map +1 -1
- package/dist/agent/master/execute-browser-action.js +1 -0
- package/dist/agent/master/run.d.ts.map +1 -1
- package/dist/agent/master/run.js +8 -3
- package/dist/agent/planner/run-time-planner.d.ts +4 -2
- package/dist/agent/planner/run-time-planner.d.ts.map +1 -1
- package/dist/agent/planner/run-time-planner.js +17 -7
- package/package.json +3 -3
- package/dist/agent/browsing/o1-completion.d.ts +0 -8
- package/dist/agent/browsing/o1-completion.d.ts.map +0 -1
- package/dist/agent/browsing/o1-completion.js +0 -72
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,19 @@
|
|
|
1
1
|
# @empiricalrun/test-gen
|
|
2
2
|
|
|
3
|
+
## 0.46.10
|
|
4
|
+
|
|
5
|
+
### Patch Changes
|
|
6
|
+
|
|
7
|
+
- b3ca74f: fix: bubble up browsing agent execution feedback for runtime planner
|
|
8
|
+
- b484292: fix: pages summary for runtime planner works without scoped vars
|
|
9
|
+
|
|
10
|
+
## 0.46.9
|
|
11
|
+
|
|
12
|
+
### Patch Changes
|
|
13
|
+
|
|
14
|
+
- 6914e32: fix: use o3-mini to improve browsing agent accuracy
|
|
15
|
+
- 22ef805: fix: improve accuracy for multiple pages in runtime planner
|
|
16
|
+
|
|
3
17
|
## 0.46.8
|
|
4
18
|
|
|
5
19
|
### Patch Changes
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"goto.d.ts","sourceRoot":"","sources":["../../src/actions/goto.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,cAAc,EAAE,yBAAyB,EAAE,MAAM,UAAU,CAAC;AAGrE,eAAO,MAAM,2BAA2B,cAAc,CAAC;AAEvD,eAAO,MAAM,mBAAmB,EAAE,yBAAyB,CAAC,cAAc,
|
|
1
|
+
{"version":3,"file":"goto.d.ts","sourceRoot":"","sources":["../../src/actions/goto.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,cAAc,EAAE,yBAAyB,EAAE,MAAM,UAAU,CAAC;AAGrE,eAAO,MAAM,2BAA2B,cAAc,CAAC;AAEvD,eAAO,MAAM,mBAAmB,EAAE,yBAAyB,CAAC,cAAc,CA0CzE,CAAC"}
|
package/dist/actions/goto.js
CHANGED
|
@@ -35,7 +35,7 @@ const gotoActionGenerator = (page) => {
|
|
|
35
35
|
},
|
|
36
36
|
url: {
|
|
37
37
|
type: "string",
|
|
38
|
-
description: "URL to navigate page to",
|
|
38
|
+
description: "URL to navigate page to. Must be prefixed with http or https (preferably https)",
|
|
39
39
|
},
|
|
40
40
|
},
|
|
41
41
|
required: ["reason", "url"],
|
|
@@ -9,7 +9,7 @@ export declare function executeTaskUsingBrowsingAgent({ action, page, actions, l
|
|
|
9
9
|
action: string;
|
|
10
10
|
page: Page;
|
|
11
11
|
actions: PlaywrightActions;
|
|
12
|
-
llm
|
|
12
|
+
llm?: LLM;
|
|
13
13
|
trace?: TraceClient;
|
|
14
14
|
}): Promise<BrowserAgentAction | undefined>;
|
|
15
15
|
//# sourceMappingURL=index.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/agent/browsing/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAiB,GAAG,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AACpE,OAAO,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAElC,OAAO,EAAE,iBAAiB,EAAE,MAAM,eAAe,CAAC;
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/agent/browsing/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAiB,GAAG,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AACpE,OAAO,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAElC,OAAO,EAAE,iBAAiB,EAAE,MAAM,eAAe,CAAC;AAOlD,MAAM,MAAM,kBAAkB,GAAG;IAC/B,MAAM,EAAE,MAAM,CAAC;IACf,IAAI,EAAE,MAAM,GAAG,SAAS,CAAC;CAC1B,CAAC;AAEF,wBAAsB,6BAA6B,CAAC,EAClD,MAAM,EACN,IAAI,EACJ,OAAO,EACP,GAAG,EACH,KAAK,GACN,EAAE;IACD,MAAM,EAAE,MAAM,CAAC;IACf,IAAI,EAAE,IAAI,CAAC;IACX,OAAO,EAAE,iBAAiB,CAAC;IAC3B,GAAG,CAAC,EAAE,GAAG,CAAC;IACV,KAAK,CAAC,EAAE,WAAW,CAAC;CACrB,GAAG,OAAO,CAAC,kBAAkB,GAAG,SAAS,CAAC,CA8D1C"}
|
|
@@ -7,7 +7,6 @@ const promptTemplate_0 = "{{#section \"system\"}}\nYou are a browser automation
|
|
|
7
7
|
const reporter_1 = require("../../reporter");
|
|
8
8
|
const html_1 = require("../../utils/html");
|
|
9
9
|
const utils_1 = require("../utils");
|
|
10
|
-
const o1_completion_1 = require("./o1-completion");
|
|
11
10
|
async function executeTaskUsingBrowsingAgent({ action, page, actions, llm, trace, }) {
|
|
12
11
|
let generatedCodeSteps;
|
|
13
12
|
const tools = actions.getBrowsingActionSchemas();
|
|
@@ -31,24 +30,20 @@ async function executeTaskUsingBrowsingAgent({ action, page, actions, llm, trace
|
|
|
31
30
|
const promptSpan = browsingAgentSpan?.span({ name: "page-prompt" });
|
|
32
31
|
const messages = (0, llm_1.compilePrompt)(promptTemplate_0, { pageSnapshot, task: action });
|
|
33
32
|
promptSpan?.end({ output: { messages } });
|
|
34
|
-
|
|
35
|
-
|
|
33
|
+
llm =
|
|
34
|
+
llm ||
|
|
35
|
+
new llm_1.LLM({
|
|
36
|
+
provider: constants_1.DEFAULT_MODEL_PROVIDER,
|
|
37
|
+
});
|
|
38
|
+
const completion = await llm.createChatCompletion({
|
|
39
|
+
model: "o3-mini-2025-01-31",
|
|
36
40
|
messages,
|
|
37
41
|
tools,
|
|
38
42
|
trace: browsingAgentSpan,
|
|
43
|
+
modelParameters: {
|
|
44
|
+
tool_choice: "required",
|
|
45
|
+
},
|
|
39
46
|
});
|
|
40
|
-
// If O1 completion fails due to any reason, resort to old flow
|
|
41
|
-
if (!completion) {
|
|
42
|
-
completion = await llm.createChatCompletion({
|
|
43
|
-
messages,
|
|
44
|
-
tools,
|
|
45
|
-
trace: browsingAgentSpan,
|
|
46
|
-
modelParameters: {
|
|
47
|
-
...constants_1.DEFAULT_MODEL_PARAMETERS,
|
|
48
|
-
tool_choice: "required",
|
|
49
|
-
},
|
|
50
|
-
});
|
|
51
|
-
}
|
|
52
47
|
const toolCalls = completion?.tool_calls || [];
|
|
53
48
|
const toolCallsSpan = browsingAgentSpan?.span({ name: "tool-calls" });
|
|
54
49
|
for (const i in toolCalls) {
|
|
@@ -64,9 +59,10 @@ async function executeTaskUsingBrowsingAgent({ action, page, actions, llm, trace
|
|
|
64
59
|
}
|
|
65
60
|
catch (e) {
|
|
66
61
|
void testgenUpdatesReporter.sendMessage(e.message);
|
|
62
|
+
throw e;
|
|
67
63
|
}
|
|
68
64
|
}
|
|
69
|
-
toolCallsSpan?.end({ output: { toolCalls } });
|
|
65
|
+
toolCallsSpan?.end({ output: { toolCalls, generatedCodeSteps } });
|
|
70
66
|
return generatedCodeSteps;
|
|
71
67
|
}
|
|
72
68
|
exports.executeTaskUsingBrowsingAgent = executeTaskUsingBrowsingAgent;
|
|
@@ -81,12 +81,14 @@ fixtures_1.test.afterEach(async () => {
|
|
|
81
81
|
const blogPage = await page.context().newPage();
|
|
82
82
|
await blogPage.goto(`${server.baseURL}/blog-page.html`);
|
|
83
83
|
const response = await (0, run_1.createTestUsingMasterAgent)({
|
|
84
|
-
task: `subscribe as user@example.com on the
|
|
84
|
+
task: `we have 2 pages open inside a web browser, and your task is to subscribe as user@example.com on the blogPage`,
|
|
85
85
|
page,
|
|
86
86
|
testCase: {
|
|
87
87
|
id: 1,
|
|
88
88
|
name: "subscribe to blog",
|
|
89
|
-
steps: [
|
|
89
|
+
steps: [
|
|
90
|
+
"we have 2 pages open inside a web browser, and your task is to subscribe as user@example.com on the blogPage",
|
|
91
|
+
],
|
|
90
92
|
filePath: "blog.spec.ts",
|
|
91
93
|
suites: [],
|
|
92
94
|
},
|
|
@@ -15,5 +15,10 @@ export declare function executeBrowserAction({ page, nextAction, flags, actions,
|
|
|
15
15
|
llm: LLM;
|
|
16
16
|
}): Promise<{
|
|
17
17
|
generatedCodeSteps: string[];
|
|
18
|
+
output: {
|
|
19
|
+
action: string;
|
|
20
|
+
reason: string;
|
|
21
|
+
elementAnnotation?: string | undefined;
|
|
22
|
+
};
|
|
18
23
|
}>;
|
|
19
24
|
//# sourceMappingURL=execute-browser-action.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"execute-browser-action.d.ts","sourceRoot":"","sources":["../../../src/agent/master/execute-browser-action.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,GAAG,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AACrD,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,kBAAkB,CAAC;AAE7C,OAAO,EAAE,iBAAiB,EAAE,MAAM,eAAe,CAAC;AAYlD,wBAAsB,oBAAoB,CAAC,EACzC,IAAI,EACJ,UAAU,EACV,KAAK,EACL,OAAO,EACP,GAAG,EACH,KAAK,GACN,EAAE;IACD,IAAI,EAAE,IAAI,CAAC;IACX,UAAU,EAAE;QACV,UAAU,EAAE,MAAM,CAAC;QACnB,YAAY,EAAE,MAAM,CAAC;KACtB,CAAC;IACF,KAAK,EAAE;QACL,4BAA4B,EAAE,OAAO,CAAC;KACvC,CAAC;IACF,OAAO,EAAE,iBAAiB,CAAC;IAC3B,KAAK,EAAE,WAAW,GAAG,SAAS,CAAC;IAC/B,GAAG,EAAE,GAAG,CAAC;CACV
|
|
1
|
+
{"version":3,"file":"execute-browser-action.d.ts","sourceRoot":"","sources":["../../../src/agent/master/execute-browser-action.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,GAAG,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AACrD,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,kBAAkB,CAAC;AAE7C,OAAO,EAAE,iBAAiB,EAAE,MAAM,eAAe,CAAC;AAYlD,wBAAsB,oBAAoB,CAAC,EACzC,IAAI,EACJ,UAAU,EACV,KAAK,EACL,OAAO,EACP,GAAG,EACH,KAAK,GACN,EAAE;IACD,IAAI,EAAE,IAAI,CAAC;IACX,UAAU,EAAE;QACV,UAAU,EAAE,MAAM,CAAC;QACnB,YAAY,EAAE,MAAM,CAAC;KACtB,CAAC;IACF,KAAK,EAAE;QACL,4BAA4B,EAAE,OAAO,CAAC;KACvC,CAAC;IACF,OAAO,EAAE,iBAAiB,CAAC;IAC3B,KAAK,EAAE,WAAW,GAAG,SAAS,CAAC;IAC/B,GAAG,EAAE,GAAG,CAAC;CACV;;;gBAIW,MAAM;gBACN,MAAM;;;GAkIjB"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"run.d.ts","sourceRoot":"","sources":["../../../src/agent/master/run.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EACV,QAAQ,EACR,oBAAoB,EACrB,MAAM,4BAA4B,CAAC;AACpC,OAAO,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAelC,OAAO,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;
|
|
1
|
+
{"version":3,"file":"run.d.ts","sourceRoot":"","sources":["../../../src/agent/master/run.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EACV,QAAQ,EACR,oBAAoB,EACrB,MAAM,4BAA4B,CAAC;AACpC,OAAO,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAelC,OAAO,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AAgCxC,wBAAsB,0BAA0B,CAAC,EAC/C,IAAI,EACJ,IAAI,EACJ,QAAQ,EACR,QAAQ,EACR,OAAO,EACP,SAAS,GACV,EAAE;IACD,IAAI,EAAE,MAAM,CAAC;IACb,IAAI,EAAE,IAAI,CAAC;IACX,QAAQ,CAAC,EAAE,QAAQ,CAAC;IACpB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,OAAO,EAAE,OAAO,CAAC,oBAAoB,CAAC,CAAC;IACvC,SAAS,CAAC,EAAE,SAAS,CAAC;CACvB;;;GAwRA"}
|
package/dist/agent/master/run.js
CHANGED
|
@@ -23,7 +23,10 @@ const next_action_1 = require("./next-action");
|
|
|
23
23
|
const MAX_ERROR_COUNT = 2;
|
|
24
24
|
function getPageVariables(stateVariables) {
|
|
25
25
|
const keys = Object.keys(stateVariables);
|
|
26
|
-
|
|
26
|
+
// This checks for whether page.url() exists, which is true for all pages
|
|
27
|
+
// created by playwright actions.
|
|
28
|
+
const pageVariables = keys.filter((key) => typeof stateVariables[key] === "object" &&
|
|
29
|
+
typeof stateVariables[key]?.url === "function");
|
|
27
30
|
const pages = pageVariables.reduce((acc, key) => {
|
|
28
31
|
acc[key] = stateVariables[key];
|
|
29
32
|
return acc;
|
|
@@ -112,7 +115,7 @@ async function createTestUsingMasterAgent({ task, page, testCase, specPath, opti
|
|
|
112
115
|
task,
|
|
113
116
|
successfulActions: [...masterAgentActions],
|
|
114
117
|
pages: getPageVariables(actions.getStateVariables()),
|
|
115
|
-
currentPage: testGenPage
|
|
118
|
+
currentPage: testGenPage,
|
|
116
119
|
});
|
|
117
120
|
isGivenTaskDone = plannerResp.isDone;
|
|
118
121
|
if (isGivenTaskDone) {
|
|
@@ -188,7 +191,7 @@ async function createTestUsingMasterAgent({ task, page, testCase, specPath, opti
|
|
|
188
191
|
break;
|
|
189
192
|
}
|
|
190
193
|
default: {
|
|
191
|
-
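const { generatedCodeSteps: codeFromExecuteAction } = await (0, execute_browser_action_1.executeBrowserAction)({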
|
|
194
|
+
const result = await (0, execute_browser_action_1.executeBrowserAction)({
|
|
192
195
|
page,
|
|
193
196
|
nextAction,
|
|
194
197
|
flags: {
|
|
@@ -198,7 +201,9 @@ async function createTestUsingMasterAgent({ task, page, testCase, specPath, opti
|
|
|
198
201
|
llm,
|
|
199
202
|
trace: masterAgentActionSpan,
|
|
200
203
|
});
|
|
204
|
+
const { generatedCodeSteps: codeFromExecuteAction, output: outputFromExecuteAction, } = result;
|
|
201
205
|
generatedCodeSteps.push(...codeFromExecuteAction);
|
|
206
|
+
output = outputFromExecuteAction;
|
|
202
207
|
}
|
|
203
208
|
}
|
|
204
209
|
// resetting error count as there is a successful action
|
|
@@ -1,10 +1,12 @@
|
|
|
1
1
|
import { TraceClient } from "@empiricalrun/llm";
|
|
2
|
+
import type { Page } from "playwright";
|
|
3
|
+
import { TestGenPage } from "../../types";
|
|
2
4
|
export declare function runtimePlanner({ trace, task, successfulActions, pages, currentPage, }: {
|
|
3
5
|
trace?: TraceClient;
|
|
4
6
|
successfulActions: string[];
|
|
5
7
|
task: string;
|
|
6
|
-
pages
|
|
7
|
-
currentPage
|
|
8
|
+
pages: Record<string, Page>;
|
|
9
|
+
currentPage: TestGenPage;
|
|
8
10
|
}): Promise<{
|
|
9
11
|
pageName: string;
|
|
10
12
|
isDone: boolean;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"run-time-planner.d.ts","sourceRoot":"","sources":["../../../src/agent/planner/run-time-planner.ts"],"names":[],"mappings":"AAAA,OAAO,EAAsB,WAAW,EAAE,MAAM,mBAAmB,CAAC;
|
|
1
|
+
{"version":3,"file":"run-time-planner.d.ts","sourceRoot":"","sources":["../../../src/agent/planner/run-time-planner.ts"],"names":[],"mappings":"AAAA,OAAO,EAAsB,WAAW,EAAE,MAAM,mBAAmB,CAAC;AACpE,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAGvC,OAAO,EAAE,WAAW,EAAE,MAAM,aAAa,CAAC;AAG1C,wBAAsB,cAAc,CAAC,EACnC,KAAK,EACL,IAAI,EACJ,iBAAiB,EACjB,KAAK,EACL,WAAW,GACZ,EAAE;IACD,KAAK,CAAC,EAAE,WAAW,CAAC;IACpB,iBAAiB,EAAE,MAAM,EAAE,CAAC;IAC5B,IAAI,EAAE,MAAM,CAAC;IACb,KAAK,EAAE,MAAM,CAAC,MAAM,EAAE,IAAI,CAAC,CAAC;IAC5B,WAAW,EAAE,WAAW,CAAC;CAC1B;;;;GA+FA"}
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
3
|
exports.runtimePlanner = void 0;
|
|
4
4
|
const llm_1 = require("@empiricalrun/llm");
|
|
5
|
-
const promptTemplate_0 = "{{#section \"system\"}}\
|
|
5
|
+
const promptTemplate_0 = "{{#section \"system\"}}\nYou are given a list of successfully executed actions that are done towards completing a task (which\nis also provided to you). Your goal is to analyse the list and determine if the task is completed.\n\nIf the task is not fully completed, identify which specific actions are missing\nand suggest next steps to complete the task. Assume that the conversation provided\nis entirely truthful and no additional actions were performed beyond those listed.\n\nThese actions were executed by AI agents using Playwright on a browser. These agents\nalready have access to browser tabs to execute actions. If there is a pending action,\none of the agents will execute it in the browser. However, they need your help to\nchoose which browser tab (= page) to use for the next action.\n\nTo fulfil your goal, follow these steps:\n- Divide the task into individual actions.\n- Compare each task action against the actions listed in the successfully executed actions list.\n- Identify which actions have been executed and which have not.\n- If all actions are executed, respond with the task as done.\n- If any actions are missing, respond with the task as not done, listing all actions\n and specifying which are complete and which are missing.\n- If provided with list of pages, based on the next pending action and previously executed\n action, identify the page on which next action needs to be taken\n{{/section}}\n\n{{#section \"user\"}}\nTask:\n{{task}}\n\n----\n\nSuccessfully executed actions:\n{{successfulActions}}\n\n----\n\nList of pages with their current URLs:\n{{pagesSummary}}\n\n\n{{/section}}\n";
|
|
6
6
|
const utils_1 = require("../utils");
|
|
7
7
|
async function runtimePlanner({ trace, task, successfulActions, pages, currentPage, }) {
|
|
8
8
|
const runTimePlannerSpan = trace?.span({
|
|
@@ -14,6 +14,12 @@ async function runtimePlanner({ trace, task, successfulActions, pages, currentPa
|
|
|
14
14
|
},
|
|
15
15
|
});
|
|
16
16
|
const llm = new llm_1.LLM({ provider: "openai" });
|
|
17
|
+
// Add the current page to the page state variables
|
|
18
|
+
pages[currentPage.name] = currentPage.pwPageInstance;
|
|
19
|
+
const pagesSummary = pages
|
|
20
|
+
? Object.entries(pages).map(([pageName, page]) => `${pageName}: Currently on ${page.url()}`)
|
|
21
|
+
: [];
|
|
22
|
+
const pageNamesEnum = Object.keys(pages);
|
|
17
23
|
const response = await llm.createChatCompletion({
|
|
18
24
|
trace: runTimePlannerSpan,
|
|
19
25
|
traceName: "runtime-planner-llm",
|
|
@@ -21,7 +27,7 @@ async function runtimePlanner({ trace, task, successfulActions, pages, currentPa
|
|
|
21
27
|
messages: (0, llm_1.compilePrompt)(promptTemplate_0, {
|
|
22
28
|
task,
|
|
23
29
|
successfulActions: successfulActions.join("\n"),
|
|
24
|
-
|
|
30
|
+
pagesSummary: pagesSummary.join("\n"),
|
|
25
31
|
}),
|
|
26
32
|
tools: [
|
|
27
33
|
{
|
|
@@ -44,17 +50,21 @@ async function runtimePlanner({ trace, task, successfulActions, pages, currentPa
|
|
|
44
50
|
type: "string",
|
|
45
51
|
description: "reasoning for identification of task status",
|
|
46
52
|
},
|
|
47
|
-
|
|
48
|
-
type: "
|
|
49
|
-
description: "
|
|
53
|
+
nextAction: {
|
|
54
|
+
type: "string",
|
|
55
|
+
description: "next action to be taken",
|
|
50
56
|
},
|
|
51
57
|
pageName: {
|
|
52
58
|
type: "string",
|
|
53
|
-
enum:
|
|
59
|
+
enum: pageNamesEnum,
|
|
54
60
|
description: "page name for the next action.",
|
|
55
61
|
},
|
|
62
|
+
isDone: {
|
|
63
|
+
type: "boolean",
|
|
64
|
+
description: "whether the task is done",
|
|
65
|
+
},
|
|
56
66
|
},
|
|
57
|
-
required: ["isDone", "reason", "pageName"],
|
|
67
|
+
required: ["isDone", "reason", "pageName", "nextAction"],
|
|
58
68
|
},
|
|
59
69
|
},
|
|
60
70
|
},
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@empiricalrun/test-gen",
|
|
3
|
-
"version": "0.46.8",
|
|
3
|
+
"version": "0.46.10",
|
|
4
4
|
"publishConfig": {
|
|
5
5
|
"registry": "https://registry.npmjs.org/",
|
|
6
6
|
"access": "public"
|
|
@@ -73,9 +73,9 @@
|
|
|
73
73
|
"ts-morph": "^23.0.0",
|
|
74
74
|
"tsx": "^4.16.2",
|
|
75
75
|
"typescript": "^5.3.3",
|
|
76
|
-
"@empiricalrun/llm": "^0.9.35",
|
|
77
76
|
"@empiricalrun/r2-uploader": "^0.3.8",
|
|
78
|
-
"@empiricalrun/reporter": "^0.23.1"
|
|
77
|
+
"@empiricalrun/reporter": "^0.23.1",
|
|
78
|
+
"@empiricalrun/llm": "^0.9.35"
|
|
79
79
|
},
|
|
80
80
|
"devDependencies": {
|
|
81
81
|
"@playwright/test": "1.47.1",
|
|
@@ -1,8 +0,0 @@
|
|
|
1
|
-
import { TraceClient } from "@empiricalrun/llm";
|
|
2
|
-
import { ChatCompletionMessage, ChatCompletionMessageParam, ChatCompletionTool } from "openai/resources/index.mjs";
|
|
3
|
-
export declare function getO1Completion({ messages, tools, trace, }: {
|
|
4
|
-
messages: ChatCompletionMessageParam[];
|
|
5
|
-
tools: ChatCompletionTool[];
|
|
6
|
-
trace?: TraceClient;
|
|
7
|
-
}): Promise<ChatCompletionMessage | undefined>;
|
|
8
|
-
//# sourceMappingURL=o1-completion.d.ts.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"o1-completion.d.ts","sourceRoot":"","sources":["../../../src/agent/browsing/o1-completion.ts"],"names":[],"mappings":"AAAA,OAAO,EAAO,WAAW,EAAE,MAAM,mBAAmB,CAAC;AACrD,OAAO,EACL,qBAAqB,EACrB,0BAA0B,EAC1B,kBAAkB,EACnB,MAAM,4BAA4B,CAAC;AAMpC,wBAAsB,eAAe,CAAC,EACpC,QAAQ,EACR,KAAK,EACL,KAAK,GACN,EAAE;IACD,QAAQ,EAAE,0BAA0B,EAAE,CAAC;IACvC,KAAK,EAAE,kBAAkB,EAAE,CAAC;IAC5B,KAAK,CAAC,EAAE,WAAW,CAAC;CACrB,GAAG,OAAO,CAAC,qBAAqB,GAAG,SAAS,CAAC,CA8D7C"}
|
|
@@ -1,72 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
3
|
-
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
4
|
-
};
|
|
5
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
-
exports.getO1Completion = void 0;
|
|
7
|
-
const llm_1 = require("@empiricalrun/llm");
|
|
8
|
-
const remove_markdown_1 = __importDefault(require("remove-markdown"));
|
|
9
|
-
const constants_1 = require("../../constants");
|
|
10
|
-
const utils_1 = require("../utils");
|
|
11
|
-
async function getO1Completion({ messages, tools, trace, }) {
|
|
12
|
-
let completion;
|
|
13
|
-
try {
|
|
14
|
-
const o1Span = trace?.span({ name: "o1-response-span" });
|
|
15
|
-
const llm = new llm_1.LLM({
|
|
16
|
-
trace: o1Span,
|
|
17
|
-
provider: "openai",
|
|
18
|
-
defaultModel: "o1-mini",
|
|
19
|
-
providerApiKey: constants_1.MODEL_API_KEYS["openai"],
|
|
20
|
-
});
|
|
21
|
-
const [userInstruction] = messages.filter((s) => s.role === "user");
|
|
22
|
-
const [systemInstruction] = messages.filter((s) => s.role === "system");
|
|
23
|
-
userInstruction.content = `${systemInstruction?.content}
|
|
24
|
-
|
|
25
|
-
${userInstruction?.content}
|
|
26
|
-
|
|
27
|
-
You need to respond with one of the following tool call with provided schema:
|
|
28
|
-
${tools.map((tool) => JSON.stringify(tool, null, 2)).join("\n --- \n")}
|
|
29
|
-
|
|
30
|
-
------
|
|
31
|
-
|
|
32
|
-
Before responding, ensure the following:
|
|
33
|
-
- Do not respond with markdown, respond only with the JSON object.
|
|
34
|
-
- Do not respond with any backticks.
|
|
35
|
-
- The reason for action should also include what was been executed in the action.
|
|
36
|
-
`;
|
|
37
|
-
o1Span?.update({ input: [userInstruction] });
|
|
38
|
-
const response = (await llm.createChatCompletion({
|
|
39
|
-
messages: [userInstruction],
|
|
40
|
-
modelParameters: {
|
|
41
|
-
...constants_1.DEFAULT_O1_MODEL_PARAMETERS,
|
|
42
|
-
},
|
|
43
|
-
}));
|
|
44
|
-
o1Span?.end({ output: response });
|
|
45
|
-
const toolResponseStr = (0, remove_markdown_1.default)(response.content);
|
|
46
|
-
const toolRespJSON = (0, utils_1.parseJson)(toolResponseStr);
|
|
47
|
-
const parameters = toolRespJSON.function.parameters || toolRespJSON.function.arguments;
|
|
48
|
-
if (!parameters) {
|
|
49
|
-
throw new Error("No parameters found in tool response");
|
|
50
|
-
}
|
|
51
|
-
const tool = {
|
|
52
|
-
type: "function",
|
|
53
|
-
function: {
|
|
54
|
-
name: toolRespJSON.function.name,
|
|
55
|
-
arguments: typeof parameters === "string"
|
|
56
|
-
? parameters
|
|
57
|
-
: JSON.stringify(parameters, null, 2),
|
|
58
|
-
},
|
|
59
|
-
};
|
|
60
|
-
completion = {
|
|
61
|
-
role: "assistant",
|
|
62
|
-
content: response.content,
|
|
63
|
-
tool_calls: [tool],
|
|
64
|
-
};
|
|
65
|
-
}
|
|
66
|
-
catch (e) {
|
|
67
|
-
console.error("O1 response error", e);
|
|
68
|
-
return undefined;
|
|
69
|
-
}
|
|
70
|
-
return completion;
|
|
71
|
-
}
|
|
72
|
-
exports.getO1Completion = getO1Completion;
|