@empiricalrun/test-gen 0.42.17 → 0.42.19
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +17 -0
- package/dist/agent/browsing/index.d.ts +4 -6
- package/dist/agent/browsing/index.d.ts.map +1 -1
- package/dist/agent/browsing/index.js +49 -127
- package/dist/agent/browsing/utils.d.ts +0 -7
- package/dist/agent/browsing/utils.d.ts.map +1 -1
- package/dist/agent/browsing/utils.js +1 -13
- package/dist/agent/codegen/create-test-block.d.ts.map +1 -1
- package/dist/agent/codegen/create-test-block.js +2 -2
- package/dist/agent/codegen/generate-code-apply-changes.d.ts.map +1 -1
- package/dist/agent/codegen/generate-code-apply-changes.js +77 -117
- package/dist/agent/codegen/lexical-scoped-vars.d.ts.map +1 -1
- package/dist/agent/codegen/lexical-scoped-vars.js +2 -2
- package/dist/agent/master/element-annotation.d.ts.map +1 -1
- package/dist/agent/master/element-annotation.js +7 -53
- package/dist/agent/master/run.d.ts.map +1 -1
- package/dist/agent/master/run.js +6 -7
- package/dist/prompts/lib/index.d.ts +8 -0
- package/dist/prompts/lib/index.d.ts.map +1 -0
- package/dist/prompts/lib/index.js +118 -0
- package/package.json +2 -2
- package/dist/agent/codegen/promptBuilder.d.ts +0 -3
- package/dist/agent/codegen/promptBuilder.d.ts.map +0 -1
- package/dist/agent/codegen/promptBuilder.js +0 -44
- package/dist/agent/verification/index.d.ts +0 -13
- package/dist/agent/verification/index.d.ts.map +0 -1
- package/dist/agent/verification/index.js +0 -84
- package/dist/evals/verification-agent.evals.d.ts +0 -4
- package/dist/evals/verification-agent.evals.d.ts.map +0 -1
- package/dist/evals/verification-agent.evals.js +0 -23
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,22 @@
|
|
|
1
1
|
# @empiricalrun/test-gen
|
|
2
2
|
|
|
3
|
+
## 0.42.19
|
|
4
|
+
|
|
5
|
+
### Patch Changes
|
|
6
|
+
|
|
7
|
+
- c36efe4: chore: remove any type for get-next-action output
|
|
8
|
+
- ebb0bfa: feat: support images in handlebar prompts
|
|
9
|
+
- 63ed479: fix: remove verification and looping inside browsing agent
|
|
10
|
+
|
|
11
|
+
## 0.42.18
|
|
12
|
+
|
|
13
|
+
### Patch Changes
|
|
14
|
+
|
|
15
|
+
- 6f876ea: fix: remove code generation feedback loop from repo edit
|
|
16
|
+
- 658451e: fix: tracing for vitests
|
|
17
|
+
- Updated dependencies [658451e]
|
|
18
|
+
- @empiricalrun/llm@0.9.32
|
|
19
|
+
|
|
3
20
|
## 0.42.17
|
|
4
21
|
|
|
5
22
|
### Patch Changes
|
|
@@ -1,20 +1,18 @@
|
|
|
1
1
|
import { LLM, TraceClient } from "@empiricalrun/llm";
|
|
2
2
|
import { Page } from "playwright";
|
|
3
3
|
import { PlaywrightActions } from "../../actions";
|
|
4
|
-
import { CustomLogger } from "../../bin/logger";
|
|
5
4
|
import { TestGenConfigOptions } from "../../types";
|
|
6
5
|
export type BrowsingAgentOptions = Partial<TestGenConfigOptions> & {
|
|
7
6
|
htmlSanitize?: {
|
|
8
7
|
disallowedStrings?: string[];
|
|
9
8
|
};
|
|
10
9
|
};
|
|
11
|
-
export declare function executeTaskUsingBrowsingAgent({
|
|
10
|
+
export declare function executeTaskUsingBrowsingAgent({ action, page, actions, llm, options, trace, }: {
|
|
12
11
|
action: string;
|
|
13
|
-
trace?: TraceClient;
|
|
14
|
-
logger: CustomLogger;
|
|
15
12
|
page: Page;
|
|
16
|
-
options: BrowsingAgentOptions;
|
|
17
|
-
llm: LLM;
|
|
18
13
|
actions: PlaywrightActions;
|
|
14
|
+
llm: LLM;
|
|
15
|
+
trace?: TraceClient;
|
|
16
|
+
options: BrowsingAgentOptions;
|
|
19
17
|
}): Promise<string[] | undefined>;
|
|
20
18
|
//# sourceMappingURL=index.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/agent/browsing/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,GAAG,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AACrD,OAAO,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAElC,OAAO,EAAE,iBAAiB,EAAE,MAAM,eAAe,CAAC;
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/agent/browsing/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,GAAG,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AACrD,OAAO,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAElC,OAAO,EAAE,iBAAiB,EAAE,MAAM,eAAe,CAAC;AAKlD,OAAO,EAAE,oBAAoB,EAAE,MAAM,aAAa,CAAC;AAKnD,MAAM,MAAM,oBAAoB,GAAG,OAAO,CAAC,oBAAoB,CAAC,GAAG;IACjE,YAAY,CAAC,EAAE;QACb,iBAAiB,CAAC,EAAE,MAAM,EAAE,CAAC;KAC9B,CAAC;CACH,CAAC;AAEF,wBAAsB,6BAA6B,CAAC,EAClD,MAAM,EACN,IAAI,EACJ,OAAO,EACP,GAAG,EACH,OAAO,EACP,KAAK,GACN,EAAE;IACD,MAAM,EAAE,MAAM,CAAC;IACf,IAAI,EAAE,IAAI,CAAC;IACX,OAAO,EAAE,iBAAiB,CAAC;IAC3B,GAAG,EAAE,GAAG,CAAC;IACT,KAAK,CAAC,EAAE,WAAW,CAAC;IACpB,OAAO,EAAE,oBAAoB,CAAC;CAC/B,GAAG,OAAO,CAAC,MAAM,EAAE,GAAG,SAAS,CAAC,CAiEhC"}
|
|
@@ -2,148 +2,70 @@
|
|
|
2
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
3
|
exports.executeTaskUsingBrowsingAgent = void 0;
|
|
4
4
|
const constants_1 = require("../../constants");
|
|
5
|
+
const promptTemplate_0 = "{{#section \"system\"}}\nYou are a browser automation agent who is given a task to generate code for navigation and assertion. This task is your\ngoal and you must achieve it.\n\nYou will be provided with already executed actions and basis that you need to pick the next step to achieve the task.\nRemember that the goal must be achieved.\n\nYou will be provided with the web page snapshot in the form of Document Object Model. Based on the goal and available\ntool calls you need to pick the appropriate tool call.\n\nInstructions:\n- Take actions one at a time. Do not try to take multiple actions\n- You can respond with multiple assertions in one shot\n- Do not repeat the same actions again otherwise your response will be marked INVALID\n- Avoid repeating errors which we got while executing the last action\n- Stick to the task provided to you and mark the task done once the task is complete\n- Do not execute any action which is not mentioned in the task\n- Do not repeat actions which are already executed more than twice otherwise your response will be marked INVALID\n- Always refer to \"Executed actions\" before deciding your next action for completion of the task.\n- End the task done if all actions required for task are executed\n{{/section}}\n\n{{#section \"user\"}}\nTask:\n{{task}}\n\nCurrent page snapshot:\n{{pageSnapshot}}\n{{/section}}";
|
|
6
|
+
const lib_1 = require("../../prompts/lib");
|
|
5
7
|
const reporter_1 = require("../../reporter");
|
|
6
|
-
const session_1 = require("../../session");
|
|
7
8
|
const html_1 = require("../../utils/html");
|
|
8
9
|
const utils_1 = require("../utils");
|
|
9
|
-
const verification_1 = require("../verification");
|
|
10
10
|
const o1_completion_1 = require("./o1-completion");
|
|
11
|
-
|
|
12
|
-
async function executeTaskUsingBrowsingAgent({ trace, action, logger, page, options, llm, actions, }) {
|
|
13
|
-
let isTaskDone = false;
|
|
14
|
-
const executedActions = [];
|
|
15
|
-
let lastActionExecTrace = "";
|
|
11
|
+
async function executeTaskUsingBrowsingAgent({ action, page, actions, llm, options, trace, }) {
|
|
16
12
|
let generatedCodeSteps = [];
|
|
17
13
|
const tools = actions.getBrowsingActionSchemas();
|
|
18
14
|
const testgenUpdatesReporter = new reporter_1.TestGenUpdatesReporter();
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
task: action,
|
|
48
|
-
conversation: ["Successfully executed actions", ...successfulActions],
|
|
49
|
-
});
|
|
50
|
-
isTaskDone = verificationAgentResp.isDone;
|
|
51
|
-
logger.log(`isTaskDone: ${isTaskDone}`);
|
|
52
|
-
logger.log(`reason: ${verificationAgentResp.reason}`);
|
|
53
|
-
if (isTaskDone) {
|
|
54
|
-
browsingAgentSpan?.event({ name: "task-done" });
|
|
55
|
-
browsingAgentSpan?.end({
|
|
56
|
-
output: {
|
|
57
|
-
taskDone: true,
|
|
58
|
-
reason: verificationAgentResp.reason,
|
|
59
|
-
code: generatedCodeSteps,
|
|
60
|
-
},
|
|
61
|
-
});
|
|
62
|
-
break;
|
|
63
|
-
}
|
|
64
|
-
}
|
|
65
|
-
const messages = await (0, utils_2.getPromptForNextAction)({
|
|
66
|
-
pageSnapshot,
|
|
67
|
-
previousActions: successfulActions,
|
|
68
|
-
task: action,
|
|
69
|
-
lastActionErrors: lastActionExecTrace ? [lastActionExecTrace] : [],
|
|
70
|
-
promptType: "browsing-agent-as-tool",
|
|
71
|
-
});
|
|
72
|
-
promptSpan?.end({ output: { messages } });
|
|
73
|
-
let completion;
|
|
74
|
-
completion = await (0, o1_completion_1.getO1Completion)({
|
|
75
|
-
//@ts-ignore
|
|
15
|
+
const browsingAgentSpan = trace?.span({
|
|
16
|
+
name: `browsing-agent`,
|
|
17
|
+
input: {
|
|
18
|
+
action,
|
|
19
|
+
},
|
|
20
|
+
});
|
|
21
|
+
const pageContentSpan = browsingAgentSpan?.span({
|
|
22
|
+
name: "page-content",
|
|
23
|
+
});
|
|
24
|
+
const pageContent = await page.content();
|
|
25
|
+
pageContentSpan?.end({ output: { pageContent } });
|
|
26
|
+
const sanitizationSpan = browsingAgentSpan?.span({
|
|
27
|
+
name: "page-sanitization",
|
|
28
|
+
});
|
|
29
|
+
const pageSnapshot = (0, html_1.sanitizeHtml)(pageContent, options.htmlSanitize);
|
|
30
|
+
sanitizationSpan?.end({ output: { pageSnapshot } });
|
|
31
|
+
const promptSpan = browsingAgentSpan?.span({ name: "page-prompt" });
|
|
32
|
+
const messages = await (0, lib_1.compilePrompt)(promptTemplate_0, { pageSnapshot, task: action });
|
|
33
|
+
promptSpan?.end({ output: { messages } });
|
|
34
|
+
let completion;
|
|
35
|
+
completion = await (0, o1_completion_1.getO1Completion)({
|
|
36
|
+
messages,
|
|
37
|
+
tools,
|
|
38
|
+
trace: browsingAgentSpan,
|
|
39
|
+
});
|
|
40
|
+
// If O1 completion fails due to any reason, resort to old flow
|
|
41
|
+
if (!completion) {
|
|
42
|
+
completion = await llm.createChatCompletion({
|
|
76
43
|
messages,
|
|
77
44
|
tools,
|
|
78
45
|
trace: browsingAgentSpan,
|
|
46
|
+
model: options.model || constants_1.DEFAULT_MODEL,
|
|
47
|
+
modelParameters: {
|
|
48
|
+
...constants_1.DEFAULT_MODEL_PARAMETERS,
|
|
49
|
+
...options.modelParameters,
|
|
50
|
+
tool_choice: "required",
|
|
51
|
+
},
|
|
79
52
|
});
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
...options.modelParameters,
|
|
90
|
-
tool_choice: "required",
|
|
91
|
-
},
|
|
92
|
-
});
|
|
93
|
-
}
|
|
94
|
-
const toolCalls = completion?.tool_calls || [];
|
|
95
|
-
// LLM might respond with empty tool_calls and we can go into endless loop
|
|
96
|
-
// if we donot record this action and mark it as error
|
|
97
|
-
if (!toolCalls.length) {
|
|
98
|
-
executedActions.push({
|
|
99
|
-
isError: true,
|
|
100
|
-
action: "",
|
|
101
|
-
});
|
|
102
|
-
}
|
|
103
|
-
const toolCallsSpan = browsingAgentSpan?.span({ name: "tool-calls" });
|
|
104
|
-
for (const i in toolCalls) {
|
|
105
|
-
const toolCall = toolCalls[i];
|
|
106
|
-
if (await (0, session_1.shouldStopSession)()) {
|
|
107
|
-
break;
|
|
108
|
-
}
|
|
109
|
-
try {
|
|
110
|
-
const code = await actions.executeAction(toolCall.function.name, (0, utils_1.parseJson)(toolCall.function.arguments), toolCallsSpan);
|
|
111
|
-
if (code) {
|
|
112
|
-
generatedCodeSteps.push(code);
|
|
113
|
-
}
|
|
114
|
-
executedActions.push({
|
|
115
|
-
isError: false,
|
|
116
|
-
action: (0, utils_1.parseJson)(toolCall.function.arguments)?.reason,
|
|
117
|
-
});
|
|
118
|
-
lastActionExecTrace = "";
|
|
119
|
-
}
|
|
120
|
-
catch (e) {
|
|
121
|
-
// TODO: implement feedback loop to llm
|
|
122
|
-
executedActions.push({
|
|
123
|
-
isError: true,
|
|
124
|
-
action: (0, utils_1.parseJson)(toolCall.function.arguments)?.reason,
|
|
125
|
-
});
|
|
126
|
-
lastActionExecTrace = e.message;
|
|
127
|
-
void testgenUpdatesReporter.sendMessage(e.message);
|
|
128
|
-
logger.error(lastActionExecTrace, e);
|
|
53
|
+
}
|
|
54
|
+
const toolCalls = completion?.tool_calls || [];
|
|
55
|
+
const toolCallsSpan = browsingAgentSpan?.span({ name: "tool-calls" });
|
|
56
|
+
for (const i in toolCalls) {
|
|
57
|
+
const toolCall = toolCalls[i];
|
|
58
|
+
try {
|
|
59
|
+
const code = await actions.executeAction(toolCall.function.name, (0, utils_1.parseJson)(toolCall.function.arguments), toolCallsSpan);
|
|
60
|
+
if (code) {
|
|
61
|
+
generatedCodeSteps.push(code);
|
|
129
62
|
}
|
|
130
63
|
}
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
if (executedActions.length >= 3) {
|
|
134
|
-
const lastThreeActions = executedActions.slice(-3);
|
|
135
|
-
const lastThreeActionsFailed = lastThreeActions.every((a) => a.isError);
|
|
136
|
-
// get last 3 lines of code
|
|
137
|
-
const isStuckInLoop = actions.isStuckInLoop();
|
|
138
|
-
if (lastThreeActionsFailed || isStuckInLoop) {
|
|
139
|
-
// TODO: this should be sent to dashboard
|
|
140
|
-
const error = "Agent is not able to figure out next browser action, ending retries";
|
|
141
|
-
logger.error(error);
|
|
142
|
-
await testgenUpdatesReporter.sendMessage(error);
|
|
143
|
-
throw Error(error);
|
|
144
|
-
}
|
|
64
|
+
catch (e) {
|
|
65
|
+
void testgenUpdatesReporter.sendMessage(e.message);
|
|
145
66
|
}
|
|
146
67
|
}
|
|
68
|
+
toolCallsSpan?.end({ output: { toolCalls } });
|
|
147
69
|
return generatedCodeSteps;
|
|
148
70
|
}
|
|
149
71
|
exports.executeTaskUsingBrowsingAgent = executeTaskUsingBrowsingAgent;
|
|
@@ -29,13 +29,6 @@ export declare function readPlaywrightConfig(): Promise<PlaywrightTestConfig>;
|
|
|
29
29
|
* @returns
|
|
30
30
|
*/
|
|
31
31
|
export declare function detectProjectName(testFilePath: string, playwrightConfig: PlaywrightTestConfig, pwProjectsFilter?: string[]): Promise<string>;
|
|
32
|
-
export declare function getPromptForNextAction({ pageSnapshot, task, previousActions, lastActionErrors, promptType, }: {
|
|
33
|
-
pageSnapshot: string;
|
|
34
|
-
task: string;
|
|
35
|
-
previousActions: string[];
|
|
36
|
-
lastActionErrors: string[];
|
|
37
|
-
promptType?: string;
|
|
38
|
-
}): Promise<import("openai/resources/index.mjs").ChatCompletionMessageParam[]>;
|
|
39
32
|
export declare class TeardownManager {
|
|
40
33
|
private directory;
|
|
41
34
|
constructor(directory: string);
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"utils.d.ts","sourceRoot":"","sources":["../../../src/agent/browsing/utils.ts"],"names":[],"mappings":"AAAA,OAAO,
|
|
1
|
+
{"version":3,"file":"utils.d.ts","sourceRoot":"","sources":["../../../src/agent/browsing/utils.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAIhD,OAAO,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAClC,OAAO,EAAE,oBAAoB,EAAE,MAAM,iBAAiB,CAAC;AAiBvD,OAAO,EAAe,aAAa,EAAE,MAAM,aAAa,CAAC;AAMzD,wBAAgB,QAAQ,CAAC,GAAG,EAAE,GAAG,GAAG,GAAG,IAAI,MAAM,CAKhD;AAED,wBAAgB,wBAAwB,CAAC,KAAK,EAAE,MAAM,EAAE,UAIvD;AA8FD;;;;GAIG;AACH,wBAAsB,yBAAyB,CAC7C,SAAS,EAAE,aAAa,EACxB,KAAK,CAAC,EAAE,WAAW,GAClB,OAAO,CAAC,MAAM,CAAC,CA0DjB;AAyBD,wBAAsB,wBAAwB,CAAC,IAAI,EAAE,IAAI,iBAuHxD;AAED;;;;GAIG;AACH,wBAAgB,iBAAiB,CAAC,QAAQ,EAAE,MAAM,QAIjD;AAED;;;GAGG;AACH,wBAAsB,oBAAoB,IAAI,OAAO,CAAC,oBAAoB,CAAC,CAM1E;AAWD;;;;;GAKG;AACH,wBAAsB,iBAAiB,CACrC,YAAY,EAAE,MAAM,EACpB,gBAAgB,EAAE,oBAAoB,EACtC,gBAAgB,GAAE,MAAM,EAAU,GACjC,OAAO,CAAC,MAAM,CAAC,CA+CjB;AAED,qBAAa,eAAe;IACd,OAAO,CAAC,SAAS;gBAAT,SAAS,EAAE,MAAM;IACrC,OAAO,CAAC,aAAa,CAAqB;YAE5B,mBAAmB;YAUnB,gBAAgB;IAsBjB,OAAO;IAuBb,SAAS;CAKjB"}
|
|
@@ -3,8 +3,7 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
|
3
3
|
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
4
4
|
};
|
|
5
5
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
-
exports.TeardownManager = exports.
|
|
7
|
-
const llm_1 = require("@empiricalrun/llm");
|
|
6
|
+
exports.TeardownManager = exports.detectProjectName = exports.readPlaywrightConfig = exports.canRunMasterAgent = exports.injectPwLocatorGenerator = exports.prepareFileForMasterAgent = exports.prepareBrowsingAgentTask = exports.isRegExp = void 0;
|
|
8
7
|
const fs_extra_1 = __importDefault(require("fs-extra"));
|
|
9
8
|
const minimatch_1 = require("minimatch");
|
|
10
9
|
const path_1 = __importDefault(require("path"));
|
|
@@ -348,17 +347,6 @@ async function detectProjectName(testFilePath, playwrightConfig, pwProjectsFilte
|
|
|
348
347
|
return filteredProjectNames[0];
|
|
349
348
|
}
|
|
350
349
|
exports.detectProjectName = detectProjectName;
|
|
351
|
-
async function getPromptForNextAction({ pageSnapshot = "", task = "", previousActions = [], lastActionErrors = [], promptType = "browsing-agent-next-action", }) {
|
|
352
|
-
const previousActionsStr = previousActions.join("\n\n ---- \n\n");
|
|
353
|
-
const prompt = await (0, llm_1.getPrompt)(promptType, {
|
|
354
|
-
pageSnapshot,
|
|
355
|
-
previousActionsStr,
|
|
356
|
-
task,
|
|
357
|
-
lastActionErrors,
|
|
358
|
-
});
|
|
359
|
-
return prompt;
|
|
360
|
-
}
|
|
361
|
-
exports.getPromptForNextAction = getPromptForNextAction;
|
|
362
350
|
class TeardownManager {
|
|
363
351
|
directory;
|
|
364
352
|
constructor(directory) {
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"create-test-block.d.ts","sourceRoot":"","sources":["../../../src/agent/codegen/create-test-block.ts"],"names":[],"mappings":"AAAA,OAAO,EAAyB,WAAW,EAAE,MAAM,mBAAmB,CAAC;
|
|
1
|
+
{"version":3,"file":"create-test-block.d.ts","sourceRoot":"","sources":["../../../src/agent/codegen/create-test-block.ts"],"names":[],"mappings":"AAAA,OAAO,EAAyB,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAcvE,OAAO,EAAE,QAAQ,EAAE,oBAAoB,EAAE,MAAM,aAAa,CAAC;AAE7D,wBAAsB,wBAAwB,CAAC,EAC7C,QAAQ,EACR,IAAI,EACJ,OAAO,EACP,KAAK,GACN,EAAE;IACD,QAAQ,EAAE,QAAQ,CAAC;IACnB,IAAI,EAAE,MAAM,CAAC;IACb,OAAO,CAAC,EAAE,oBAAoB,CAAC;IAC/B,KAAK,CAAC,EAAE,WAAW,CAAC;CACrB,+BAqDA"}
|
|
@@ -7,8 +7,8 @@ const context_1 = require("../../bin/utils/context");
|
|
|
7
7
|
const web_1 = require("../../bin/utils/platform/web");
|
|
8
8
|
const constants_1 = require("../../constants");
|
|
9
9
|
const promptTemplate_0 = "{{#section \"system\"}}\nYou are a software test engineer who is given a task to write an empty test block.\nBased on the inputs you need to create an empty playwright test block with correctly imported fixture.\n\nThe test will contain a test name which you will need to use to build the empty test case block.\n\nYou will be provided with current tests, fixtures and page object models for you to use and create test case block as\nper the task provided to you.\n\nBefore responding you need to ensure that the code change is minimal and the change is reusable across tests. You need\nto ensure the code follows DRY principle.\n\nHere is the list of current tests and fixtures:\n\n{{testFiles}}\n\nHere is the list of current page object models:\n\n{{pageFiles}}\n{{/section}}\n\n{{#section \"user\"}}\nFollowing is the test scenario for which you need to write the empty test case block:\ntest name:\n{{scenarioName}}\n\ntask:\ncreate an empty test case block for the following test steps:\n{{scenario}}\n\ntest file path: {{scenarioFile}}\n\n------\n\nYou also need to ensure that the empty test case block has a starting page to begin test.\n\nIn order to identify the right page with which the test should start, follow the steps:\n- based on the similarities with other test cases mentioned in the file, identify the right page fixture to be imported\n- Read the page fixture methods step by step. Identify whether the fixture handles navigating to a page.\n- Identify whether other tests using the page fixture had to add separate steps for navigation or not\n- Based on the above analysis there will be following cases and choose either for the given test scenario:\n-- Case 1: if the test case scenario provided inside the task mentions about page navigation, then use that page\nnavigation. 
skip other cases if this case is satisfied.\n-- Case 2: refer other test cases which import similar fixtures and infer the first page navigation of this test case.\nYou should prefer tests which are in the same file. Tests within same file have higher overlaps in first page\nnavigation.\n- Once the page fixture is decided, look for userContext fixture in files. If its available then add \"userContext\" to\nthe test case block\n\n\n\nFollow these instructions before responding with output:\n- Read the code line by line and achieve the task provided to you\n- Read the dependencies of the code block by scanning through file paths and file provided to you. refer the same file\npath while responding with update\n- Focus only on the test case provided and associated JS methods called from the test case.\n- Respond only with the new empty test case block to be created and nothing else.\n- DO NOT respond with any backticks or markdown syntax\n- If \"userContext\" fixture is available in fixtures file, ensure importing that fixture in the test case block.\n- Provide a reason based on the test steps provided to you on why you chose the fixture or page.goto statement. The\nreason should be one of the list steps provided to you and mention why the case was chosen\n{{/section}}";
|
|
10
|
+
const lib_1 = require("../../prompts/lib");
|
|
10
11
|
const session_1 = require("../../session");
|
|
11
|
-
const promptBuilder_1 = require("./promptBuilder");
|
|
12
12
|
async function createEmptyTestCaseBlock({ testCase, file, options, trace, }) {
|
|
13
13
|
const logger = new logger_1.CustomLogger({ useReporter: false });
|
|
14
14
|
logger.log("Creating new test block");
|
|
@@ -29,7 +29,7 @@ async function createEmptyTestCaseBlock({ testCase, file, options, trace, }) {
|
|
|
29
29
|
const promptSpan = trace?.span({
|
|
30
30
|
name: "build-create-empty-test-case-prompt",
|
|
31
31
|
});
|
|
32
|
-
const prompt = await (0,
|
|
32
|
+
const prompt = await (0, lib_1.compilePrompt)(promptTemplate_0, {
|
|
33
33
|
testFiles: context.codePrompt,
|
|
34
34
|
pageFiles: context.pomPrompt,
|
|
35
35
|
scenarioName: testCase.name,
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"generate-code-apply-changes.d.ts","sourceRoot":"","sources":["../../../src/agent/codegen/generate-code-apply-changes.ts"],"names":[],"mappings":"AAAA,OAAO,EAAuB,WAAW,EAAE,MAAM,mBAAmB,CAAC;
|
|
1
|
+
{"version":3,"file":"generate-code-apply-changes.d.ts","sourceRoot":"","sources":["../../../src/agent/codegen/generate-code-apply-changes.ts"],"names":[],"mappings":"AAAA,OAAO,EAAuB,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAQrE,OAAO,EAAE,YAAY,EAAE,MAAM,kBAAkB,CAAC;AAOhD,OAAO,EAAsB,UAAU,EAAE,MAAM,SAAS,CAAC;AAqLzD,wBAAgB,8BAA8B,CAAC,KAAK,EAAE,MAAM,UAkD3D;AA2DD,wBAAsB,2BAA2B,CAAC,EAChD,IAAI,EACJ,KAAK,EACL,MAAM,EACN,gBAAgB,GACjB,EAAE;IACD,IAAI,EAAE,MAAM,CAAC;IACb,KAAK,CAAC,EAAE,WAAW,CAAC;IACpB,MAAM,CAAC,EAAE,YAAY,CAAC;IACtB,gBAAgB,EAAE,MAAM,OAAO,CAAC;QAAE,MAAM,EAAE,MAAM,GAAG,SAAS,CAAA;KAAE,CAAC,CAAC;CACjE,GAAG,OAAO,CAAC,UAAU,EAAE,CAAC,CAuIxB"}
|
|
@@ -264,12 +264,11 @@ In order to execute the task, FOLLOW BELOW STEPS:
|
|
|
264
264
|
}
|
|
265
265
|
function deDupUpdatedFiles(updatedFiles) {
|
|
266
266
|
return updatedFiles.filter((change, index, self) => index ===
|
|
267
|
-
self.findIndex((existing) => existing.filePath === change.filePath
|
|
268
|
-
existing.oldCode === change.oldCode &&
|
|
269
|
-
existing.newCode === change.newCode));
|
|
267
|
+
self.findIndex((existing) => existing.filePath === change.filePath));
|
|
270
268
|
}
|
|
271
269
|
async function generateCodeAndApplyChanges({ task, trace, logger, getRelevantFiles, }) {
|
|
272
|
-
|
|
270
|
+
// Reducing this from 5 to 3, if the required changes are getting missed will change it back.
|
|
271
|
+
let planRetries = 3;
|
|
273
272
|
let updatedFiles = [];
|
|
274
273
|
while (planRetries--) {
|
|
275
274
|
const generateCodeAndApplyChangesSpan = trace?.span({
|
|
@@ -300,120 +299,81 @@ async function generateCodeAndApplyChanges({ task, trace, logger, getRelevantFil
|
|
|
300
299
|
await (0, llm_1.flushAllTraces)();
|
|
301
300
|
return deDupUpdatedFiles(updatedFiles);
|
|
302
301
|
}
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
oldCode: "",
|
|
353
|
-
newCode: args.code,
|
|
354
|
-
reason: args.reason,
|
|
355
|
-
});
|
|
356
|
-
await fs_extra_1.default.mkdir((0, path_1.dirname)(args.filePath), { recursive: true });
|
|
357
|
-
await fs_extra_1.default.writeFile(args.filePath, args.code, "utf-8");
|
|
358
|
-
console.log(`Created file: ${args.filePath}`);
|
|
359
|
-
})();
|
|
360
|
-
}));
|
|
361
|
-
const strReplaceToolCalls = completion.tool_calls.filter((tc) => tc.function.name === types_1.CodeEditorToolCall.STR_REPLACE);
|
|
362
|
-
if (strReplaceToolCalls.length > 0) {
|
|
363
|
-
console.log(`str_replace tool calls: `, strReplaceToolCalls);
|
|
364
|
-
}
|
|
365
|
-
// Filter out the tool calls which are for replacing code in existing files
|
|
366
|
-
const fileChanges = strReplaceToolCalls
|
|
367
|
-
.map((toolCall) => (0, utils_1.parseJson)(toolCall.function.arguments))
|
|
368
|
-
.filter((f) => f.filePath && fs_extra_1.default.existsSync(f.filePath));
|
|
369
|
-
updatedFiles.push(...fileChanges);
|
|
370
|
-
let failedCodeUpdates;
|
|
371
|
-
// applyChangesResponse contains the errors occurred while applying the changes
|
|
372
|
-
failedCodeUpdates = await (0, utils_2.applyFileChangesUsingStrReplace)({
|
|
373
|
-
trace: codeEditorSpan,
|
|
374
|
-
fileChanges,
|
|
375
|
-
logger,
|
|
376
|
-
});
|
|
377
|
-
// Filter out the responses having errors
|
|
378
|
-
failedCodeUpdates = failedCodeUpdates.filter((f) => f?.error);
|
|
379
|
-
// Filter out the tool calls which have errors
|
|
380
|
-
const toolCallsWithErrors = codeEditorToolCalls.filter((toolCall) => {
|
|
381
|
-
const args = (0, utils_1.parseJson)(toolCall.function.arguments);
|
|
382
|
-
return failedCodeUpdates.find((response) => response.filePath === args.filePath);
|
|
383
|
-
});
|
|
384
|
-
if (failedCodeUpdates.length === 0) {
|
|
385
|
-
break;
|
|
386
|
-
}
|
|
387
|
-
logger?.log(`Failed to apply changes, retrying...`, failedCodeUpdates);
|
|
388
|
-
const feedback = failedCodeUpdates
|
|
389
|
-
.map((updates) => `For file ${updates.filePath}: ${updates.errorMessage}`)
|
|
390
|
-
.join("\n");
|
|
391
|
-
promptForStrReplace.push({
|
|
392
|
-
role: "assistant",
|
|
393
|
-
tool_calls: toolCallsWithErrors,
|
|
394
|
-
});
|
|
395
|
-
toolCallsWithErrors.forEach((toolCall) => {
|
|
396
|
-
promptForStrReplace.push({
|
|
397
|
-
role: "tool",
|
|
398
|
-
tool_call_id: toolCall.id,
|
|
399
|
-
content: `
|
|
400
|
-
Errors while executing the changes provided in above tool call:
|
|
401
|
-
${feedback}
|
|
402
|
-
|
|
403
|
-
Please fix the errors and return the updated code.
|
|
404
|
-
|
|
405
|
-
FOLLOW BELOW STEPS TO FIX THE ISSUES:
|
|
406
|
-
- First read the error message and understand the issue.
|
|
407
|
-
- Go through the new code block and current file code, to figure out the root cause of the issue.
|
|
408
|
-
- Compile the steps that you need to follow to fix the issue.
|
|
409
|
-
- Check the test names, to ensure that the changes are applied to the correct test.
|
|
410
|
-
- Use separate 'str_replace' tool to make the changes for each update.
|
|
411
|
-
- Return the updated code in the same format as provided in the tool call.
|
|
412
|
-
|
|
413
|
-
NOTE: ONLY MAKE THE CHANGES TO FIX THE ISSUES MENTIONED IN THE ERROR MESSAGE AND NOTHING ELSE. NO EXTRA CODE REFACTORING IS REQUIRED.
|
|
414
|
-
`,
|
|
302
|
+
const promptForStrReplace = [
|
|
303
|
+
{
|
|
304
|
+
role: "system",
|
|
305
|
+
content: systemPromptBuilderForRepoEdit(files),
|
|
306
|
+
},
|
|
307
|
+
{
|
|
308
|
+
role: "user",
|
|
309
|
+
content: userPromptBuilderForStrReplace(strReplacePlan),
|
|
310
|
+
},
|
|
311
|
+
];
|
|
312
|
+
const codeEditorSpan = generateCodeAndApplyChangesSpan?.span({
|
|
313
|
+
name: "code-editor-agent",
|
|
314
|
+
input: {
|
|
315
|
+
prompt: promptForStrReplace,
|
|
316
|
+
},
|
|
317
|
+
});
|
|
318
|
+
const llm = new llm_1.LLM({
|
|
319
|
+
trace: codeEditorSpan,
|
|
320
|
+
provider: "anthropic",
|
|
321
|
+
defaultModel: "claude-3-5-sonnet-20240620",
|
|
322
|
+
providerApiKey: constants_1.MODEL_API_KEYS["anthropic"],
|
|
323
|
+
});
|
|
324
|
+
const completion = await llm.createChatCompletion({
|
|
325
|
+
messages: promptForStrReplace,
|
|
326
|
+
modelParameters: {
|
|
327
|
+
...constants_1.DEFAULT_MODEL_PARAMETERS,
|
|
328
|
+
temperature: 0.1,
|
|
329
|
+
tool_choice: "required",
|
|
330
|
+
},
|
|
331
|
+
trace: codeEditorSpan,
|
|
332
|
+
tools: getCodeEditorToolCalls(),
|
|
333
|
+
});
|
|
334
|
+
codeEditorSpan?.end({ output: { completion } });
|
|
335
|
+
if (!completion?.tool_calls || completion?.tool_calls?.length === 0) {
|
|
336
|
+
continue;
|
|
337
|
+
}
|
|
338
|
+
// Filter out the tool calls which are for creating new files
|
|
339
|
+
const createFileToolCalls = completion.tool_calls.filter((tc) => tc.function.name === types_1.CodeEditorToolCall.CREATE_FILE);
|
|
340
|
+
if (createFileToolCalls.length > 0) {
|
|
341
|
+
console.log(`create_file tool calls: `, createFileToolCalls);
|
|
342
|
+
}
|
|
343
|
+
await Promise.all(createFileToolCalls.map((tc) => {
|
|
344
|
+
return (async () => {
|
|
345
|
+
const args = (0, utils_1.parseJson)(tc.function.arguments);
|
|
346
|
+
updatedFiles.push({
|
|
347
|
+
filePath: args.filePath,
|
|
348
|
+
oldCode: "",
|
|
349
|
+
newCode: args.code,
|
|
350
|
+
reason: args.reason,
|
|
415
351
|
});
|
|
416
|
-
|
|
352
|
+
await fs_extra_1.default.mkdir((0, path_1.dirname)(args.filePath), { recursive: true });
|
|
353
|
+
await fs_extra_1.default.writeFile(args.filePath, args.code, "utf-8");
|
|
354
|
+
console.log(`Created file: ${args.filePath}`);
|
|
355
|
+
})();
|
|
356
|
+
}));
|
|
357
|
+
const strReplaceToolCalls = completion.tool_calls.filter((tc) => tc.function.name === types_1.CodeEditorToolCall.STR_REPLACE);
|
|
358
|
+
if (strReplaceToolCalls.length > 0) {
|
|
359
|
+
console.log(`str_replace tool calls: `, strReplaceToolCalls);
|
|
360
|
+
}
|
|
361
|
+
// Filter out the tool calls which are for replacing code in existing files
|
|
362
|
+
const fileChanges = strReplaceToolCalls
|
|
363
|
+
.map((toolCall) => (0, utils_1.parseJson)(toolCall.function.arguments))
|
|
364
|
+
.filter((f) => f.filePath && fs_extra_1.default.existsSync(f.filePath));
|
|
365
|
+
// We add all the suggested changes to the updatedFiles array
|
|
366
|
+
// This is used to validate and format files later
|
|
367
|
+
updatedFiles.push(...fileChanges);
|
|
368
|
+
// applyChangesResponse contains the errors occurred while applying the changes
|
|
369
|
+
const updates = await (0, utils_2.applyFileChangesUsingStrReplace)({
|
|
370
|
+
trace: codeEditorSpan,
|
|
371
|
+
fileChanges,
|
|
372
|
+
logger,
|
|
373
|
+
});
|
|
374
|
+
const failedCodeUpdates = updates.filter((f) => f?.error);
|
|
375
|
+
if (failedCodeUpdates.length > 0) {
|
|
376
|
+
logger?.log(`Failed to apply changes, retrying...`, failedCodeUpdates);
|
|
417
377
|
}
|
|
418
378
|
}
|
|
419
379
|
return deDupUpdatedFiles(updatedFiles);
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"lexical-scoped-vars.d.ts","sourceRoot":"","sources":["../../../src/agent/codegen/lexical-scoped-vars.ts"],"names":[],"mappings":"AAAA,OAAO,EAAO,WAAW,EAAE,MAAM,mBAAmB,CAAC;
|
|
1
|
+
{"version":3,"file":"lexical-scoped-vars.d.ts","sourceRoot":"","sources":["../../../src/agent/codegen/lexical-scoped-vars.ts"],"names":[],"mappings":"AAAA,OAAO,EAAO,WAAW,EAAE,MAAM,mBAAmB,CAAC;AASrD,OAAO,EAAE,oBAAoB,EAAE,MAAM,aAAa,CAAC;AAEnD,wBAAsB,oBAAoB,CAAC,EACzC,KAAK,EACL,IAAI,EACJ,cAAc,EACd,OAAO,GACR,EAAE;IACD,KAAK,CAAC,EAAE,WAAW,CAAC;IACpB,IAAI,EAAE,MAAM,CAAC;IACb,cAAc,EAAE,MAAM,CAAC;IACvB,OAAO,CAAC,EAAE,oBAAoB,CAAC;CAChC,qBAoDA"}
|
|
@@ -4,12 +4,12 @@ exports.getLexicalScopedVars = void 0;
|
|
|
4
4
|
const llm_1 = require("@empiricalrun/llm");
|
|
5
5
|
const constants_1 = require("../../constants");
|
|
6
6
|
const promptTemplate_0 = "{{#section \"system\"}}\nYou are a software engineer tasked with analysing Typescript code to identify all variables available in the lexical\nscope at a specific reference point within a file. You will be given a file that contains multiple Playwright tests or\npage object models, along with a reference point inside the file. Your goal is to evaluate the list of all variables\navailable in the lexical scope at that reference point.\n\nTo accomplish this, you need to evaluate the Abstract Syntax Tree (AST) and accumulate all variables that are in the\nlexical scope, which includes:\n1. Variables declared within the test before the reference point.\n2. Arguments of the function.\n3. Variables defined in the parent scope. Identify all variables available in the lexical scope at a specific execution\nreference point within a file, considering only those variables that have been declared and assigned prior to the\nexecution of this point in the code.\n4. Global variables defined in the file.\n\nBefore responding:\n- Ignore variables imported from the `\"./pages\"` path.\n- keep in mind temporal dead zone phenomenon before responding with variables\n{{/section}}\n\n{{#section \"user\"}}\nFile:\n{{testFile}}\n\nReference point:\n{{referencePoint}}\n{{/section}}";
|
|
7
|
-
const
|
|
7
|
+
const lib_1 = require("../../prompts/lib");
|
|
8
8
|
async function getLexicalScopedVars({ trace, file, referencePoint, options, }) {
|
|
9
9
|
const fetchLexicalScopedVarsSpan = trace?.span({
|
|
10
10
|
name: "lexical-scoped-vars",
|
|
11
11
|
});
|
|
12
|
-
const messages = await (0,
|
|
12
|
+
const messages = await (0, lib_1.compilePrompt)(promptTemplate_0, {
|
|
13
13
|
testFile: file || "",
|
|
14
14
|
referencePoint: referencePoint || "",
|
|
15
15
|
});
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"element-annotation.d.ts","sourceRoot":"","sources":["../../../src/agent/master/element-annotation.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,GAAG,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;
|
|
1
|
+
{"version":3,"file":"element-annotation.d.ts","sourceRoot":"","sources":["../../../src/agent/master/element-annotation.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,GAAG,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AACrD,OAAO,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AASlC,OAAO,EAAE,oBAAoB,EAAE,MAAM,aAAa,CAAC;AAEnD,OAAO,EAAE,UAAU,EAAE,MAAM,qBAAqB,CAAC;AA0DjD,wBAAsB,oBAAoB,CAAC,EACzC,kBAAkB,EAClB,WAAW,EACX,mBAAmB,EACnB,KAAK,EACL,GAAG,EACH,OAAO,EACP,UAAU,GACX,EAAE;IACD,kBAAkB,EAAE,MAAM,CAAC;IAC3B,WAAW,EAAE,MAAM,CAAC;IACpB,mBAAmB,EAAE,MAAM,CAAC;IAC5B,KAAK,CAAC,EAAE,WAAW,CAAC;IACpB,GAAG,CAAC,EAAE,GAAG,CAAC;IACV,OAAO,CAAC,EAAE,oBAAoB,CAAC;IAC/B,UAAU,EAAE,oBAAoB,CAAC;CAClC,GAAG,OAAO,CAAC,MAAM,GAAG,SAAS,CAAC,CA8C9B;AAED,MAAM,MAAM,oBAAoB,GAAG;IACjC,UAAU,EACN,KAAK,GACL,UAAU,CAAC,IAAI,GACf,UAAU,CAAC,WAAW,GACtB,UAAU,CAAC,MAAM,CAAC;IACtB,aAAa,CAAC,EAAE,MAAM,GAAG,SAAS,CAAC;CACpC,CAAC;AAEF,wBAAsB,iBAAiB,CAAC,EACtC,IAAI,EACJ,UAAU,EACV,OAAO,GACR,EAAE;IACD,IAAI,EAAE,IAAI,CAAC;IACX,UAAU,EAAE,oBAAoB,CAAC;IACjC,OAAO,EAAE,oBAAoB,CAAC;CAC/B,GAAG,OAAO,CAAC;IACV,cAAc,EAAE;QAAE,SAAS,EAAE,MAAM,CAAC;QAAC,IAAI,EAAE,MAAM,CAAA;KAAE,EAAE,CAAC;IACtD,gBAAgB,EAAE,MAAM,CAAC;IACzB,uBAAuB,EAAE,MAAM,CAAC;CACjC,CAAC,CAqDD"}
|
|
@@ -2,8 +2,9 @@
|
|
|
2
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
3
|
exports.getAnnotationKeys = exports.getElementAnnotation = void 0;
|
|
4
4
|
const llm_1 = require("@empiricalrun/llm");
|
|
5
|
-
const vision_1 = require("@empiricalrun/llm/vision");
|
|
6
5
|
const constants_1 = require("../../constants");
|
|
6
|
+
const promptTemplate_0 = "{{#section \"system\"}}\nYou are an expert in describing the images and it's content. You need to provide the descriptions of annotated elements\npresent in the image.\n\nYou will be provided with an annotated screenshot where interact-able / clickable elements are annotated. The annotation\nis done by drawing a red box around the element and a small yellow box on it which contains unique element id.\n\nYou are given a Annotations which contains list of unique element id and description of the element separated by \":\".\n\nYou are also given the description of the element on which the action needs to be taken. The description includes\ninformation about how the element looks, it's position etc.\n\nYour task is to provide the annotation of the element on which the action needs to be performed based on the element\ndescription.\n\nFollow steps to fulfil your task:\n- Using the list of all element Ids provided to you, map all the element Ids on the annotated screen and describe each\nelement.\n- For describing each element Id\n-- iterate over each element Id in annotation list\n-- check if the description is already present for the element Id in the Annotation provided to you. If present skip\ndescribing it and use it as is.\n-- if the description is NA, then identify the element in the annotated screenshot and describe it using the image or\nicon enclosed in the element.\n- Respond with the mapped element Ids as \"enriched_annotations\"\n- Based on the description provided to you and the enriched annotations, first identify the element Id whose description\nmatches the task provided\n\nNote:\n- Ensure providing the description of all the elements in the list.\n- Don't update the description if its already present in the given annotations\n- Replace all the \"NA\" with description of the element. 
Its position, how does it look like etc.\n- There should be no \"NA\" present in any of the element description\n{{/section}}\n\n{{#section \"user\"}}\nElement description:\n{{elementDescription}}\n\nAnnotations:\n{{annotations}}\n\n{{image annotatedScreenshot}}\n{{/section}}";
|
|
7
|
+
const lib_1 = require("../../prompts/lib");
|
|
7
8
|
const utils_1 = require("../utils");
|
|
8
9
|
const annotationToolAction = {
|
|
9
10
|
name: "element_annotation",
|
|
@@ -66,58 +67,11 @@ async function getElementAnnotation({ elementDescription, annotations, annotated
|
|
|
66
67
|
preference,
|
|
67
68
|
},
|
|
68
69
|
});
|
|
69
|
-
const
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
You are given a Annotations which contains list of unique element id and description of the element separated by ":".
|
|
76
|
-
|
|
77
|
-
You are also given the description of the element on which the action needs to be taken. The description includes information about how the element looks, it's position etc.
|
|
78
|
-
|
|
79
|
-
Your task is to provide the annotation of the element on which the action needs to be performed based on the element description.
|
|
80
|
-
|
|
81
|
-
Follow steps to fulfil your task:
|
|
82
|
-
- Using the list of all element Ids provided to you, map all the element Ids on the annotated screen and describe each element.
|
|
83
|
-
- For describing each element Id
|
|
84
|
-
-- iterate over each element Id in annotation list
|
|
85
|
-
-- check if the description is already present for the element Id in the Annotation provided to you. If present skip describing it and use it as is.
|
|
86
|
-
-- if the description is NA, then identify the element in the annotated screenshot and describe it using the image or icon enclosed in the element.
|
|
87
|
-
- Respond with the mapped element Ids as "enriched_annotations"
|
|
88
|
-
- Based on the description provided to you and the enriched annotations, first identify the element Id whose description matches the task provided
|
|
89
|
-
|
|
90
|
-
Note:
|
|
91
|
-
- Ensure providing the description of all the elements in the list.
|
|
92
|
-
- Don't update the description if its already present in the given annotations
|
|
93
|
-
- Replace all the "NA" with description of the element. Its position, how does it look like etc.
|
|
94
|
-
- There should be no "NA" present in any of the element description
|
|
95
|
-
`,
|
|
96
|
-
};
|
|
97
|
-
const userMessage = {
|
|
98
|
-
role: "user",
|
|
99
|
-
content: [
|
|
100
|
-
{
|
|
101
|
-
type: "text",
|
|
102
|
-
text: `
|
|
103
|
-
Element description:
|
|
104
|
-
${elementDescription}
|
|
105
|
-
|
|
106
|
-
Annotations:
|
|
107
|
-
${annotations}`,
|
|
108
|
-
},
|
|
109
|
-
{
|
|
110
|
-
type: "image_url",
|
|
111
|
-
image_url: {
|
|
112
|
-
url: (0, vision_1.imageFormatForProvider)(options?.modelProvider || constants_1.DEFAULT_MODEL_PROVIDER, annotatedScreenshot),
|
|
113
|
-
},
|
|
114
|
-
},
|
|
115
|
-
],
|
|
116
|
-
};
|
|
117
|
-
const messages = [
|
|
118
|
-
systemMessage,
|
|
119
|
-
userMessage,
|
|
120
|
-
];
|
|
70
|
+
const messages = (0, lib_1.compilePrompt)(promptTemplate_0, {
|
|
71
|
+
elementDescription,
|
|
72
|
+
annotations,
|
|
73
|
+
annotatedScreenshot,
|
|
74
|
+
}, options);
|
|
121
75
|
llm =
|
|
122
76
|
llm ||
|
|
123
77
|
new llm_1.LLM({
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"run.d.ts","sourceRoot":"","sources":["../../../src/agent/master/run.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAclC,OAAO,EAAE,SAAS,EAAE,QAAQ,EAAE,MAAM,aAAa,CAAC;AAClD,OAAO,EACL,oBAAoB,EAErB,MAAM,aAAa,CAAC;AA4BrB,wBAAsB,0BAA0B,CAAC,EAC/C,IAAI,EACJ,IAAI,EACJ,QAAQ,EACR,OAAO,EACP,SAAS,GACV,EAAE;IACD,IAAI,EAAE,MAAM,CAAC;IACb,IAAI,EAAE,IAAI,CAAC;IACX,QAAQ,CAAC,EAAE,QAAQ,CAAC;IACpB,OAAO,EAAE,oBAAoB,CAAC;IAC9B,SAAS,CAAC,EAAE,SAAS,CAAC;CACvB;;;
|
|
1
|
+
{"version":3,"file":"run.d.ts","sourceRoot":"","sources":["../../../src/agent/master/run.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAclC,OAAO,EAAE,SAAS,EAAE,QAAQ,EAAE,MAAM,aAAa,CAAC;AAClD,OAAO,EACL,oBAAoB,EAErB,MAAM,aAAa,CAAC;AA4BrB,wBAAsB,0BAA0B,CAAC,EAC/C,IAAI,EACJ,IAAI,EACJ,QAAQ,EACR,OAAO,EACP,SAAS,GACV,EAAE;IACD,IAAI,EAAE,MAAM,CAAC;IACb,IAAI,EAAE,IAAI,CAAC;IACX,QAAQ,CAAC,EAAE,QAAQ,CAAC;IACpB,OAAO,EAAE,oBAAoB,CAAC;IAC9B,SAAS,CAAC,EAAE,SAAS,CAAC;CACvB;;;GAoVA"}
|
package/dist/agent/master/run.js
CHANGED
|
@@ -150,11 +150,11 @@ async function createTestUsingMasterAgent({ task, page, testCase, options, scope
|
|
|
150
150
|
const masterAgentActionSpan = masterAgentSpan?.span({
|
|
151
151
|
name: "master-agent-execute-action",
|
|
152
152
|
});
|
|
153
|
+
output = {
|
|
154
|
+
action: args.action || args.skill,
|
|
155
|
+
reason: args.reason,
|
|
156
|
+
};
|
|
153
157
|
try {
|
|
154
|
-
output = {
|
|
155
|
-
action: args.action || args.skill,
|
|
156
|
-
reason: args.reason,
|
|
157
|
-
};
|
|
158
158
|
void testGenReporter.sendMessage(output.reason);
|
|
159
159
|
logger.log(`Next Action: ${output.action}`);
|
|
160
160
|
if (toolCall.actionType === skill_1.SKILL_USAGE) {
|
|
@@ -254,7 +254,6 @@ async function createTestUsingMasterAgent({ task, page, testCase, options, scope
|
|
|
254
254
|
generatedCodeSteps = await (0, browsing_1.executeTaskUsingBrowsingAgent)({
|
|
255
255
|
trace: masterAgentActionSpan,
|
|
256
256
|
action: output.action,
|
|
257
|
-
logger,
|
|
258
257
|
page,
|
|
259
258
|
options,
|
|
260
259
|
llm,
|
|
@@ -307,8 +306,8 @@ async function createTestUsingMasterAgent({ task, page, testCase, options, scope
|
|
|
307
306
|
trace?.update({ input: { task }, output: { output } });
|
|
308
307
|
masterAgentSpan?.end({
|
|
309
308
|
output: {
|
|
310
|
-
action: output
|
|
311
|
-
reason: output
|
|
309
|
+
action: output?.action,
|
|
310
|
+
reason: output?.reason,
|
|
312
311
|
code: generatedCodeSteps,
|
|
313
312
|
},
|
|
314
313
|
});
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
import { LLMProvider } from "@empiricalrun/llm";
|
|
2
|
+
import OpenAI from "openai";
|
|
3
|
+
type PromptOptions = {
|
|
4
|
+
modelProvider?: LLMProvider;
|
|
5
|
+
};
|
|
6
|
+
export declare function compilePrompt<T extends object>(promptTemplate: string, params: T, options?: PromptOptions): OpenAI.Chat.Completions.ChatCompletionMessageParam[];
|
|
7
|
+
export {};
|
|
8
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/prompts/lib/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAGhD,OAAO,MAAM,MAAM,QAAQ,CAAC;AAyF5B,KAAK,aAAa,GAAG;IACnB,aAAa,CAAC,EAAE,WAAW,CAAC;CAC7B,CAAC;AAEF,wBAAgB,aAAa,CAAC,CAAC,SAAS,MAAM,EAC5C,cAAc,EAAE,MAAM,EACtB,MAAM,EAAE,CAAC,EACT,OAAO,CAAC,EAAE,aAAa,GACtB,MAAM,CAAC,IAAI,CAAC,WAAW,CAAC,0BAA0B,EAAE,CAwCtD"}
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
3
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
4
|
+
};
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.compilePrompt = void 0;
|
|
7
|
+
const vision_1 = require("@empiricalrun/llm/vision");
|
|
8
|
+
const handlebars_1 = __importDefault(require("handlebars"));
|
|
9
|
+
const constants_1 = require("../../constants");
|
|
10
|
+
/**
 * In-memory store for the named sections captured while a prompt template
 * is rendered (see the "section" Handlebars helper).
 */
class SectionManager {
    // Null-prototype object: only names that were explicitly set are visible,
    // so a lookup such as "toString" or "constructor" cannot resolve to an
    // inherited Object.prototype member.
    sections = Object.create(null);
    /**
     * @param name section name
     * @returns the stored content, or "" when the section was never set (or is empty)
     */
    getSection(name) {
        return this.sections[name] || "";
    }
    /**
     * Stores `content` under `name`, replacing any previous value.
     */
    setSection(name, content) {
        this.sections[name] = content;
    }
    /**
     * @returns the live name -> content map (not a copy)
     */
    getAllSections() {
        return this.sections;
    }
}
|
|
22
|
+
const IMAGE_TOKEN_PREFIX = "[[[HANDLEBARS_IMAGE:";
|
|
23
|
+
const IMAGE_TOKEN_SUFFIX = "]]]";
|
|
24
|
+
/**
 * Builds an isolated Handlebars environment together with the SectionManager
 * that its "section" helper writes into.
 *
 * Registered helpers:
 *  - section (block): renders its body with the current context, stores the
 *    result under the given name and emits nothing in place.
 *  - image: emits a placeholder token wrapping the URI-encoded JSON `{ url }`.
 *  - images: emits one such token per array entry, concatenated; emits ""
 *    when the argument is not an array.
 *
 * @returns `{ HandlebarsEnv, sectionManager }`
 */
function createHandlebarsEnv() {
    const sectionManager = new SectionManager();
    const HandlebarsEnv = handlebars_1.default.create();
    // Use encodeURIComponent to avoid conflicts with special characters.
    const toImageToken = (url) => {
        const encodedPayload = encodeURIComponent(JSON.stringify({ url }));
        return `${IMAGE_TOKEN_PREFIX}${encodedPayload}${IMAGE_TOKEN_SUFFIX}`;
    };
    // Must stay a `function` expression: Handlebars supplies the render
    // context as `this`, which the block body is rendered against.
    HandlebarsEnv.registerHelper("section", function (name, options) {
        sectionManager.setSection(name, options.fn(this));
        return ""; // don't output anything in place
    });
    HandlebarsEnv.registerHelper("image", function (imageParam) {
        return toImageToken(imageParam);
    });
    HandlebarsEnv.registerHelper("images", function (imagesParam) {
        return Array.isArray(imagesParam)
            ? imagesParam.map((url) => toImageToken(url)).join("")
            : "";
    });
    return { HandlebarsEnv, sectionManager };
}
|
|
50
|
+
/**
 * Converts the rendered text of one template section into chat-message content.
 *
 * Image placeholder tokens of the form `[[[HANDLEBARS_IMAGE:<payload>]]]`,
 * where the payload is URI-encoded JSON carrying a `url`, become `image_url`
 * parts. The text between tokens becomes trimmed `text` parts, and stretches
 * that are empty after trimming are dropped.
 *
 * @param content rendered section text
 * @returns the trimmed string when the text holds no complete image token;
 *          otherwise an ordered array of `text` / `image_url` parts
 */
function processSectionContent(content) {
    // The pattern is the single source of truth for what counts as a token, so
    // a stray prefix without a closing "]]]" is treated as ordinary text.
    const tokenPattern = /\[\[\[HANDLEBARS_IMAGE:(.*?)\]\]\]/g;
    const segments = [];
    let cursor = 0;
    let sawToken = false;
    for (const match of content.matchAll(tokenPattern)) {
        sawToken = true;
        // Text that precedes this token.
        const leadingText = content.slice(cursor, match.index).trim();
        if (leadingText) {
            segments.push({ type: "text", text: leadingText });
        }
        try {
            const payload = JSON.parse(decodeURIComponent(match[1]));
            segments.push({ type: "image_url", image_url: { url: payload.url } });
        }
        catch {
            // If decoding/parsing fails, treat the token as literal text.
            segments.push({ type: "text", text: match[0] });
        }
        cursor = match.index + match[0].length;
    }
    if (!sawToken) {
        return content.trim();
    }
    const trailingText = content.slice(cursor).trim();
    if (trailingText) {
        segments.push({ type: "text", text: trailingText });
    }
    return segments;
}
|
|
82
|
+
/**
 * Renders a Handlebars prompt template into a [system, user] chat message pair.
 *
 * The template must define both a `{{#section "system"}}` and a
 * `{{#section "user"}}` block. Images placed in the user section through the
 * `image` / `images` helpers become `image_url` parts whose URL is converted
 * with `imageFormatForProvider` for the selected provider.
 *
 * @param promptTemplate Handlebars template source
 * @param params values made available to the template
 * @param options optional settings; `modelProvider` selects the image format
 *        and falls back to DEFAULT_MODEL_PROVIDER when absent
 * @returns `[{ role: "system", ... }, { role: "user", ... }]`
 * @throws Error when either section is missing or renders empty, or when the
 *         system section contains an image
 */
function compilePrompt(promptTemplate, params, options) {
    const { HandlebarsEnv, sectionManager } = createHandlebarsEnv();
    // Rendering is done for its side effect only: each section helper records
    // its content in sectionManager, so the rendered output itself is discarded.
    const template = HandlebarsEnv.compile(promptTemplate, { noEscape: true });
    template(params);
    const sections = sectionManager.getAllSections();
    const system = sections["system"];
    const user = sections["user"];
    if (!system || !user) {
        // TODO: support templates that have only one section
        throw new Error("Both system and user sections must be defined in the template");
    }
    const systemContent = processSectionContent(system);
    // System messages carry text only; fail early with a clear message rather
    // than returning a system message that holds an image part.
    if (Array.isArray(systemContent) &&
        systemContent.some((part) => part.type === "image_url")) {
        throw new Error("The system section cannot contain images; move them to the user section");
    }
    const userContent = processSectionContent(user);
    let userContentCorrectedForImageFormat = userContent;
    if (Array.isArray(userContent)) {
        const provider = options?.modelProvider || constants_1.DEFAULT_MODEL_PROVIDER;
        // Rewrite only the image parts; text parts pass through untouched.
        userContentCorrectedForImageFormat = userContent.map((part) => part.type === "image_url"
            ? {
                ...part,
                image_url: {
                    url: (0, vision_1.imageFormatForProvider)(provider, part.image_url.url),
                },
            }
            : part);
    }
    return [
        { role: "system", content: systemContent },
        { role: "user", content: userContentCorrectedForImageFormat },
    ];
}
|
|
118
|
+
exports.compilePrompt = compilePrompt;
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@empiricalrun/test-gen",
|
|
3
|
-
"version": "0.42.
|
|
3
|
+
"version": "0.42.19",
|
|
4
4
|
"publishConfig": {
|
|
5
5
|
"registry": "https://registry.npmjs.org/",
|
|
6
6
|
"access": "public"
|
|
@@ -72,7 +72,7 @@
|
|
|
72
72
|
"ts-morph": "^23.0.0",
|
|
73
73
|
"tsx": "^4.16.2",
|
|
74
74
|
"typescript": "^5.3.3",
|
|
75
|
-
"@empiricalrun/llm": "^0.9.
|
|
75
|
+
"@empiricalrun/llm": "^0.9.32",
|
|
76
76
|
"@empiricalrun/r2-uploader": "^0.3.8",
|
|
77
77
|
"@empiricalrun/reporter": "^0.23.1"
|
|
78
78
|
},
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"promptBuilder.d.ts","sourceRoot":"","sources":["../../../src/agent/codegen/promptBuilder.ts"],"names":[],"mappings":"AACA,OAAO,MAAM,MAAM,QAAQ,CAAC;AA6B5B,wBAAsB,aAAa,CAAC,CAAC,SAAS,MAAM,EAClD,cAAc,EAAE,MAAM,EACtB,MAAM,EAAE,CAAC,GACR,OAAO,CAAC,MAAM,CAAC,IAAI,CAAC,WAAW,CAAC,0BAA0B,EAAE,CAAC,CAe/D"}
|
|
@@ -1,44 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
3
|
-
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
4
|
-
};
|
|
5
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
-
exports.compilePrompt = void 0;
|
|
7
|
-
const handlebars_1 = __importDefault(require("handlebars"));
|
|
8
|
-
class SectionManager {
|
|
9
|
-
sections = {};
|
|
10
|
-
getSection(name) {
|
|
11
|
-
return this.sections[name] || "";
|
|
12
|
-
}
|
|
13
|
-
setSection(name, content) {
|
|
14
|
-
this.sections[name] = content;
|
|
15
|
-
}
|
|
16
|
-
getAllSections() {
|
|
17
|
-
return this.sections;
|
|
18
|
-
}
|
|
19
|
-
}
|
|
20
|
-
function createHandlebarsEnv() {
|
|
21
|
-
const HandlebarsEnv = handlebars_1.default.create();
|
|
22
|
-
const sectionManager = new SectionManager();
|
|
23
|
-
HandlebarsEnv.registerHelper("section", function (name, options) {
|
|
24
|
-
const content = options.fn(this);
|
|
25
|
-
sectionManager.setSection(name, content);
|
|
26
|
-
return ""; // Don't output anything in place
|
|
27
|
-
});
|
|
28
|
-
return { HandlebarsEnv, sectionManager };
|
|
29
|
-
}
|
|
30
|
-
async function compilePrompt(promptTemplate, params) {
|
|
31
|
-
const { HandlebarsEnv, sectionManager } = createHandlebarsEnv();
|
|
32
|
-
const template = HandlebarsEnv.compile(promptTemplate, { noEscape: true });
|
|
33
|
-
template(params);
|
|
34
|
-
const { system, user } = sectionManager.getAllSections();
|
|
35
|
-
if (!system || !user) {
|
|
36
|
-
// TODO: support templates that have only one section
|
|
37
|
-
throw new Error("Both system and user sections must be defined in the template");
|
|
38
|
-
}
|
|
39
|
-
return [
|
|
40
|
-
{ role: "system", content: system },
|
|
41
|
-
{ role: "user", content: user },
|
|
42
|
-
];
|
|
43
|
-
}
|
|
44
|
-
exports.compilePrompt = compilePrompt;
|
|
@@ -1,13 +0,0 @@
|
|
|
1
|
-
import { TraceClient } from "@empiricalrun/llm";
|
|
2
|
-
/**
|
|
3
|
-
* This agent is used to verify whether the task is done basis the conversation history
|
|
4
|
-
*/
|
|
5
|
-
export declare function verificationAgent({ trace, task, conversation, }: {
|
|
6
|
-
trace?: TraceClient;
|
|
7
|
-
conversation: string[];
|
|
8
|
-
task: string;
|
|
9
|
-
}): Promise<{
|
|
10
|
-
isDone: boolean;
|
|
11
|
-
reason: string;
|
|
12
|
-
}>;
|
|
13
|
-
//# sourceMappingURL=index.d.ts.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/agent/verification/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAkB,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAIhE;;GAEG;AACH,wBAAsB,iBAAiB,CAAC,EACtC,KAAK,EACL,IAAI,EACJ,YAAY,GACb,EAAE;IACD,KAAK,CAAC,EAAE,WAAW,CAAC;IACpB,YAAY,EAAE,MAAM,EAAE,CAAC;IACvB,IAAI,EAAE,MAAM,CAAC;CACd;;;GA+EA"}
|
|
@@ -1,84 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
-
exports.verificationAgent = void 0;
|
|
4
|
-
const llm_1 = require("@empiricalrun/llm");
|
|
5
|
-
const utils_1 = require("../utils");
|
|
6
|
-
/**
|
|
7
|
-
* This agent is used to verify whether the task is done basis the conversation history
|
|
8
|
-
*/
|
|
9
|
-
async function verificationAgent({ trace, task, conversation, }) {
|
|
10
|
-
const verificationAgentSpan = trace?.span({
|
|
11
|
-
name: "verification-agent",
|
|
12
|
-
input: {
|
|
13
|
-
task,
|
|
14
|
-
conversation,
|
|
15
|
-
},
|
|
16
|
-
});
|
|
17
|
-
const messages = await (0, llm_1.getPrompt)("agent-steps-verification", {
|
|
18
|
-
task,
|
|
19
|
-
conversation: conversation.join("\n"),
|
|
20
|
-
}, 5);
|
|
21
|
-
const llm = new llm_1.LLM({ provider: "openai" });
|
|
22
|
-
const response = await llm.createChatCompletion({
|
|
23
|
-
trace: verificationAgentSpan,
|
|
24
|
-
traceName: "verification-agent-llm",
|
|
25
|
-
model: "gpt-4o",
|
|
26
|
-
messages,
|
|
27
|
-
tools: [
|
|
28
|
-
{
|
|
29
|
-
type: "function",
|
|
30
|
-
function: {
|
|
31
|
-
name: "task_done",
|
|
32
|
-
description: "end the task by calling this method",
|
|
33
|
-
parameters: {
|
|
34
|
-
type: "object",
|
|
35
|
-
properties: {
|
|
36
|
-
actions: {
|
|
37
|
-
type: "string",
|
|
38
|
-
description: "actions extracted from task",
|
|
39
|
-
},
|
|
40
|
-
successful_actions: {
|
|
41
|
-
type: "string",
|
|
42
|
-
description: "successful actions mentioned in the conversation",
|
|
43
|
-
},
|
|
44
|
-
reason: {
|
|
45
|
-
type: "string",
|
|
46
|
-
description: "reasoning for identification of task status",
|
|
47
|
-
},
|
|
48
|
-
isDone: {
|
|
49
|
-
type: "boolean",
|
|
50
|
-
description: "whether the task is done",
|
|
51
|
-
},
|
|
52
|
-
},
|
|
53
|
-
required: ["isDone", "reason"],
|
|
54
|
-
},
|
|
55
|
-
},
|
|
56
|
-
},
|
|
57
|
-
],
|
|
58
|
-
modelParameters: {
|
|
59
|
-
tool_choice: "required",
|
|
60
|
-
temperature: 0.5,
|
|
61
|
-
},
|
|
62
|
-
});
|
|
63
|
-
const toolCallResp = (response?.tool_calls || [])[0];
|
|
64
|
-
if (toolCallResp) {
|
|
65
|
-
const toolCall = (0, utils_1.parseJson)(toolCallResp.function.arguments);
|
|
66
|
-
const output = {
|
|
67
|
-
isDone: toolCall.isDone,
|
|
68
|
-
reason: toolCall.reason,
|
|
69
|
-
};
|
|
70
|
-
verificationAgentSpan?.end({
|
|
71
|
-
output,
|
|
72
|
-
});
|
|
73
|
-
return output;
|
|
74
|
-
}
|
|
75
|
-
const output = {
|
|
76
|
-
isDone: false,
|
|
77
|
-
reason: "LLM failed to generate a valid response",
|
|
78
|
-
};
|
|
79
|
-
verificationAgentSpan?.end({
|
|
80
|
-
output,
|
|
81
|
-
});
|
|
82
|
-
return output;
|
|
83
|
-
}
|
|
84
|
-
exports.verificationAgent = verificationAgent;
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"verification-agent.evals.d.ts","sourceRoot":"","sources":["../../src/evals/verification-agent.evals.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,UAAU,EAAE,MAAM,QAAQ,CAAC;AAEpC,eAAO,MAAM,qBAAqB,EAAE,UAgBnC,CAAC;AAEF,eAAe,qBAAqB,CAAC"}
|
|
@@ -1,23 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
-
exports.verifierAgentEvaluate = void 0;
|
|
4
|
-
const verification_1 = require("../agent/verification");
|
|
5
|
-
const verifierAgentEvaluate = async ({ item, trace }) => {
|
|
6
|
-
const { conversation = [], task = "" } = item.input;
|
|
7
|
-
const output = await (0, verification_1.verificationAgent)({
|
|
8
|
-
conversation,
|
|
9
|
-
trace,
|
|
10
|
-
task,
|
|
11
|
-
});
|
|
12
|
-
return {
|
|
13
|
-
scores: [
|
|
14
|
-
{
|
|
15
|
-
name: "equality",
|
|
16
|
-
value: item.expectedOutput.isDone === output.isDone ? 1 : 0,
|
|
17
|
-
},
|
|
18
|
-
],
|
|
19
|
-
output,
|
|
20
|
-
};
|
|
21
|
-
};
|
|
22
|
-
exports.verifierAgentEvaluate = verifierAgentEvaluate;
|
|
23
|
-
exports.default = exports.verifierAgentEvaluate;
|