@empiricalrun/test-gen 0.42.29 → 0.43.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +11 -0
- package/dist/agent/diagnosis-agent/index.d.ts +18 -0
- package/dist/agent/diagnosis-agent/index.d.ts.map +1 -0
- package/dist/agent/diagnosis-agent/index.js +105 -0
- package/dist/agent/diagnosis-agent/strict-mode-violation.d.ts +9 -0
- package/dist/agent/diagnosis-agent/strict-mode-violation.d.ts.map +1 -0
- package/dist/agent/diagnosis-agent/strict-mode-violation.js +31 -0
- package/dist/bin/index.js +15 -0
- package/package.json +4 -4
package/CHANGELOG.md
CHANGED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import { TraceClient } from "@empiricalrun/llm";
|
|
2
|
+
import { TestErrorDiagnosisDetails, TestGenConfigOptions } from "@empiricalrun/shared-types";
|
|
3
|
+
import { CustomLogger } from "../../bin/logger";
|
|
4
|
+
/**
|
|
5
|
+
* Creates a task from a test failure diagnosis.
|
|
6
|
+
* inputs
|
|
7
|
+
* - options, trace, logger (all optional)
|
|
8
|
+
* - diagnosis (required); the returned promise resolves to { task: string }
|
|
9
|
+
*/
|
|
10
|
+
export declare function createTaskUsingFailureDiagnosis({ options, trace, diagnosis, logger, }: {
|
|
11
|
+
options?: TestGenConfigOptions;
|
|
12
|
+
trace?: TraceClient;
|
|
13
|
+
diagnosis: TestErrorDiagnosisDetails;
|
|
14
|
+
logger?: CustomLogger;
|
|
15
|
+
}): Promise<{
|
|
16
|
+
task: string;
|
|
17
|
+
}>;
|
|
18
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/agent/diagnosis-agent/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAyB,WAAW,EAAE,MAAM,mBAAmB,CAAC;AACvE,OAAO,EACL,yBAAyB,EACzB,oBAAoB,EACrB,MAAM,4BAA4B,CAAC;AAEpC,OAAO,EAAE,YAAY,EAAE,MAAM,kBAAkB,CAAC;AAuChD;;;;;GAKG;AACH,wBAAsB,+BAA+B,CAAC,EACpD,OAAO,EACP,KAAK,EACL,SAAS,EACT,MAAM,GACP,EAAE;IACD,OAAO,CAAC,EAAE,oBAAoB,CAAC;IAC/B,KAAK,CAAC,EAAE,WAAW,CAAC;IACpB,SAAS,EAAE,yBAAyB,CAAC;IACrC,MAAM,CAAC,EAAE,YAAY,CAAC;CACvB,GAAG,OAAO,CAAC;IAAE,IAAI,EAAE,MAAM,CAAA;CAAE,CAAC,CA8E5B"}
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.createTaskUsingFailureDiagnosis = void 0;
|
|
4
|
+
const llm_1 = require("@empiricalrun/llm");
|
|
5
|
+
const session_1 = require("../../session");
|
|
6
|
+
const strict_mode_violation_1 = require("./strict-mode-violation");
|
|
7
|
+
const session = (0, session_1.getSessionDetails)();
|
|
8
|
+
/**
 * Structured-output format passed to the chat completion call: a strict JSON
 * schema with exactly two required fields, `observation` (list of strings)
 * and `action` (string), and no additional properties.
 */
const responseFormat = (() => {
    const observation = {
        type: "array",
        items: { type: "string" },
        description: "Detailed observation of what changed between successful and failed test screenshots",
    };
    const action = {
        type: "string",
        description: "Direct action to fix the test in natural language without code snippets or options",
    };
    const schema = {
        type: "object",
        properties: { observation, action },
        required: ["observation", "action"],
        additionalProperties: false,
    };
    return {
        type: "json_schema",
        json_schema: {
            name: "test-case-auto-fix-summary",
            strict: true,
            schema,
        },
    };
})();
|
|
33
|
+
/**
 * Turns a test failure diagnosis into a natural-language fix task.
 *
 * inputs
 * - options: optional; only `metadata.projectRepoName` is read (for tracing)
 * - trace: optional; when omitted, a new "infer-agent-task" trace is started
 *   if a langfuse instance is available
 * - diagnosis: required; `diagnosisId`, `keyMomentsUrl` and
 *   `failed_run_metadata.stack` are read
 * - logger: optional; receives progress messages
 *
 * Resolves to `{ task }`, where `task` is the action proposed by the model.
 * `task` is an empty string when no fix recipe matches the failure, when the
 * key moments request returns a non-ok response, or when the model returns no
 * usable action.
 */
async function createTaskUsingFailureDiagnosis({ options, trace, diagnosis, logger, }) {
    trace =
        trace ||
            llm_1.langfuseInstance?.trace({
                name: "infer-agent-task",
                id: crypto.randomUUID(),
                release: session.version,
            });
    const failureDiagnosisSpan = trace?.span({
        name: "auto-fix",
        input: {
            diagnosisId: diagnosis.diagnosisId,
            // `options` may arrive without `metadata`; tracing must not throw because of it.
            prjRepoName: options?.metadata?.projectRepoName,
        },
    });
    // TODO: make this dynamic in nature. the prompts should be made recipes
    // which will help to get rid of if else logic
    // a recipe should have:
    // 1. selection criteria
    // 2. job to be done - in this case generate a prompt
    const errorStack = diagnosis.failed_run_metadata?.stack ?? "";
    if (!errorStack.includes("strict mode violation")) {
        // No recipe matches this failure, so skip the key-moments download and
        // the LLM client setup whose results would be discarded anyway.
        // TODO: handle default prompt
        return {
            task: "",
        };
    }
    logger?.log("Trying to fix the test using failure diagnosis. Fetching key moments of the diagnosis");
    const resp = await fetch(diagnosis.keyMomentsUrl);
    if (!resp.ok) {
        // An error response carries no screenshots: do not parse it or prompt with it.
        logger?.warn("Failed to fetch key moments of the diagnosis");
        return {
            task: "",
        };
    }
    logger?.success("Successfully fetched key moments of the diagnosis");
    const screenshotsData = await resp.json();
    const llm = new llm_1.LLM({
        provider: "openai",
        defaultModel: "o1",
        trace,
    });
    const prompt = (0, strict_mode_violation_1.fixStrictModeViolationPrompt)({
        screenshotsData,
        diagnosis,
    });
    const llmResponse = await llm.createChatCompletion({
        messages: prompt,
        modelParameters: {
            max_completion_tokens: 40000,
        },
        responseFormat,
    });
    if (!llmResponse?.content) {
        // JSON.parse(undefined) would throw; an empty answer simply means "no task".
        logger?.warn("Model returned no content for the failure diagnosis");
        return {
            task: "",
        };
    }
    const { observation, action } = JSON.parse(llmResponse.content);
    failureDiagnosisSpan?.update({
        output: {
            observation,
            action,
        },
    });
    return {
        // Callers treat a falsy task as "keep the original task".
        task: typeof action === "string" ? action : "",
    };
}
|
|
105
|
+
exports.createTaskUsingFailureDiagnosis = createTaskUsingFailureDiagnosis;
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
import { TestErrorDiagnosisDetails } from "@empiricalrun/shared-types";
|
|
2
|
+
/**
 * Builds the chat messages used to ask the model for a fix to a failure
 * whose error stack reports a "strict mode violation".
 *
 * - screenshotsData: `success` and `failure` string lists that are rendered
 *   into the prompt as images (presumably image URLs or encoded images;
 *   confirm against the key-moments payload)
 * - diagnosis: failure diagnosis supplying the failing line and error stack
 *
 * Returns the compiled prompt as OpenAI chat completion message params.
 */
export declare function fixStrictModeViolationPrompt({ screenshotsData, diagnosis, }: {
|
|
3
|
+
screenshotsData: {
|
|
4
|
+
success: string[];
|
|
5
|
+
failure: string[];
|
|
6
|
+
};
|
|
7
|
+
diagnosis: TestErrorDiagnosisDetails;
|
|
8
|
+
}): import("openai/resources/index.mjs").ChatCompletionMessageParam[];
|
|
9
|
+
//# sourceMappingURL=strict-mode-violation.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"strict-mode-violation.d.ts","sourceRoot":"","sources":["../../../src/agent/diagnosis-agent/strict-mode-violation.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,yBAAyB,EAAE,MAAM,4BAA4B,CAAC;AAoBvE,wBAAgB,4BAA4B,CAAC,EAC3C,eAAe,EACf,SAAS,GACV,EAAE;IACD,eAAe,EAAE;QAAE,OAAO,EAAE,MAAM,EAAE,CAAC;QAAC,OAAO,EAAE,MAAM,EAAE,CAAA;KAAE,CAAC;IAC1D,SAAS,EAAE,yBAAyB,CAAC;CACtC,qEAiBA"}
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.fixStrictModeViolationPrompt = void 0;
|
|
4
|
+
const llm_1 = require("@empiricalrun/llm");
|
|
5
|
+
// Prompt template (system + user sections) for fixing "strict mode violation"
// failures. Placeholders: successScreenshots, failedScreenshots, failingLine,
// selectorOptions. The example action uses the same "Replace the failing line"
// opening that the format rule below requires.
const promptTemplate_0 = "{{#section \"system\"}}\nAs a software engineer, your task is to identify a fix for a failing Playwright test by analyzing screenshots of both the failed and successful test steps.\n\nKey issues for test failures include:\n- Duplicate elements for the same Playwright selector.\n\nInstructions:\n1. Examine the provided successful and failed test screenshots.\n2. Identify the correct element for action based on these observations on the successful test run screenshots.\n3. Sometimes the exact same locator is not available on failed test run screenshot, you need to identify the intent from successful test screenshots and apply that intent in failed test run screenshot to identify the right locator to interact with\n4. Evaluate the playwright selector options provided to you to execute the action. Pick the selector which best matches the intent of the test.\n5. Propose a precise action that addresses the issue.\n\nExample:\n- observation: \n - Current step failure: await page.getByText(\"Audience\").click()\n - Two similar buttons named \"Audience\" exist in the failed run screenshots\n - The successful test run clicked on \"Untracked Audience\"\n - The failed test run should click on \"Untracked Audience\"\n - Available locators: await page.getByText(\"Untracked Audience\").click() contain the selector for Untracked Audience\n- action: Replace the failing line with await page.getByText(\"Untracked Audience\").click()\n\nYour action should:\n- Be directly actionable and free of ambiguity, as it will guide another LLM to generate code.\n- Be in natural language and not just code snippet.\n- Be verified as feasible on the failure screen before responding.\n- Choose from the provided possible actions that can be executed on the failure screen.\n- Action should adhere to the format mentioned in the example, i.e. it should start with \"Replace the failing line\" and the updated code with replaced selector following it.\n\nEnsure the action is executable based on the failure screen context before providing it.\n{{/section}}\n\n{{#section \"user\"}}\nSuccessful test screenshots\n\n{{images successScreenshots}}\n\nFailed test screenshots\n\n{{images failedScreenshots}}\n\nStep where test failed:\n{{failingLine}}\n\nOptions for Playwright selectors to perform actions on a failed test screen:\n{{selectorOptions}}\n\n{{/section}}\n\n";
|
|
6
|
+
/**
 * Returns the index of the `)` that closes a call whose `(` sits just before
 * `from`, or -1 when the text contains no `)` after `from`.
 *
 * The scan stays on the current line, skips backslash-escaped characters and
 * ignores parentheses inside '…', "…" or `…` literals. When the parentheses do
 * not balance on that line (for example a quote inside a regex literal), it
 * falls back to the first `)` after `from`.
 */
function findLocatorCallEnd(text, from) {
    let depth = 1;
    let quote = "";
    for (let i = from; i < text.length; i++) {
        const ch = text[i];
        if (ch === "\n") {
            break;
        }
        if (ch === "\\") {
            // Skip the escaped character, whatever it is.
            i++;
            continue;
        }
        if (quote) {
            if (ch === quote) {
                quote = "";
            }
            continue;
        }
        if (ch === "'" || ch === '"' || ch === "`") {
            quote = ch;
        }
        else if (ch === "(") {
            depth++;
        }
        else if (ch === ")" && --depth === 0) {
            return i;
        }
    }
    return text.indexOf(")", from);
}
/**
 * Extracts the locator suggestions that follow "aka " in an error stack.
 *
 * Each suggestion is a chain of calls such as
 * `getByRole('button', { name: 'Save' })` or
 * `locator('div').filter({ hasText: /x/ }).nth(1)`. Arguments may contain
 * parentheses, either inside quoted strings or as nested calls; the whole call
 * is captured instead of being cut at the first `)`.
 *
 * @param errorStack error stack text; non-string input yields no options
 * @returns the locator expressions in order of appearance (possibly empty)
 */
function extractLocatorOptions(errorStack) {
    if (typeof errorStack !== "string") {
        return [];
    }
    const akaMarker = /aka\s+/g;
    // Sticky: must match exactly at the cursor, i.e. `name(` with nothing before it.
    const callHead = /[A-Za-z0-9_]+\(/y;
    const options = [];
    while (akaMarker.exec(errorStack) !== null) {
        const start = akaMarker.lastIndex;
        let end = start;
        let cursor = start;
        for (;;) {
            callHead.lastIndex = cursor;
            if (callHead.exec(errorStack) === null) {
                break;
            }
            const close = findLocatorCallEnd(errorStack, callHead.lastIndex);
            if (close === -1) {
                break;
            }
            end = close + 1;
            // A chain only continues through `.name(`.
            if (errorStack[end] !== ".") {
                break;
            }
            cursor = end + 1;
        }
        if (end > start) {
            options.push(errorStack.slice(start, end));
            akaMarker.lastIndex = end;
        }
    }
    return options;
}
|
|
19
|
+
/**
 * Compiles the strict-mode-violation prompt template into chat messages.
 *
 * The template receives the failing line, the success and failure screenshot
 * lists, and the locator options extracted from the failed run's error stack
 * (one per line). Images are compiled with "high" detail for the "openai"
 * provider.
 */
function fixStrictModeViolationPrompt({ screenshotsData, diagnosis, }) {
    const templateVariables = {
        failingLine: diagnosis.failingLine,
        successScreenshots: screenshotsData.success,
        failedScreenshots: screenshotsData.failure,
        selectorOptions: extractLocatorOptions(diagnosis.failed_run_metadata.stack).join("\n"),
    };
    const compileSettings = {
        imageDetail: "high",
        modelProvider: "openai",
    };
    return (0, llm_1.compilePrompt)(promptTemplate_0, templateVariables, compileSettings);
}
|
|
31
|
+
exports.fixStrictModeViolationPrompt = fixStrictModeViolationPrompt;
|
package/dist/bin/index.js
CHANGED
|
@@ -35,6 +35,7 @@ const run_1 = require("../agent/browsing/run");
|
|
|
35
35
|
const utils_1 = require("../agent/browsing/utils");
|
|
36
36
|
const repo_edit_1 = require("../agent/codegen/repo-edit");
|
|
37
37
|
const run_2 = require("../agent/codegen/run");
|
|
38
|
+
const diagnosis_agent_1 = require("../agent/diagnosis-agent");
|
|
38
39
|
const enrich_prompt_1 = require("../agent/enrich-prompt");
|
|
39
40
|
const infer_agent_1 = require("../agent/infer-agent");
|
|
40
41
|
const run_3 = require("../agent/planner/run");
|
|
@@ -112,6 +113,20 @@ async function runAgent(testGenConfig, span) {
|
|
|
112
113
|
});
|
|
113
114
|
return;
|
|
114
115
|
}
|
|
116
|
+
// TODO: this needs to be moved to an orchestrator which decides what needs to be done first before executing the sub tasks
|
|
117
|
+
if (testGenConfig.testErrorDiagnosis &&
|
|
118
|
+
testGenConfig.testErrorDiagnosis.failingLine &&
|
|
119
|
+
// TODO: fix this hardcoding of user prompt - ideally its an auto fix intent
|
|
120
|
+
testCase.steps[0] == "Can you please fix the test") {
|
|
121
|
+
const { task: updatedTask } = await (0, diagnosis_agent_1.createTaskUsingFailureDiagnosis)({
|
|
122
|
+
options: testGenConfig.options,
|
|
123
|
+
trace,
|
|
124
|
+
diagnosis: testGenConfig.testErrorDiagnosis,
|
|
125
|
+
});
|
|
126
|
+
if (updatedTask) {
|
|
127
|
+
testCase.steps = [updatedTask];
|
|
128
|
+
}
|
|
129
|
+
}
|
|
115
130
|
if (!agent || agent === "auto") {
|
|
116
131
|
agent = await resolveAgentUsingTask({
|
|
117
132
|
testCase,
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@empiricalrun/test-gen",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.43.0",
|
|
4
4
|
"publishConfig": {
|
|
5
5
|
"registry": "https://registry.npmjs.org/",
|
|
6
6
|
"access": "public"
|
|
@@ -72,9 +72,9 @@
|
|
|
72
72
|
"ts-morph": "^23.0.0",
|
|
73
73
|
"tsx": "^4.16.2",
|
|
74
74
|
"typescript": "^5.3.3",
|
|
75
|
-
"@empiricalrun/
|
|
75
|
+
"@empiricalrun/llm": "^0.9.35",
|
|
76
76
|
"@empiricalrun/r2-uploader": "^0.3.8",
|
|
77
|
-
"@empiricalrun/
|
|
77
|
+
"@empiricalrun/reporter": "^0.23.1"
|
|
78
78
|
},
|
|
79
79
|
"devDependencies": {
|
|
80
80
|
"@playwright/test": "1.47.1",
|
|
@@ -89,7 +89,7 @@
|
|
|
89
89
|
"js-levenshtein": "^1.1.6",
|
|
90
90
|
"playwright": "1.47.1",
|
|
91
91
|
"ts-patch": "^3.3.0",
|
|
92
|
-
"@empiricalrun/shared-types": "0.0.
|
|
92
|
+
"@empiricalrun/shared-types": "0.0.4"
|
|
93
93
|
},
|
|
94
94
|
"scripts": {
|
|
95
95
|
"dev": "tspc --build --watch",
|