@empiricalrun/test-gen 0.42.29 → 0.43.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,22 @@
  # @empiricalrun/test-gen

+ ## 0.43.1
+
+ ### Patch Changes
+
+ - 64f275a: feat: add `--token` arg name to cli interface
+
+ ## 0.43.0
+
+ ### Minor Changes
+
+ - 46c0dab: feat: autofix workflow v1
+
+ ### Patch Changes
+
+ - Updated dependencies [46c0dab]
+   - @empiricalrun/llm@0.9.35
+
  ## 0.42.29

  ### Patch Changes
package/README.md CHANGED
@@ -8,13 +8,13 @@ Our agents that generate Playwright tests. There are 2 agents
  ## Usage

  ```sh
- npx @empiricalrun/test-gen TEST_GEN_TOKEN
+ npx @empiricalrun/test-gen --token TEST_GEN_TOKEN
  ```

  ### Add new test

  ```sh
- npx @empiricalrun/test-gen TEST_GEN_TOKEN
+ npx @empiricalrun/test-gen --token TEST_GEN_TOKEN
  ```

  - This will trigger browsing agent to write a new test for this scenario
@@ -23,7 +23,7 @@ npx @empiricalrun/test-gen TEST_GEN_TOKEN
  ### Update existing test

  ```sh
- npx @empiricalrun/test-gen TEST_GEN_TOKEN
+ npx @empiricalrun/test-gen --token TEST_GEN_TOKEN
  ```

  - If the test case is already present in the file, the test gen agent will update the existing test as per the steps provided in the payload
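The `--token` value is not an opaque API key: `loadTestConfigs` in `dist/bin/utils/scenarios/index.js` further down in this diff decodes it with `decodeURIComponent(atob(token))` and parses the result as JSON. A minimal sketch of building such a token, assuming Node's global `btoa`; the `filePath` value is illustrative and not the full `TestGenConfig` shape:

```ts
// Sketch: reverse of the decoding done by loadTestConfigs
// (atob -> decodeURIComponent -> JSON.parse). Field values are illustrative;
// the real shape is TestGenConfig from @empiricalrun/shared-types.
const config = {
  filePath: "login.spec.ts", // loadTestConfigs falls back to "index.spec.ts" when omitted
};

const token = btoa(encodeURIComponent(JSON.stringify(config)));
console.log(`npx @empiricalrun/test-gen --token ${token}`);
```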
package/dist/agent/diagnosis-agent/index.d.ts ADDED
@@ -0,0 +1,18 @@
+ import { TraceClient } from "@empiricalrun/llm";
+ import { TestErrorDiagnosisDetails, TestGenConfigOptions } from "@empiricalrun/shared-types";
+ import { CustomLogger } from "../../bin/logger";
+ /**
+  *
+  * inputs
+  * - task
+  * - diagnosis
+  */
+ export declare function createTaskUsingFailureDiagnosis({ options, trace, diagnosis, logger, }: {
+     options?: TestGenConfigOptions;
+     trace?: TraceClient;
+     diagnosis: TestErrorDiagnosisDetails;
+     logger?: CustomLogger;
+ }): Promise<{
+     task: string;
+ }>;
+ //# sourceMappingURL=index.d.ts.map
package/dist/agent/diagnosis-agent/index.d.ts.map ADDED
@@ -0,0 +1 @@
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/agent/diagnosis-agent/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAyB,WAAW,EAAE,MAAM,mBAAmB,CAAC;AACvE,OAAO,EACL,yBAAyB,EACzB,oBAAoB,EACrB,MAAM,4BAA4B,CAAC;AAEpC,OAAO,EAAE,YAAY,EAAE,MAAM,kBAAkB,CAAC;AAuChD;;;;;GAKG;AACH,wBAAsB,+BAA+B,CAAC,EACpD,OAAO,EACP,KAAK,EACL,SAAS,EACT,MAAM,GACP,EAAE;IACD,OAAO,CAAC,EAAE,oBAAoB,CAAC;IAC/B,KAAK,CAAC,EAAE,WAAW,CAAC;IACpB,SAAS,EAAE,yBAAyB,CAAC;IACrC,MAAM,CAAC,EAAE,YAAY,CAAC;CACvB,GAAG,OAAO,CAAC;IAAE,IAAI,EAAE,MAAM,CAAA;CAAE,CAAC,CA8E5B"}
package/dist/agent/diagnosis-agent/index.js ADDED
@@ -0,0 +1,105 @@
+ "use strict";
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.createTaskUsingFailureDiagnosis = void 0;
+ const llm_1 = require("@empiricalrun/llm");
+ const session_1 = require("../../session");
+ const strict_mode_violation_1 = require("./strict-mode-violation");
+ const session = (0, session_1.getSessionDetails)();
+ const responseFormat = {
+     type: "json_schema",
+     json_schema: {
+         name: "test-case-auto-fix-summary",
+         strict: true,
+         schema: {
+             type: "object",
+             properties: {
+                 observation: {
+                     type: "array",
+                     items: {
+                         type: "string",
+                     },
+                     description: "Detailed observation of what changed between successful and failed test screenshots",
+                 },
+                 action: {
+                     type: "string",
+                     description: "Direct action to fix the test in natural language without code snippets or options",
+                 },
+             },
+             required: ["observation", "action"],
+             additionalProperties: false,
+         },
+     },
+ };
+ /**
+  *
+  * inputs
+  * - task
+  * - diagnosis
+  */
+ async function createTaskUsingFailureDiagnosis({ options, trace, diagnosis, logger, }) {
+     trace =
+         trace ||
+             llm_1.langfuseInstance?.trace({
+                 name: "infer-agent-task",
+                 id: crypto.randomUUID(),
+                 release: session.version,
+             });
+     const failureDiagnosisSpan = trace?.span({
+         name: "auto-fix",
+         input: {
+             diagnosisId: diagnosis.diagnosisId,
+             prjRepoName: options?.metadata.projectRepoName,
+         },
+     });
+     logger?.log("Trying to fix the test using failure diagnosis. Fetching key moments of the diagnosis");
+     const resp = await fetch(diagnosis.keyMomentsUrl);
+     // TODO: check for response to be not ok
+     if (resp.ok) {
+         logger?.success("Successfully fetched key moments of the diagnosis");
+     }
+     else {
+         logger?.warn("Failed to fetch key moments of the diagnosis");
+     }
+     const screenshotsData = await resp.json();
+     const llm = new llm_1.LLM({
+         provider: "openai",
+         defaultModel: "o1",
+         trace,
+     });
+     // TODO: make this dynamic in nature. the prompts should be made receipe
+     // which will help to get rid of if else logic
+     // receipe to have:
+     // 1. selection criteria
+     // 2. job to be done - in this case generate a prompt
+     let prompt;
+     if (diagnosis.failed_run_metadata.stack.includes("strict mode violation")) {
+         prompt = (0, strict_mode_violation_1.fixStrictModeViolationPrompt)({
+             screenshotsData,
+             diagnosis,
+         });
+     }
+     if (prompt) {
+         const llmResponse = await llm.createChatCompletion({
+             messages: prompt,
+             modelParameters: {
+                 max_completion_tokens: 40000,
+             },
+             responseFormat,
+         });
+         const { observation, action } = JSON.parse(llmResponse?.content);
+         failureDiagnosisSpan?.update({
+             output: {
+                 observation,
+                 action,
+             },
+         });
+         return {
+             task: action,
+         };
+     }
+     // TODO: handle default prompt
+     return {
+         task: "",
+     };
+ }
+ exports.createTaskUsingFailureDiagnosis = createTaskUsingFailureDiagnosis;
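A hedged usage sketch of the new diagnosis agent, based only on the declaration above and the fields this file reads from `diagnosis` (`diagnosisId`, `keyMomentsUrl`, `failingLine`, `failed_run_metadata.stack`); the import path and all field values are assumptions for illustration:

```ts
// Assumed deep import path; the package may not expose this entry point publicly.
import { createTaskUsingFailureDiagnosis } from "@empiricalrun/test-gen/dist/agent/diagnosis-agent";

// Illustrative values only; keyMomentsUrl must return { success: string[], failure: string[] }.
const diagnosis = {
  diagnosisId: "diag-123",
  keyMomentsUrl: "https://example.com/key-moments.json",
  failingLine: 'await page.getByText("Audience").click();',
  failed_run_metadata: {
    stack: 'Error: strict mode violation: getByText("Audience") resolved to 2 elements ...',
  },
};

// Cast for the sketch; the real parameter type is TestErrorDiagnosisDetails.
// Returns a natural-language fix ("task"), or "" when no prompt recipe matched.
const { task } = await createTaskUsingFailureDiagnosis({ diagnosis: diagnosis as any });
console.log(task);
```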
package/dist/agent/diagnosis-agent/strict-mode-violation.d.ts ADDED
@@ -0,0 +1,9 @@
+ import { TestErrorDiagnosisDetails } from "@empiricalrun/shared-types";
+ export declare function fixStrictModeViolationPrompt({ screenshotsData, diagnosis, }: {
+     screenshotsData: {
+         success: string[];
+         failure: string[];
+     };
+     diagnosis: TestErrorDiagnosisDetails;
+ }): import("openai/resources/index.mjs").ChatCompletionMessageParam[];
+ //# sourceMappingURL=strict-mode-violation.d.ts.map
package/dist/agent/diagnosis-agent/strict-mode-violation.d.ts.map ADDED
@@ -0,0 +1 @@
+ {"version":3,"file":"strict-mode-violation.d.ts","sourceRoot":"","sources":["../../../src/agent/diagnosis-agent/strict-mode-violation.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,yBAAyB,EAAE,MAAM,4BAA4B,CAAC;AAoBvE,wBAAgB,4BAA4B,CAAC,EAC3C,eAAe,EACf,SAAS,GACV,EAAE;IACD,eAAe,EAAE;QAAE,OAAO,EAAE,MAAM,EAAE,CAAC;QAAC,OAAO,EAAE,MAAM,EAAE,CAAA;KAAE,CAAC;IAC1D,SAAS,EAAE,yBAAyB,CAAC;CACtC,qEAiBA"}
package/dist/agent/diagnosis-agent/strict-mode-violation.js ADDED
@@ -0,0 +1,31 @@
+ "use strict";
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.fixStrictModeViolationPrompt = void 0;
+ const llm_1 = require("@empiricalrun/llm");
+ const promptTemplate_0 = "{{#section \"system\"}}\nAs a software engineer, your task is to identify a fix for a failing Playwright test by analyzing screenshots of both the failed and successful test steps.\n\nKey issues for test failures include:\n- Duplicate elements for the same Playwright selector.\n\nInstructions:\n1. Examine the provided successful and failed test screenshots.\n2. Identify the correct element for action based on these observations on the successful test run screenshots.\n3. Sometimes the exact same locator is not available on failed test run screenshot, you need to identify the intent from successful test screenshots and apply that intent in failed test run screenshot to identify the right locator to interact with\n4. Evaluate the playwright selector options provided to you to execute the action. Pick the selector which best matches the intent of the test.\n5. Propose a precise action that addresses the issue.\n\nExample:\n- observation: \n - Current step failure: await page.getByText(\"Audience\").click()\n - Two similar buttons named \"Audience\" exist in the failed run screenshots\n - The successful test run clicked on \"Untracked Audience\"\n - The failed test run should click on \"Untracked Audience\"\n - Available locators: await page.getByText(\"Untracked Audience\").click() contain the selector for Untracked Audience\n- action: Replace failing line with await page.getByText(\"Untracked Audience\").click()\n\nYour action should:\n- Be directly actionable and free of ambiguity, as it will guide another LLM to generate code.\n- Be in natural language and not just code snippet.\n- Be verified as feasible on the failure screen before responding.\n- Choose from the provided possible actions that can be executed on the failure screen.\n- Action should adhere to the format mentioned in the example, i.e. it should start with \"Replace the failing line\" and the updated code with replaced selector following it.\n\nEnsure the action is executable based on the failure screen context before providing it.\n{{/section}}\n\n{{#section \"user\"}}\nSuccessful test screenshots\n\n{{images successScreenshots}}\n\nFailed test screenshots\n\n{{images failedScreenshots}}\n\nStep where test failed:\n{{failingLine}}\n\nOptions for Playwright selectors to perform actions on a failed test screen:\n{{selectorOptions}}\n\n{{/section}}\n\n";
+ function extractLocatorOptions(errorStack) {
+     // This regex matches a chain of locator API calls following the pattern:
+     // functionName(arguments) optionally chained with .functionName(arguments)
+     const regex = /aka\s+((?:[A-Za-z0-9_]+\([^)]*\)(?:\.[A-Za-z0-9_]+\([^)]*\))*))/g;
+     const options = [];
+     let match;
+     while ((match = regex.exec(errorStack)) !== null) {
+         if (match[1]) {
+             options.push(match[1]);
+         }
+     }
+     return options;
+ }
+ function fixStrictModeViolationPrompt({ screenshotsData, diagnosis, }) {
+     const compiledPrompt = (0, llm_1.compilePrompt)(promptTemplate_0, {
+         failingLine: diagnosis.failingLine,
+         successScreenshots: screenshotsData.success,
+         failedScreenshots: screenshotsData.failure,
+         selectorOptions: extractLocatorOptions(diagnosis.failed_run_metadata.stack).join("\n"),
+     }, {
+         imageDetail: "high",
+         modelProvider: "openai",
+     });
+     return compiledPrompt;
+ }
+ exports.fixStrictModeViolationPrompt = fixStrictModeViolationPrompt;
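For context on what `extractLocatorOptions` pulls out of a Playwright error, here is a self-contained sketch running the same regex on a made-up strict mode violation stack; the error text and locators are illustrative:

```ts
// Fabricated example of a Playwright "strict mode violation" error stack.
const stack = `Error: strict mode violation: getByText('Audience') resolved to 2 elements:
    1) <button>…</button> aka getByRole('button', { name: 'Audience' })
    2) <span>…</span> aka locator('#side-nav').getByText('Untracked Audience')`;

// Same pattern as extractLocatorOptions above: capture call chains that follow "aka".
const regex = /aka\s+((?:[A-Za-z0-9_]+\([^)]*\)(?:\.[A-Za-z0-9_]+\([^)]*\))*))/g;
const options: string[] = [];
let match: RegExpExecArray | null;
while ((match = regex.exec(stack)) !== null) {
  if (match[1]) options.push(match[1]);
}

console.log(options.join("\n"));
// getByRole('button', { name: 'Audience' })
// locator('#side-nav').getByText('Untracked Audience')
```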
package/dist/bin/index.js CHANGED
@@ -35,6 +35,7 @@ const run_1 = require("../agent/browsing/run");
  const utils_1 = require("../agent/browsing/utils");
  const repo_edit_1 = require("../agent/codegen/repo-edit");
  const run_2 = require("../agent/codegen/run");
+ const diagnosis_agent_1 = require("../agent/diagnosis-agent");
  const enrich_prompt_1 = require("../agent/enrich-prompt");
  const infer_agent_1 = require("../agent/infer-agent");
  const run_3 = require("../agent/planner/run");
@@ -112,6 +113,20 @@ async function runAgent(testGenConfig, span) {
  });
  return;
  }
+ // TODO: this needs to be moved to an orchestrator which decides what needs to be done first before executing the sub tasks
+ if (testGenConfig.testErrorDiagnosis &&
+ testGenConfig.testErrorDiagnosis.failingLine &&
+ // TODO: fix this hardcoding of user prompt - ideally its an auto fix intent
+ testCase.steps[0] == "Can you please fix the test") {
+ const { task: updatedTask } = await (0, diagnosis_agent_1.createTaskUsingFailureDiagnosis)({
+ options: testGenConfig.options,
+ trace,
+ diagnosis: testGenConfig.testErrorDiagnosis,
+ });
+ if (updatedTask) {
+ testCase.steps = [updatedTask];
+ }
+ }
  if (!agent || agent === "auto") {
  agent = await resolveAgentUsingTask({
  testCase,
@@ -162,7 +177,7 @@ async function runAgent(testGenConfig, span) {
  // this is where test gen starts executing on giving the command from ci
  const logger = new logger_1.CustomLogger({ useReporter: false });
  if (process.argv.length < 3) {
- logger.error("Please provide path to scenarios using command:", "npx @empiricalrun/test-gen <TEST_GEN_TOKEN>");
+ logger.error("Please provide path to scenarios using command:", "npx @empiricalrun/test-gen --token <TEST_GEN_TOKEN>");
  process.exit(1);
  }
  const { testGenConfig } = await (0, utils_2.parseCliArgs)();
@@ -172,9 +187,9 @@ async function runAgent(testGenConfig, span) {
  generationId: testGenConfig.options?.metadata.generationId,
  });
  (0, session_1.setSessionDetails)({
+ testCaseId: testGenConfig.testCase.id,
  sessionId: testGenConfig.options?.metadata.testSessionId,
  generationId: testGenConfig.options?.metadata.generationId,
- testCaseId: testGenConfig.testCase.id,
  projectRepoName: testGenConfig.options?.metadata.projectRepoName,
  });
  let testGenFailed = false;
package/dist/bin/utils/index.d.ts CHANGED
@@ -1,5 +1,5 @@
  import type { TestGenConfig } from "@empiricalrun/shared-types";
- export declare function parseCliArgs(testGenToken?: string): Promise<{
+ export declare function parseCliArgs(): Promise<{
  testGenConfig: TestGenConfig;
  }>;
  export declare function getTestConfigCliArg(): string;
package/dist/bin/utils/index.d.ts.map CHANGED
@@ -1 +1 @@
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/bin/utils/index.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,4BAA4B,CAAC;AAIhE,wBAAsB,YAAY,CAChC,YAAY,GAAE,MAA8B;;GAM7C;AAED,wBAAgB,mBAAmB,IAAI,MAAM,CAE5C;AAED,eAAO,MAAM,WAAW,oBAA2B,CAAC;AACpD,eAAO,MAAM,OAAO,oBAA6B,CAAC"}
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/bin/utils/index.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,4BAA4B,CAAC;AAKhE,wBAAsB,YAAY;;GAMjC;AAED,wBAAgB,mBAAmB,IAAI,MAAM,CAc5C;AAED,eAAO,MAAM,WAAW,oBAA2B,CAAC;AACpD,eAAO,MAAM,OAAO,oBAA6B,CAAC"}
package/dist/bin/utils/index.js CHANGED
@@ -1,16 +1,29 @@
  "use strict";
  Object.defineProperty(exports, "__esModule", { value: true });
  exports.baggage = exports.sentryTrace = exports.getTestConfigCliArg = exports.parseCliArgs = void 0;
+ const logger_1 = require("../logger");
  const scenarios_1 = require("./scenarios");
- async function parseCliArgs(testGenToken = getTestConfigCliArg()) {
- const testGenConfig = await (0, scenarios_1.loadTestConfigs)(testGenToken);
+ async function parseCliArgs() {
+ let rawToken = getTestConfigCliArg();
+ const testGenConfig = (0, scenarios_1.loadTestConfigs)(rawToken);
  return {
  testGenConfig,
  };
  }
  exports.parseCliArgs = parseCliArgs;
  function getTestConfigCliArg() {
- return process.argv[2];
+ // Check for --token parameter
+ const tokenIndex = process.argv.indexOf("--token");
+ if (tokenIndex !== -1 && process.argv[tokenIndex + 1]) {
+ const token = process.argv[tokenIndex + 1];
+ if (token)
+ return token;
+ }
+ // Fallback to legacy behavior (token as first argument)
+ const legacyToken = process.argv[2];
+ const logger = new logger_1.CustomLogger({ useReporter: false });
+ logger.warn("Using legacy token format. Consider using --token parameter instead: npx @empiricalrun/test-gen --token <TEST_GEN_TOKEN>");
+ return legacyToken;
  }
  exports.getTestConfigCliArg = getTestConfigCliArg;
  exports.sentryTrace = process.env.SENTRY_TRACE;
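To make the fallback above concrete, here is a small standalone sketch showing how the two invocation styles resolve; `pickToken` is a hypothetical helper, not part of the package:

```ts
// Sketch only: mirrors the logic of getTestConfigCliArg above for both invocation styles.
function pickToken(argv: string[]): { token: string | undefined; legacy: boolean } {
  const tokenIndex = argv.indexOf("--token");
  if (tokenIndex !== -1 && argv[tokenIndex + 1]) {
    return { token: argv[tokenIndex + 1], legacy: false };
  }
  // Legacy form: the token is the first positional argument.
  return { token: argv[2], legacy: true };
}

// npx @empiricalrun/test-gen --token <TEST_GEN_TOKEN>
console.log(pickToken(["node", "test-gen", "--token", "abc123"])); // { token: 'abc123', legacy: false }
// npx @empiricalrun/test-gen <TEST_GEN_TOKEN> (logs a deprecation warning in the real CLI)
console.log(pickToken(["node", "test-gen", "abc123"]));            // { token: 'abc123', legacy: true }
```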
package/dist/bin/utils/scenarios/index.d.ts CHANGED
@@ -1,4 +1,3 @@
  import type { TestGenConfig } from "@empiricalrun/shared-types";
- declare function loadTestConfigs(testGenToken: string): Promise<TestGenConfig>;
- export { loadTestConfigs };
+ export declare function loadTestConfigs(testGenToken: string): TestGenConfig;
  //# sourceMappingURL=index.d.ts.map
package/dist/bin/utils/scenarios/index.d.ts.map CHANGED
@@ -1 +1 @@
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../../src/bin/utils/scenarios/index.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAIV,aAAa,EAEd,MAAM,4BAA4B,CAAC;AAepC,iBAAe,eAAe,CAAC,YAAY,EAAE,MAAM,GAAG,OAAO,CAAC,aAAa,CAAC,CAmB3E;AAED,OAAO,EAAE,eAAe,EAAE,CAAC"}
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../../src/bin/utils/scenarios/index.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAIV,aAAa,EAEd,MAAM,4BAA4B,CAAC;AAepC,wBAAgB,eAAe,CAAC,YAAY,EAAE,MAAM,GAAG,aAAa,CAmBnE"}
package/dist/bin/utils/scenarios/index.js CHANGED
@@ -1,7 +1,7 @@
  "use strict";
  Object.defineProperty(exports, "__esModule", { value: true });
  exports.loadTestConfigs = void 0;
- async function loadTestConfigs(testGenToken) {
+ function loadTestConfigs(testGenToken) {
  const str = decodeURIComponent(atob(testGenToken));
  const config = JSON.parse(str);
  const specPath = `./tests/${config.filePath || "index.spec.ts"}`;
package/dist/index.js CHANGED
@@ -31,7 +31,7 @@ require("./initSentry");
  const llm_1 = require("@empiricalrun/llm");
  const Sentry = __importStar(require("@sentry/node"));
  const run_1 = require("./agent/master/run");
- const utils_1 = require("./bin/utils");
+ const scenarios_1 = require("./bin/utils/scenarios");
  const client_1 = __importDefault(require("./file/client"));
  const reporter_1 = require("./reporter");
  const session_1 = require("./session");
@@ -47,7 +47,7 @@ process.on("SIGTERM", async () => await flushEvents());
  async function createTest(task, page, scope) {
  const port = process.env.APP_PORT || 3030;
  const testConfigArg = process.env.TEST_GEN_TOKEN;
- const { testGenConfig } = await (0, utils_1.parseCliArgs)(testConfigArg);
+ const testGenConfig = (0, scenarios_1.loadTestConfigs)(testConfigArg);
  (0, reporter_1.setReporterConfig)({
  projectRepoName: testGenConfig.options?.metadata.projectRepoName,
  testSessionId: testGenConfig.options?.metadata.testSessionId,
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@empiricalrun/test-gen",
- "version": "0.42.29",
+ "version": "0.43.1",
  "publishConfig": {
  "registry": "https://registry.npmjs.org/",
  "access": "public"
@@ -72,9 +72,9 @@
  "ts-morph": "^23.0.0",
  "tsx": "^4.16.2",
  "typescript": "^5.3.3",
- "@empiricalrun/reporter": "^0.23.1",
+ "@empiricalrun/llm": "^0.9.35",
  "@empiricalrun/r2-uploader": "^0.3.8",
- "@empiricalrun/llm": "^0.9.34"
+ "@empiricalrun/reporter": "^0.23.1"
  },
  "devDependencies": {
  "@playwright/test": "1.47.1",
@@ -89,7 +89,7 @@
  "js-levenshtein": "^1.1.6",
  "playwright": "1.47.1",
  "ts-patch": "^3.3.0",
- "@empiricalrun/shared-types": "0.0.3"
+ "@empiricalrun/shared-types": "0.0.4"
  },
  "scripts": {
  "dev": "tspc --build --watch",