@empiricalrun/test-gen 0.34.4 → 0.35.0

This diff compares publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (47)
  1. package/CHANGELOG.md +22 -0
  2. package/dist/agent/browsing/index.d.ts +1 -1
  3. package/dist/agent/browsing/index.d.ts.map +1 -1
  4. package/dist/agent/browsing/index.js +11 -12
  5. package/dist/agent/codegen/create-test-block.js +1 -1
  6. package/dist/agent/codegen/run.d.ts +1 -1
  7. package/dist/agent/codegen/run.d.ts.map +1 -1
  8. package/dist/agent/codegen/run.js +12 -10
  9. package/dist/agent/codegen/skills-retriever.d.ts +11 -0
  10. package/dist/agent/codegen/skills-retriever.d.ts.map +1 -1
  11. package/dist/agent/codegen/skills-retriever.js +27 -9
  12. package/dist/agent/codegen/update-flow.d.ts.map +1 -1
  13. package/dist/agent/codegen/update-flow.js +21 -17
  14. package/dist/agent/infer-agent/index.d.ts +0 -1
  15. package/dist/agent/infer-agent/index.d.ts.map +1 -1
  16. package/dist/agent/infer-agent/index.js +4 -5
  17. package/dist/agent/master/run.d.ts +4 -4
  18. package/dist/agent/master/run.d.ts.map +1 -1
  19. package/dist/agent/master/run.js +48 -21
  20. package/dist/agent/master/with-hints.d.ts +1 -1
  21. package/dist/agent/master/with-hints.d.ts.map +1 -1
  22. package/dist/agent/master/with-hints.js +2 -2
  23. package/dist/agent/verification/index.d.ts +2 -3
  24. package/dist/agent/verification/index.d.ts.map +1 -1
  25. package/dist/agent/verification/index.js +34 -9
  26. package/dist/bin/index.js +8 -6
  27. package/dist/evals/fetch-pom-skills-agent.evals.d.ts +4 -0
  28. package/dist/evals/fetch-pom-skills-agent.evals.d.ts.map +1 -0
  29. package/dist/evals/fetch-pom-skills-agent.evals.js +36 -0
  30. package/dist/evals/infer-master-or-code-agent.evals.d.ts +4 -0
  31. package/dist/evals/infer-master-or-code-agent.evals.d.ts.map +1 -0
  32. package/dist/evals/infer-master-or-code-agent.evals.js +22 -0
  33. package/dist/evals/master-agent.evals.d.ts +4 -0
  34. package/dist/evals/master-agent.evals.d.ts.map +1 -0
  35. package/dist/evals/master-agent.evals.js +36 -0
  36. package/dist/evals/type.d.ts +12 -0
  37. package/dist/evals/type.d.ts.map +1 -0
  38. package/dist/evals/type.js +2 -0
  39. package/dist/evals/verification-agent.evals.d.ts +4 -0
  40. package/dist/evals/verification-agent.evals.d.ts.map +1 -0
  41. package/dist/evals/verification-agent.evals.js +23 -0
  42. package/dist/session/index.d.ts.map +1 -1
  43. package/dist/session/index.js +8 -1
  44. package/package.json +2 -2
  45. package/dist/evals/infer-master-code.d.ts +0 -2
  46. package/dist/evals/infer-master-code.d.ts.map +0 -1
  47. package/dist/evals/infer-master-code.js +0 -18
package/CHANGELOG.md CHANGED
@@ -1,5 +1,27 @@
1
1
  # @empiricalrun/test-gen
2
2
 
3
+ ## 0.35.0
4
+
5
+ ### Minor Changes
6
+
7
+ - 069347f: feat: add support for master agent evals
8
+ - 11e4cbd: feat: add fetch skills agent evals
9
+
10
+ ### Patch Changes
11
+
12
+ - 297508d: fix: langfuse key errors
13
+ - Updated dependencies [069347f]
14
+ - Updated dependencies [297508d]
15
+ - @empiricalrun/llm@0.9.21
16
+
17
+ ## 0.34.5
18
+
19
+ ### Patch Changes
20
+
21
+ - dc5718a: feat: add support for evals
22
+ - Updated dependencies [06cf0d8]
23
+ - @empiricalrun/llm@0.9.20
24
+
3
25
  ## 0.34.4
4
26
 
5
27
  ### Patch Changes
@@ -10,7 +10,7 @@ export type BrowsingAgentOptions = Partial<TestGenConfigOptions> & {
10
10
  };
11
11
  export declare function executeTaskUsingBrowsingAgent({ trace, action, logger, page, options, llm, actions, }: {
12
12
  action: string;
13
- trace: TraceClient;
13
+ trace?: TraceClient;
14
14
  logger: CustomLogger;
15
15
  page: Page;
16
16
  options: BrowsingAgentOptions;
@@ -1 +1 @@
1
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/agent/browsing/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,GAAG,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AACrD,OAAO,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAElC,OAAO,EAAE,iBAAiB,EAAE,MAAM,eAAe,CAAC;AAClD,OAAO,EAAE,YAAY,EAAE,MAAM,kBAAkB,CAAC;AAIhD,OAAO,EAAE,oBAAoB,EAAE,MAAM,aAAa,CAAC;AAMnD,MAAM,MAAM,oBAAoB,GAAG,OAAO,CAAC,oBAAoB,CAAC,GAAG;IACjE,YAAY,CAAC,EAAE;QACb,iBAAiB,CAAC,EAAE,MAAM,EAAE,CAAC;KAC9B,CAAC;CACH,CAAC;AAEF,wBAAsB,6BAA6B,CAAC,EAClD,KAAK,EACL,MAAM,EACN,MAAM,EACN,IAAI,EACJ,OAAO,EACP,GAAG,EACH,OAAO,GACR,EAAE;IACD,MAAM,EAAE,MAAM,CAAC;IACf,KAAK,EAAE,WAAW,CAAC;IACnB,MAAM,EAAE,YAAY,CAAC;IACrB,IAAI,EAAE,IAAI,CAAC;IACX,OAAO,EAAE,oBAAoB,CAAC;IAC9B,GAAG,EAAE,GAAG,CAAC;IACT,OAAO,EAAE,iBAAiB,CAAC;CAC5B,iBAyIA"}
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/agent/browsing/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,GAAG,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AACrD,OAAO,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAElC,OAAO,EAAE,iBAAiB,EAAE,MAAM,eAAe,CAAC;AAClD,OAAO,EAAE,YAAY,EAAE,MAAM,kBAAkB,CAAC;AAIhD,OAAO,EAAE,oBAAoB,EAAE,MAAM,aAAa,CAAC;AAMnD,MAAM,MAAM,oBAAoB,GAAG,OAAO,CAAC,oBAAoB,CAAC,GAAG;IACjE,YAAY,CAAC,EAAE;QACb,iBAAiB,CAAC,EAAE,MAAM,EAAE,CAAC;KAC9B,CAAC;CACH,CAAC;AAEF,wBAAsB,6BAA6B,CAAC,EAClD,KAAK,EACL,MAAM,EACN,MAAM,EACN,IAAI,EACJ,OAAO,EACP,GAAG,EACH,OAAO,GACR,EAAE;IACD,MAAM,EAAE,MAAM,CAAC;IACf,KAAK,CAAC,EAAE,WAAW,CAAC;IACpB,MAAM,EAAE,YAAY,CAAC;IACrB,IAAI,EAAE,IAAI,CAAC;IACX,OAAO,EAAE,oBAAoB,CAAC;IAC9B,GAAG,EAAE,GAAG,CAAC;IACT,OAAO,EAAE,iBAAiB,CAAC;CAC5B,iBAwIA"}
@@ -15,30 +15,29 @@ async function executeTaskUsingBrowsingAgent({ trace, action, logger, page, opti
15
15
  const tools = actions.getBrowsingActionSchemas();
16
16
  const testgenUpdatesReporter = new reporter_1.TestGenUpdatesReporter();
17
17
  while (!isTaskDone) {
18
- const browsingAgentSpan = trace.span({
18
+ const browsingAgentSpan = trace?.span({
19
19
  name: `browsing-agent`,
20
20
  });
21
21
  if (await (0, session_1.shouldStopSession)()) {
22
22
  break;
23
23
  }
24
- const pageContentSpan = browsingAgentSpan.span({
24
+ const pageContentSpan = browsingAgentSpan?.span({
25
25
  name: "page-content",
26
26
  });
27
27
  const pageContent = await page.content();
28
- pageContentSpan.end({ output: { pageContent } });
29
- const sanitizationSpan = browsingAgentSpan.span({
28
+ pageContentSpan?.end({ output: { pageContent } });
29
+ const sanitizationSpan = browsingAgentSpan?.span({
30
30
  name: "page-sanitization",
31
31
  });
32
32
  const pageSnapshot = (0, html_1.sanitizeHtml)(pageContent, options.htmlSanitize);
33
- sanitizationSpan.end({ output: { pageSnapshot } });
34
- const promptSpan = browsingAgentSpan.span({ name: "page-prompt" });
33
+ sanitizationSpan?.end({ output: { pageSnapshot } });
34
+ const promptSpan = browsingAgentSpan?.span({ name: "page-prompt" });
35
35
  // extract all successful actions
36
36
  const successfulActions = executedActions
37
37
  .filter((a) => !a.isError)
38
38
  .map((a) => a.action);
39
39
  if (successfulActions.length > 0) {
40
40
  const verificationAgentResp = await (0, verification_1.verificationAgent)({
41
- llm,
42
41
  trace: browsingAgentSpan,
43
42
  task: action,
44
43
  conversation: ["Successfully executed actions", ...successfulActions],
@@ -47,8 +46,8 @@ async function executeTaskUsingBrowsingAgent({ trace, action, logger, page, opti
47
46
  logger.log(`isTaskDone: ${isTaskDone}`);
48
47
  logger.log(`reason: ${verificationAgentResp.reason}`);
49
48
  if (isTaskDone) {
50
- browsingAgentSpan.event({ name: "task-done" });
51
- browsingAgentSpan.end({
49
+ browsingAgentSpan?.event({ name: "task-done" });
50
+ browsingAgentSpan?.end({
52
51
  output: { taskDone: true, reason: verificationAgentResp.reason },
53
52
  });
54
53
  break;
@@ -61,7 +60,7 @@ async function executeTaskUsingBrowsingAgent({ trace, action, logger, page, opti
61
60
  lastActionErrors: lastActionExecTrace ? [lastActionExecTrace] : [],
62
61
  promptType: "browsing-agent-as-tool",
63
62
  });
64
- promptSpan.end({ output: { messages } });
63
+ promptSpan?.end({ output: { messages } });
65
64
  let completion;
66
65
  completion = await (0, o1_completion_1.getO1Completion)({
67
66
  //@ts-ignore
@@ -92,7 +91,7 @@ async function executeTaskUsingBrowsingAgent({ trace, action, logger, page, opti
92
91
  action: "",
93
92
  });
94
93
  }
95
- const toolCallsSpan = browsingAgentSpan.span({ name: "tool-calls" });
94
+ const toolCallsSpan = browsingAgentSpan?.span({ name: "tool-calls" });
96
95
  for (const i in toolCalls) {
97
96
  const toolCall = toolCalls[i];
98
97
  if (await (0, session_1.shouldStopSession)()) {
@@ -117,7 +116,7 @@ async function executeTaskUsingBrowsingAgent({ trace, action, logger, page, opti
117
116
  logger.error(lastActionExecTrace, e);
118
117
  }
119
118
  }
120
- toolCallsSpan.end({ output: { toolCalls } });
119
+ toolCallsSpan?.end({ output: { toolCalls } });
121
120
  // mark task as done if llm is stuck in loop
122
121
  if (executedActions.length >= 3) {
123
122
  const lastThreeActions = executedActions.slice(-3);
@@ -15,7 +15,7 @@ async function createEmptyTestCaseBlock({ testCase, file, options, trace, }) {
15
15
  const session = (0, session_1.getSessionDetails)();
16
16
  trace =
17
17
  trace ||
18
- llm_1.langfuseInstance.trace({
18
+ llm_1.langfuseInstance?.trace({
19
19
  name: "create-empty-test-block",
20
20
  id: crypto.randomUUID(),
21
21
  release: session.version,
@@ -1,4 +1,4 @@
1
1
  import { TraceClient } from "@empiricalrun/llm";
2
2
  import { TestCase, TestGenConfigOptions } from "../../types";
3
- export declare function generateTest(testCase: TestCase, file: string, options: TestGenConfigOptions, trace: TraceClient): Promise<TestCase[]>;
3
+ export declare function generateTest(testCase: TestCase, file: string, options: TestGenConfigOptions, trace?: TraceClient): Promise<TestCase[]>;
4
4
  //# sourceMappingURL=run.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"run.d.ts","sourceRoot":"","sources":["../../../src/agent/codegen/run.ts"],"names":[],"mappings":"AAAA,OAAO,EAAkC,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAkBhF,OAAO,EAAE,QAAQ,EAAE,oBAAoB,EAAE,MAAM,aAAa,CAAC;AAI7D,wBAAsB,YAAY,CAChC,QAAQ,EAAE,QAAQ,EAClB,IAAI,EAAE,MAAM,EACZ,OAAO,EAAE,oBAAoB,EAC7B,KAAK,EAAE,WAAW,GACjB,OAAO,CAAC,QAAQ,EAAE,CAAC,CAwGrB"}
1
+ {"version":3,"file":"run.d.ts","sourceRoot":"","sources":["../../../src/agent/codegen/run.ts"],"names":[],"mappings":"AAAA,OAAO,EAAkC,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAkBhF,OAAO,EAAE,QAAQ,EAAE,oBAAoB,EAAE,MAAM,aAAa,CAAC;AAI7D,wBAAsB,YAAY,CAChC,QAAQ,EAAE,QAAQ,EAClB,IAAI,EAAE,MAAM,EACZ,OAAO,EAAE,oBAAoB,EAC7B,KAAK,CAAC,EAAE,WAAW,GAClB,OAAO,CAAC,QAAQ,EAAE,CAAC,CA0GrB"}
@@ -31,7 +31,7 @@ async function generateTest(testCase, file, options, trace) {
31
31
  }
32
32
  const generatedTestCases = [];
33
33
  logger.logEmptyLine();
34
- const createTestSpan = trace.span({
34
+ const createTestSpan = trace?.span({
35
35
  name: "create-test",
36
36
  input: {
37
37
  testCase,
@@ -39,7 +39,7 @@ async function generateTest(testCase, file, options, trace) {
39
39
  options,
40
40
  },
41
41
  });
42
- createTestSpan.event({
42
+ createTestSpan?.event({
43
43
  name: "collate-files-as-text",
44
44
  output: {
45
45
  codePrompt,
@@ -47,7 +47,7 @@ async function generateTest(testCase, file, options, trace) {
47
47
  testFileContent,
48
48
  },
49
49
  });
50
- const promptSpan = createTestSpan.span({
50
+ const promptSpan = createTestSpan?.span({
51
51
  name: "add-scenario-prompt",
52
52
  });
53
53
  const instruction = await (0, llm_1.getPrompt)("add-scenario", {
@@ -57,7 +57,7 @@ async function generateTest(testCase, file, options, trace) {
57
57
  scenarioSteps: testCase.steps.join("\n"),
58
58
  scenarioFile: file,
59
59
  });
60
- promptSpan.end({ output: { instruction } });
60
+ promptSpan?.end({ output: { instruction } });
61
61
  const llm = new llm_1.LLM({
62
62
  trace,
63
63
  provider: options.modelProvider || constants_1.DEFAULT_MODEL_PROVIDER,
@@ -73,7 +73,7 @@ async function generateTest(testCase, file, options, trace) {
73
73
  });
74
74
  let response = firstShotMessage?.content || "";
75
75
  logger.success("Test generated successfully!");
76
- const readWriteFileSpan = trace.span({ name: "write-to-file" });
76
+ const readWriteFileSpan = trace?.span({ name: "write-to-file" });
77
77
  let contents = fs_extra_1.default.readFileSync(file, "utf-8");
78
78
  const [prependContent, strippedContent] = await (0, web_1.stripAndPrependImports)(response, testCase?.name);
79
79
  let updatedContent = prependContent +
@@ -83,9 +83,9 @@ async function generateTest(testCase, file, options, trace) {
83
83
  codeSnippet: `\n\n${strippedContent}`,
84
84
  });
85
85
  await fs_extra_1.default.writeFile(file, updatedContent, "utf-8");
86
- readWriteFileSpan.end({ output: { updatedContent } });
86
+ readWriteFileSpan?.end({ output: { updatedContent } });
87
87
  logger.log("Linting generated code...");
88
- createTestSpan.event({ name: "lint-file" });
88
+ createTestSpan?.event({ name: "lint-file" });
89
89
  await (0, web_1.lintErrors)(file);
90
90
  await (0, fix_ts_errors_1.validateAndFixTypescriptErrors)({
91
91
  trace,
@@ -96,12 +96,14 @@ async function generateTest(testCase, file, options, trace) {
96
96
  testCase: testCase,
97
97
  options,
98
98
  });
99
- createTestSpan.event({ name: "format-file" });
99
+ createTestSpan?.event({ name: "format-file" });
100
100
  await (0, web_1.formatCode)(file);
101
101
  logger.success("File formatted successfully!");
102
- logger.log(`Successfully generated code for the given task. \n View [trace](${trace.getTraceUrl()})`);
102
+ if (trace) {
103
+ logger.log(`Successfully generated code for the given task. \n View [trace](${trace.getTraceUrl()})`);
104
+ }
103
105
  generatedTestCases.push(testCase);
104
- createTestSpan.end({ output: { response } });
106
+ createTestSpan?.end({ output: { response } });
105
107
  await (0, llm_1.flushAllTraces)();
106
108
  return generatedTestCases;
107
109
  }
@@ -1,5 +1,16 @@
1
1
  import { TraceClient } from "@empiricalrun/llm";
2
2
  import { TestCase, TestGenConfigOptions } from "../../types";
3
+ export declare const fetchPomSkills: ({ testCase, pomFiles, options, trace, }: {
4
+ testCase: TestCase;
5
+ pomFiles?: string | undefined;
6
+ trace?: TraceClient | undefined;
7
+ options?: TestGenConfigOptions | undefined;
8
+ }) => Promise<{
9
+ testStep: string;
10
+ filePath: string;
11
+ usageExample: string;
12
+ reason: string;
13
+ }[]>;
3
14
  export declare function getAppropriateSkills({ testCase, options, trace, }: {
4
15
  testCase: TestCase;
5
16
  options?: TestGenConfigOptions;
@@ -1 +1 @@
1
- {"version":3,"file":"skills-retriever.d.ts","sourceRoot":"","sources":["../../../src/agent/codegen/skills-retriever.ts"],"names":[],"mappings":"AAAA,OAAO,EAAkB,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAYhE,OAAO,EAAE,QAAQ,EAAE,oBAAoB,EAAE,MAAM,aAAa,CAAC;AAG7D,wBAAsB,oBAAoB,CAAC,EACzC,QAAQ,EACR,OAAO,EACP,KAAK,GACN,EAAE;IACD,QAAQ,EAAE,QAAQ,CAAC;IACnB,OAAO,CAAC,EAAE,oBAAoB,CAAC;IAC/B,KAAK,CAAC,EAAE,WAAW,CAAC;CACrB;;;;;KAmDA"}
1
+ {"version":3,"file":"skills-retriever.d.ts","sourceRoot":"","sources":["../../../src/agent/codegen/skills-retriever.ts"],"names":[],"mappings":"AAAA,OAAO,EAAkB,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAYhE,OAAO,EAAE,QAAQ,EAAE,oBAAoB,EAAE,MAAM,aAAa,CAAC;AAG7D,eAAO,MAAM,cAAc;cAMf,QAAQ;;;;;;;;;IA0CnB,CAAC;AAEF,wBAAsB,oBAAoB,CAAC,EACzC,QAAQ,EACR,OAAO,EACP,KAAK,GACN,EAAE;IACD,QAAQ,EAAE,QAAQ,CAAC;IACnB,OAAO,CAAC,EAAE,oBAAoB,CAAC;IAC/B,KAAK,CAAC,EAAE,WAAW,CAAC;CACrB;;;;;KA6BA"}
@@ -3,7 +3,7 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
3
3
  return (mod && mod.__esModule) ? mod : { "default": mod };
4
4
  };
5
5
  Object.defineProperty(exports, "__esModule", { value: true });
6
- exports.getAppropriateSkills = void 0;
6
+ exports.getAppropriateSkills = exports.fetchPomSkills = void 0;
7
7
  const llm_1 = require("@empiricalrun/llm");
8
8
  const fs_1 = __importDefault(require("fs"));
9
9
  const logger_1 = require("../../bin/logger");
@@ -11,18 +11,15 @@ const context_1 = require("../../bin/utils/context");
11
11
  const fs_2 = require("../../bin/utils/fs");
12
12
  const constants_1 = require("../../constants");
13
13
  const utils_1 = require("./utils");
14
- async function getAppropriateSkills({ testCase, options, trace, }) {
15
- const logger = new logger_1.CustomLogger({ useReporter: false });
16
- logger.log("getting skill set for the repository");
17
- const filter = await (0, context_1.createGitIgnoreFileFilter)();
18
- const pomFiles = await (0, fs_2.generatePromptFromDirectory)("./pages", filter);
19
- const fetchSkillsSpan = trace?.span({
14
+ const fetchPomSkills = async ({ testCase, pomFiles, options, trace, }) => {
15
+ const fetchSkillsUsingPOMFilesSpan = trace?.span({
20
16
  name: "fetch-pom-skills",
21
17
  input: {
18
+ pomFiles,
22
19
  testCase,
23
20
  },
24
21
  });
25
- const promptSpan = fetchSkillsSpan?.span({
22
+ const promptSpan = fetchSkillsUsingPOMFilesSpan?.span({
26
23
  name: "fetch-pom-skills-prompt",
27
24
  });
28
25
  const prompt = await (0, llm_1.getPrompt)("fetch-skills-prompt", {
@@ -32,7 +29,7 @@ async function getAppropriateSkills({ testCase, options, trace, }) {
32
29
  });
33
30
  promptSpan?.end({ output: { prompt } });
34
31
  const llm = new llm_1.LLM({
35
- trace: fetchSkillsSpan,
32
+ trace: fetchSkillsUsingPOMFilesSpan,
36
33
  provider: options?.modelProvider || constants_1.DEFAULT_MODEL_PROVIDER,
37
34
  defaultModel: options?.model || constants_1.DEFAULT_MODEL,
38
35
  providerApiKey: constants_1.MODEL_API_KEYS[options?.modelProvider || constants_1.DEFAULT_MODEL_PROVIDER],
@@ -47,6 +44,27 @@ async function getAppropriateSkills({ testCase, options, trace, }) {
47
44
  });
48
45
  let response = firstShotMessage?.content || "";
49
46
  const skills = (0, utils_1.extractTestStepsSuggestions)(response);
47
+ fetchSkillsUsingPOMFilesSpan?.end({ output: { skills } });
48
+ return skills;
49
+ };
50
+ exports.fetchPomSkills = fetchPomSkills;
51
+ async function getAppropriateSkills({ testCase, options, trace, }) {
52
+ const logger = new logger_1.CustomLogger({ useReporter: false });
53
+ logger.log("getting skill set for the repository");
54
+ const filter = await (0, context_1.createGitIgnoreFileFilter)();
55
+ const pomFiles = await (0, fs_2.generatePromptFromDirectory)("./pages", filter);
56
+ const fetchSkillsSpan = trace?.span({
57
+ name: "get-appropriate-skills",
58
+ input: {
59
+ testCase,
60
+ },
61
+ });
62
+ const skills = await (0, exports.fetchPomSkills)({
63
+ testCase,
64
+ pomFiles,
65
+ trace: fetchSkillsSpan,
66
+ options,
67
+ });
50
68
  const validateSkillsSpan = fetchSkillsSpan?.span({
51
69
  name: "validate-skills",
52
70
  input: {
@@ -1 +1 @@
1
- {"version":3,"file":"update-flow.d.ts","sourceRoot":"","sources":["../../../src/agent/codegen/update-flow.ts"],"names":[],"mappings":"AAAA,OAAO,EAKL,WAAW,EACZ,MAAM,mBAAmB,CAAC;AAsB3B,OAAO,EAAE,QAAQ,EAAE,oBAAoB,EAAE,MAAM,aAAa,CAAC;AAI7D,KAAK,eAAe,GAAG,QAAQ,GAAG;IAChC,YAAY,EAAE,MAAM,EAAE,CAAC;CACxB,CAAC;AAqIF,wBAAsB,UAAU,CAC9B,QAAQ,EAAE,QAAQ,EAClB,IAAI,EAAE,MAAM,EACZ,OAAO,EAAE,oBAAoB,GAAG,SAAS,EACzC,OAAO,GAAE,OAAc,EACvB,QAAQ,GAAE,OAAc,EACxB,KAAK,CAAC,EAAE,WAAW,GAClB,OAAO,CAAC,eAAe,EAAE,CAAC,CAoG5B;AAED,wBAAsB,qBAAqB,CAAC,EAC1C,QAAQ,EACR,IAAI,EACJ,OAAO,EACP,KAAK,EACL,aAAoB,GACrB,EAAE;IACD,QAAQ,EAAE,QAAQ,CAAC;IACnB,IAAI,EAAE,MAAM,CAAC;IACb,OAAO,CAAC,EAAE,oBAAoB,CAAC;IAC/B,KAAK,CAAC,EAAE,WAAW,CAAC;IACpB,aAAa,CAAC,EAAE,OAAO,CAAC;CACzB,GAAG,OAAO,CAAC,eAAe,EAAE,CAAC,CA6E7B"}
1
+ {"version":3,"file":"update-flow.d.ts","sourceRoot":"","sources":["../../../src/agent/codegen/update-flow.ts"],"names":[],"mappings":"AAAA,OAAO,EAKL,WAAW,EACZ,MAAM,mBAAmB,CAAC;AAsB3B,OAAO,EAAE,QAAQ,EAAE,oBAAoB,EAAE,MAAM,aAAa,CAAC;AAI7D,KAAK,eAAe,GAAG,QAAQ,GAAG;IAChC,YAAY,EAAE,MAAM,EAAE,CAAC;CACxB,CAAC;AAqIF,wBAAsB,UAAU,CAC9B,QAAQ,EAAE,QAAQ,EAClB,IAAI,EAAE,MAAM,EACZ,OAAO,EAAE,oBAAoB,GAAG,SAAS,EACzC,OAAO,GAAE,OAAc,EACvB,QAAQ,GAAE,OAAc,EACxB,KAAK,CAAC,EAAE,WAAW,GAClB,OAAO,CAAC,eAAe,EAAE,CAAC,CAsG5B;AAED,wBAAsB,qBAAqB,CAAC,EAC1C,QAAQ,EACR,IAAI,EACJ,OAAO,EACP,KAAK,EACL,aAAoB,GACrB,EAAE;IACD,QAAQ,EAAE,QAAQ,CAAC;IACnB,IAAI,EAAE,MAAM,CAAC;IACb,OAAO,CAAC,EAAE,oBAAoB,CAAC;IAC/B,KAAK,CAAC,EAAE,WAAW,CAAC;IACpB,aAAa,CAAC,EAAE,OAAO,CAAC;CACzB,GAAG,OAAO,CAAC,eAAe,EAAE,CAAC,CA+E7B"}
@@ -29,7 +29,7 @@ async function applyFileChanges({ validateTypes = true, trace, testCase, fileCha
29
29
  if (testBlockUpdate) {
30
30
  // assuming the test case getting updated
31
31
  // maintaining the previous accuracy of the test case update
32
- const readWriteFileSpan = trace.span({ name: "write-to-file" });
32
+ const readWriteFileSpan = trace?.span({ name: "write-to-file" });
33
33
  let contents = await fs_extra_1.default.readFile(fileChange.filePath, "utf-8");
34
34
  const [prependContent, strippedContent] = await (0, web_1.stripAndPrependImports)(fileChange.newCode, testCase?.name);
35
35
  let updatedContent = prependContent + contents + `\n\n${strippedContent}`;
@@ -41,10 +41,10 @@ async function applyFileChanges({ validateTypes = true, trace, testCase, fileCha
41
41
  contents = contents.replace(testBlock, `\n\n${strippedContent}`);
42
42
  updatedContent = prependContent + contents;
43
43
  await fs_extra_1.default.writeFile(fileChange.filePath, updatedContent, "utf-8");
44
- readWriteFileSpan.end({ output: { updatedContent } });
44
+ readWriteFileSpan?.end({ output: { updatedContent } });
45
45
  }
46
46
  else {
47
- const readWriteFileSpan = trace.span({ name: "write-to-file" });
47
+ const readWriteFileSpan = trace?.span({ name: "write-to-file" });
48
48
  let contents = await fs_extra_1.default.readFile(fileChange.filePath, "utf-8");
49
49
  const project = new ts_morph_1.Project();
50
50
  const sourceFile = project.createSourceFile("updated-code.ts", fileChange.newCode);
@@ -84,7 +84,7 @@ async function applyFileChanges({ validateTypes = true, trace, testCase, fileCha
84
84
  contents = contents.replace(fileChange.oldCode, `\n\n${fileChange.newCode}`);
85
85
  }
86
86
  await fs_extra_1.default.writeFile(fileChange.filePath, contents, "utf-8");
87
- readWriteFileSpan.end({ output: { contents } });
87
+ readWriteFileSpan?.end({ output: { contents } });
88
88
  }
89
89
  // format and validate file change
90
90
  if (validateTypes) {
@@ -98,7 +98,7 @@ async function applyFileChanges({ validateTypes = true, trace, testCase, fileCha
98
98
  options: testGenOptions,
99
99
  });
100
100
  }
101
- trace.event({ name: "format-file" });
101
+ trace?.event({ name: "format-file" });
102
102
  await (0, web_1.formatCode)(fileChange.filePath);
103
103
  logger.success(`${fileChange.filePath} file formatted successfully!`);
104
104
  }));
@@ -112,7 +112,7 @@ async function updateTest(testCase, file, options, logging = true, validate = tr
112
112
  const session = (0, session_1.getSessionDetails)();
113
113
  trace =
114
114
  trace ||
115
- llm_1.langfuseInstance.trace({
115
+ llm_1.langfuseInstance?.trace({
116
116
  name: "update-test",
117
117
  id: crypto_1.default.randomUUID(),
118
118
  release: session.version,
@@ -121,7 +121,7 @@ async function updateTest(testCase, file, options, logging = true, validate = tr
121
121
  options?.metadata.environment || "",
122
122
  ].filter((s) => !!s),
123
123
  });
124
- const updateTestSpan = trace.span({
124
+ const updateTestSpan = trace?.span({
125
125
  name: "update-test",
126
126
  input: {
127
127
  testCase,
@@ -129,7 +129,7 @@ async function updateTest(testCase, file, options, logging = true, validate = tr
129
129
  options,
130
130
  },
131
131
  });
132
- updateTestSpan.event({
132
+ updateTestSpan?.event({
133
133
  name: "collate-files-as-text",
134
134
  output: {
135
135
  codePrompt,
@@ -137,7 +137,7 @@ async function updateTest(testCase, file, options, logging = true, validate = tr
137
137
  testFileContent,
138
138
  },
139
139
  });
140
- const promptSpan = updateTestSpan.span({
140
+ const promptSpan = updateTestSpan?.span({
141
141
  name: "update-scenario-prompt",
142
142
  });
143
143
  const promptName = "update-scenario";
@@ -161,7 +161,7 @@ async function updateTest(testCase, file, options, logging = true, validate = tr
161
161
  scenarioFile: file,
162
162
  currentScenarioCodeBlock,
163
163
  });
164
- promptSpan.end({ output: { instruction } });
164
+ promptSpan?.end({ output: { instruction } });
165
165
  const llm = new llm_1.LLM({
166
166
  trace: updateTestSpan,
167
167
  provider: options?.modelProvider || constants_1.DEFAULT_MODEL_PROVIDER,
@@ -188,12 +188,14 @@ async function updateTest(testCase, file, options, logging = true, validate = tr
188
188
  pomPrompt: pomPrompt,
189
189
  codePrompt: codePrompt,
190
190
  });
191
- logger.log(`Trace: ${trace.getTraceUrl()}`);
191
+ if (trace) {
192
+ logger.log(`Trace: ${trace?.getTraceUrl()}`);
193
+ }
192
194
  generatedTestCases.push({
193
195
  ...testCase,
194
196
  updatedFiles: fileChanges.map((f) => f.filePath),
195
197
  });
196
- updateTestSpan.end({ output: { response } });
198
+ updateTestSpan?.end({ output: { response } });
197
199
  await (0, llm_1.flushAllTraces)();
198
200
  return generatedTestCases;
199
201
  }
@@ -215,7 +217,7 @@ async function appendCreateTestBlock({ testCase, file, options, trace, validateT
215
217
  const session = (0, session_1.getSessionDetails)();
216
218
  trace =
217
219
  trace ||
218
- llm_1.langfuseInstance.trace({
220
+ llm_1.langfuseInstance?.trace({
219
221
  name: "append-create-test-block",
220
222
  id: crypto_1.default.randomUUID(),
221
223
  release: session.version,
@@ -225,7 +227,7 @@ async function appendCreateTestBlock({ testCase, file, options, trace, validateT
225
227
  ].filter((s) => !!s),
226
228
  });
227
229
  const promptName = "append-create-test-block";
228
- const promptSpan = trace.span({
230
+ const promptSpan = trace?.span({
229
231
  name: "append-create-test-block-prompt",
230
232
  });
231
233
  const instruction = await (0, llm_1.getPrompt)(promptName, {
@@ -235,7 +237,7 @@ async function appendCreateTestBlock({ testCase, file, options, trace, validateT
235
237
  scenarioSteps: testCase.steps.join("\n"),
236
238
  scenarioFile: file,
237
239
  });
238
- promptSpan.end({ output: { instruction } });
240
+ promptSpan?.end({ output: { instruction } });
239
241
  const [userInstruction] = instruction.filter((s) => s.role === "user");
240
242
  const [systemInstruction] = instruction.filter((s) => s.role === "system");
241
243
  userInstruction.content = `${systemInstruction?.content}
@@ -267,12 +269,14 @@ async function appendCreateTestBlock({ testCase, file, options, trace, validateT
267
269
  codePrompt: codePrompt,
268
270
  validateTypes,
269
271
  });
270
- logger.log(`Trace: ${trace.getTraceUrl()}`);
272
+ if (trace) {
273
+ logger.log(`Trace: ${trace.getTraceUrl()}`);
274
+ }
271
275
  generatedTestCases.push({
272
276
  ...testCase,
273
277
  updatedFiles: fileChanges.map((f) => f.filePath),
274
278
  });
275
- trace.update({ input: { testCase }, output: { response } });
279
+ trace?.update({ input: { testCase }, output: { response } });
276
280
  await (0, llm_1.flushAllTraces)();
277
281
  return generatedTestCases;
278
282
  }
@@ -6,6 +6,5 @@ export declare function inferAgentBasedTask({ task, options, trace, }: {
6
6
  trace?: TraceClient;
7
7
  }): Promise<{
8
8
  response: Agent;
9
- trace: TraceClient;
10
9
  }>;
11
10
  //# sourceMappingURL=index.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/agent/infer-agent/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAIL,WAAW,EACZ,MAAM,mBAAmB,CAAC;AAS3B,OAAO,EAAE,KAAK,EAAE,oBAAoB,EAAE,MAAM,aAAa,CAAC;AAG1D,wBAAsB,mBAAmB,CAAC,EACxC,IAAI,EACJ,OAAO,EACP,KAAK,GACN,EAAE;IACD,IAAI,EAAE,MAAM,CAAC;IACb,OAAO,CAAC,EAAE,oBAAoB,CAAC;IAC/B,KAAK,CAAC,EAAE,WAAW,CAAC;CACrB,GAAG,OAAO,CAAC;IAAE,QAAQ,EAAE,KAAK,CAAC;IAAC,KAAK,EAAE,WAAW,CAAA;CAAE,CAAC,CA6EnD"}
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/agent/infer-agent/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAIL,WAAW,EACZ,MAAM,mBAAmB,CAAC;AAS3B,OAAO,EAAE,KAAK,EAAE,oBAAoB,EAAE,MAAM,aAAa,CAAC;AAG1D,wBAAsB,mBAAmB,CAAC,EACxC,IAAI,EACJ,OAAO,EACP,KAAK,GACN,EAAE;IACD,IAAI,EAAE,MAAM,CAAC;IACb,OAAO,CAAC,EAAE,oBAAoB,CAAC;IAC/B,KAAK,CAAC,EAAE,WAAW,CAAC;CACrB,GAAG,OAAO,CAAC;IAAE,QAAQ,EAAE,KAAK,CAAA;CAAE,CAAC,CA4E/B"}
@@ -8,19 +8,19 @@ const session = (0, session_1.getSessionDetails)();
8
8
  async function inferAgentBasedTask({ task, options, trace, }) {
9
9
  trace =
10
10
  trace ||
11
- llm_1.langfuseInstance.trace({
11
+ llm_1.langfuseInstance?.trace({
12
12
  name: "infer-agent-task",
13
13
  id: crypto.randomUUID(),
14
14
  release: session.version,
15
15
  });
16
- const inferAgentSpan = trace.span({
16
+ const inferAgentSpan = trace?.span({
17
17
  name: "infer-agent",
18
18
  input: {
19
19
  task,
20
20
  options,
21
21
  },
22
22
  });
23
- const promptSpan = inferAgentSpan.span({
23
+ const promptSpan = inferAgentSpan?.span({
24
24
  name: "infer-agent-prompt",
25
25
  input: {
26
26
  task,
@@ -65,7 +65,7 @@ async function inferAgentBasedTask({ task, options, trace, }) {
65
65
  },
66
66
  });
67
67
  const output = JSON.parse(firstShotMessage?.content || "{}");
68
- inferAgentSpan.end({
68
+ inferAgentSpan?.end({
69
69
  output: {
70
70
  response: output.response,
71
71
  reason: output.reason,
@@ -73,7 +73,6 @@ async function inferAgentBasedTask({ task, options, trace, }) {
73
73
  });
74
74
  return {
75
75
  response: output.response,
76
- trace: inferAgentSpan,
77
76
  };
78
77
  }
79
78
  exports.inferAgentBasedTask = inferAgentBasedTask;
@@ -3,14 +3,14 @@ import { Page } from "playwright";
3
3
  import { PlaywrightActions } from "../../actions";
4
4
  import { TestCase } from "../../types";
5
5
  import { BrowsingAgentOptions } from "../browsing";
6
- export declare function getNextAction({ task, executedActions, failedActions, page, trace, llm, options, pageScreenshot, annotatedPageScreenshot, actions, disableSkills, useHints, }: {
6
+ export declare function getNextAction({ task, executedActions, failedActions, pageUrl, trace, llm, options, pageScreenshot, annotatedPageScreenshot, actions, disableSkills, useHints, }: {
7
7
  task: string;
8
8
  executedActions: string[];
9
9
  failedActions: any[];
10
- page: Page;
10
+ pageUrl: string;
11
11
  trace?: TraceClient;
12
- llm: LLM;
13
- options: BrowsingAgentOptions;
12
+ llm?: LLM;
13
+ options?: BrowsingAgentOptions;
14
14
  pageScreenshot: string;
15
15
  annotatedPageScreenshot?: string;
16
16
  actions: PlaywrightActions;
@@ -1 +1 @@
1
- {"version":3,"file":"run.d.ts","sourceRoot":"","sources":["../../../src/agent/master/run.ts"],"names":[],"mappings":"AAAA,OAAO,EAGL,GAAG,EACH,WAAW,EACZ,MAAM,mBAAmB,CAAC;AAG3B,OAAO,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAElC,OAAO,EAAE,iBAAiB,EAAE,MAAM,eAAe,CAAC;AAYlD,OAAO,EAAE,QAAQ,EAAE,MAAM,aAAa,CAAC;AACvC,OAAO,EACL,oBAAoB,EAErB,MAAM,aAAa,CAAC;AAQrB,wBAAsB,aAAa,CAAC,EAClC,IAAI,EACJ,eAAe,EACf,aAAa,EACb,IAAI,EACJ,KAAK,EACL,GAAG,EACH,OAAO,EACP,cAAc,EACd,uBAAuB,EACvB,OAAO,EACP,aAAa,EACb,QAAgB,GACjB,EAAE;IACD,IAAI,EAAE,MAAM,CAAC;IACb,eAAe,EAAE,MAAM,EAAE,CAAC;IAC1B,aAAa,EAAE,GAAG,EAAE,CAAC;IACrB,IAAI,EAAE,IAAI,CAAC;IACX,KAAK,CAAC,EAAE,WAAW,CAAC;IACpB,GAAG,EAAE,GAAG,CAAC;IACT,OAAO,EAAE,oBAAoB,CAAC;IAC9B,cAAc,EAAE,MAAM,CAAC;IACvB,uBAAuB,CAAC,EAAE,MAAM,CAAC;IACjC,OAAO,EAAE,iBAAiB,CAAC;IAC3B,aAAa,EAAE,OAAO,CAAC;IACvB,QAAQ,EAAE,OAAO,CAAC;CACnB,2FAgEA;AAGD,wBAAsB,0BAA0B,CAAC,EAC/C,IAAI,EACJ,IAAI,EACJ,QAAQ,EACR,OAAO,GACR,EAAE;IACD,IAAI,EAAE,MAAM,CAAC;IACb,IAAI,EAAE,IAAI,CAAC;IACX,QAAQ,EAAE,QAAQ,CAAC;IACnB,OAAO,EAAE,oBAAoB,CAAC;CAC/B;;;GA8QA"}
1
+ {"version":3,"file":"run.d.ts","sourceRoot":"","sources":["../../../src/agent/master/run.ts"],"names":[],"mappings":"AAAA,OAAO,EAGL,GAAG,EACH,WAAW,EACZ,MAAM,mBAAmB,CAAC;AAG3B,OAAO,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAElC,OAAO,EAAE,iBAAiB,EAAE,MAAM,eAAe,CAAC;AAYlD,OAAO,EAAE,QAAQ,EAAE,MAAM,aAAa,CAAC;AACvC,OAAO,EACL,oBAAoB,EAErB,MAAM,aAAa,CAAC;AAQrB,wBAAsB,aAAa,CAAC,EAClC,IAAI,EACJ,eAAe,EACf,aAAa,EACb,OAAO,EACP,KAAK,EACL,GAAG,EACH,OAAO,EACP,cAAc,EACd,uBAAuB,EACvB,OAAO,EACP,aAAa,EACb,QAAgB,GACjB,EAAE;IACD,IAAI,EAAE,MAAM,CAAC;IACb,eAAe,EAAE,MAAM,EAAE,CAAC;IAC1B,aAAa,EAAE,GAAG,EAAE,CAAC;IACrB,OAAO,EAAE,MAAM,CAAC;IAChB,KAAK,CAAC,EAAE,WAAW,CAAC;IACpB,GAAG,CAAC,EAAE,GAAG,CAAC;IACV,OAAO,CAAC,EAAE,oBAAoB,CAAC;IAC/B,cAAc,EAAE,MAAM,CAAC;IACvB,uBAAuB,CAAC,EAAE,MAAM,CAAC;IACjC,OAAO,EAAE,iBAAiB,CAAC;IAC3B,aAAa,EAAE,OAAO,CAAC;IACvB,QAAQ,EAAE,OAAO,CAAC;CACnB,2FAwFA;AAGD,wBAAsB,0BAA0B,CAAC,EAC/C,IAAI,EACJ,IAAI,EACJ,QAAQ,EACR,OAAO,GACR,EAAE;IACD,IAAI,EAAE,MAAM,CAAC;IACb,IAAI,EAAE,IAAI,CAAC;IACX,QAAQ,EAAE,QAAQ,CAAC;IACnB,OAAO,EAAE,oBAAoB,CAAC;CAC/B;;;GAiRA"}
@@ -16,13 +16,30 @@ const skills_retriever_1 = require("../codegen/skills-retriever");
16
16
  const verification_1 = require("../verification");
17
17
  const with_hints_1 = require("./with-hints");
18
18
  const MAX_ERROR_COUNT = 2;
19
- async function getNextAction({ task, executedActions, failedActions, page, trace, llm, options, pageScreenshot, annotatedPageScreenshot, actions, disableSkills, useHints = false, }) {
20
- const promptSpan = trace?.span({ name: "master-agent-prompt" });
19
+ async function getNextAction({ task, executedActions, failedActions, pageUrl, trace, llm, options, pageScreenshot, annotatedPageScreenshot, actions, disableSkills, useHints = false, }) {
20
+ const nextActionSpan = trace?.span({
21
+ name: "master-agent-next-action",
22
+ input: {
23
+ task,
24
+ executedActions,
25
+ failedActions,
26
+ pageUrl,
27
+ options,
28
+ pageScreenshot,
29
+ annotatedPageScreenshot,
30
+ disableSkills,
31
+ useHints,
32
+ skills: skill_1.testCaseSkills.getAvailableSkills(),
33
+ },
34
+ });
35
+ const promptSpan = nextActionSpan?.span({
36
+ name: "master-agent-prompt",
37
+ });
21
38
  const promptMessages = await (0, llm_1.getPrompt)("test-gen", {
22
39
  task,
23
40
  failedActions: failedActions.map((a) => a).join("\n"),
24
41
  executedActions: executedActions.map((a) => a).join("\n"),
25
- pageUrl: page.url(),
42
+ pageUrl,
26
43
  }, useHints ? 16 : 14);
27
44
  // assuming there is only one user message in the prompt. if there is a change in langfuse prompt format, this will need to be updated
28
45
  const userMessage = promptMessages.filter((m) => m.role === "user")[0];
@@ -43,7 +60,7 @@ async function getNextAction({ task, executedActions, failedActions, page, trace
43
60
  {
44
61
  type: "image_url",
45
62
  image_url: {
46
- url: (0, vision_1.imageFormatForProvider)(options.modelProvider || constants_1.DEFAULT_MODEL_PROVIDER, pageScreenshot),
63
+ url: (0, vision_1.imageFormatForProvider)(options?.modelProvider || constants_1.DEFAULT_MODEL_PROVIDER, pageScreenshot),
47
64
  },
48
65
  },
49
66
  ];
@@ -56,20 +73,27 @@ async function getNextAction({ task, executedActions, failedActions, page, trace
56
73
  : actions.getMasterActionSchemas();
57
74
  const tools = [next_task_1.NextTaskAction.schema, ...actionSchemas];
58
75
  promptSpan?.end({ output: { messages } });
76
+ llm =
77
+ llm ||
78
+ new llm_1.LLM({
79
+ provider: options?.modelProvider || constants_1.DEFAULT_MODEL_PROVIDER,
80
+ defaultModel: options?.model || constants_1.DEFAULT_MODEL,
81
+ });
59
82
  const completion = await llm.createChatCompletion({
60
83
  messages,
61
84
  modelParameters: {
62
85
  ...constants_1.DEFAULT_MODEL_PARAMETERS,
63
- ...options.modelParameters,
86
+ ...options?.modelParameters,
64
87
  tool_choice: "required",
65
88
  temperature: 1,
66
89
  },
67
- trace,
90
+ trace: nextActionSpan,
68
91
  traceName: "master-agent-llm",
69
92
  // @ts-ignore
70
93
  tools,
71
94
  });
72
95
  const toolCall = completion?.tool_calls?.[0];
96
+ nextActionSpan?.end({ output: toolCall });
73
97
  return toolCall;
74
98
  }
75
99
  exports.getNextAction = getNextAction;
@@ -80,7 +104,7 @@ async function createTestUsingMasterAgent({ task, page, testCase, options, }) {
80
104
  const testgenUpdatesReporter = new reporter_1.TestGenUpdatesReporter();
81
105
  // add timeout for the page to settle in
82
106
  await page.waitForTimeout(3000);
83
- const trace = llm_1.langfuseInstance.trace({
107
+ const trace = llm_1.langfuseInstance?.trace({
84
108
  name: "test-generator",
85
109
  id: crypto.randomUUID(),
86
110
  version: (0, session_1.getSessionDetails)().version,
@@ -93,9 +117,11 @@ async function createTestUsingMasterAgent({ task, page, testCase, options, }) {
93
117
  options.metadata?.environment,
94
118
  ].filter((s) => !!s),
95
119
  });
96
- void testgenUpdatesReporter.sendMessage(`Starting master agent. [view trace](${trace.getTraceUrl()})`);
97
- logger.log(`Starting master agent: ${trace.getTraceUrl()}`);
98
- void testgenUpdatesReporter.sendAgentTraceUrl(trace.getTraceUrl());
120
+ if (trace) {
121
+ void testgenUpdatesReporter.sendMessage(`Starting master agent. [view trace](${trace?.getTraceUrl()})`);
122
+ logger.log(`Starting master agent: ${trace?.getTraceUrl()}`);
123
+ void testgenUpdatesReporter.sendAgentTraceUrl(trace.getTraceUrl());
124
+ }
99
125
  const llm = new llm_1.LLM({
100
126
  trace,
101
127
  provider: options.modelProvider || constants_1.DEFAULT_MODEL_PROVIDER,
@@ -113,7 +139,7 @@ async function createTestUsingMasterAgent({ task, page, testCase, options, }) {
113
139
  skill_1.testCaseSkills.updateSkills(skills);
114
140
  const actions = new actions_1.PlaywrightActions(page);
115
141
  await (0, utils_1.injectPwLocatorGenerator)(page);
116
- trace.update({ input: { task } });
142
+ trace?.update({ input: { task } });
117
143
  let isGivenTaskDone = false;
118
144
  const masterAgentActions = [];
119
145
  let failedActions = [];
@@ -122,7 +148,7 @@ async function createTestUsingMasterAgent({ task, page, testCase, options, }) {
122
148
  if (await (0, session_1.shouldStopSession)()) {
123
149
  break;
124
150
  }
125
- const masterAgentSpan = trace.span({
151
+ const masterAgentSpan = trace?.span({
126
152
  name: "master-agent",
127
153
  input: {
128
154
  task,
@@ -132,7 +158,6 @@ async function createTestUsingMasterAgent({ task, page, testCase, options, }) {
132
158
  });
133
159
  if (masterAgentActions.length > 0) {
134
160
  const verificationAgentResp = await (0, verification_1.verificationAgent)({
135
- llm,
136
161
  trace: masterAgentSpan,
137
162
  task,
138
163
  conversation: ["Successfully executed actions", ...masterAgentActions],
@@ -190,7 +215,7 @@ async function createTestUsingMasterAgent({ task, page, testCase, options, }) {
190
215
  task,
191
216
  executedActions: masterAgentActions,
192
217
  failedActions,
193
- page,
218
+ pageUrl: page.url(),
194
219
  trace: masterAgentSpan,
195
220
  llm,
196
221
  options,
@@ -205,7 +230,7 @@ async function createTestUsingMasterAgent({ task, page, testCase, options, }) {
205
230
  }
206
231
  if (toolCall) {
207
232
  const args = JSON.parse(toolCall.function.arguments);
208
- const masterAgentActionSpan = masterAgentSpan.span({
233
+ const masterAgentActionSpan = masterAgentSpan?.span({
209
234
  name: "master-agent-action",
210
235
  });
211
236
  try {
@@ -257,7 +282,7 @@ async function createTestUsingMasterAgent({ task, page, testCase, options, }) {
257
282
  // resetting error count as there is a successful action
258
283
  failedActions = [];
259
284
  masterAgentActions.push(output.action);
260
- masterAgentActionSpan.end({
285
+ masterAgentActionSpan?.end({
261
286
  input: {
262
287
  action: output.action,
263
288
  reason: output.reason,
@@ -275,7 +300,7 @@ async function createTestUsingMasterAgent({ task, page, testCase, options, }) {
275
300
  logger.log("Disabling skill usage for next retry");
276
301
  disableSkills = true;
277
302
  }
278
- masterAgentActionSpan.end({
303
+ masterAgentActionSpan?.end({
279
304
  input: {
280
305
  action: output.action,
281
306
  reason: output.reason,
@@ -296,19 +321,21 @@ async function createTestUsingMasterAgent({ task, page, testCase, options, }) {
296
321
  }
297
322
  }
298
323
  }
299
- trace.update({ input: { task }, output: { output } });
324
+ trace?.update({ input: { task }, output: { output } });
300
325
  await testGenSnapshotUpdatePromise;
301
326
  if (testGenAnnotatedSnapshotUpdatePromise) {
302
327
  await testGenAnnotatedSnapshotUpdatePromise;
303
328
  }
304
- masterAgentSpan.end({
329
+ masterAgentSpan?.end({
305
330
  output: { action: output.action, reason: output.reason },
306
331
  });
307
332
  }
308
333
  const { code, importPaths } = actions.generateCode();
309
- trace.update({ input: { task }, output: { code } });
334
+ trace?.update({ input: { task }, output: { code } });
310
335
  logger.success("Successfully generated code for the given task");
311
- await testgenUpdatesReporter.sendMessage(`Successfully generated code for the given task. \n View [trace](${trace.getTraceUrl()})`);
336
+ if (trace) {
337
+ await testgenUpdatesReporter.sendMessage(`Successfully generated code for the given task. \n View [trace](${trace.getTraceUrl()})`);
338
+ }
312
339
  return {
313
340
  code,
314
341
  importPaths,
@@ -4,7 +4,7 @@ import { Page } from "playwright";
4
4
  import { BrowsingAgentOptions } from "../browsing";
5
5
  export declare const getUserMessageWithForHints: ({ userMessage, options, pageScreenshot, annotatedPageScreenshot, }: {
6
6
  userMessage: OpenAI.ChatCompletionUserMessageParam;
7
- options: BrowsingAgentOptions;
7
+ options?: BrowsingAgentOptions | undefined;
8
8
  pageScreenshot: string;
9
9
  annotatedPageScreenshot: string;
10
10
  }) => string | OpenAI.ChatCompletionContentPart[];
@@ -1 +1 @@
1
- {"version":3,"file":"with-hints.d.ts","sourceRoot":"","sources":["../../../src/agent/master/with-hints.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,GAAG,EAAE,MAAM,mBAAmB,CAAC;AAExC,OAAO,MAAM,MAAM,QAAQ,CAAC;AAC5B,OAAO,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAIlC,OAAO,EAAE,oBAAoB,EAAE,MAAM,aAAa,CAAC;AAEnD,eAAO,MAAM,0BAA0B;iBAMxB,OAAO,8BAA8B;aACzC,oBAAoB;oBACb,MAAM;6BACG,MAAM;MAC7B,MAAM,GAAG,OAAO,yBAAyB,EAiC5C,CAAC;AAEF,eAAO,MAAM,gBAAgB;6BAMF;QACvB,MAAM,EAAE,MAAM,CAAC;QACf,iBAAiB,CAAC,EAAE,MAAM,CAAC;KAC5B;0BACqB,OAAO,MAAM,EAAE,GAAG,CAAC;UACnC,IAAI;SACL,GAAG;MACN,QAAQ;IACV,sBAAsB,EAAE,OAAO,CAAC;IAChC,wBAAwB,EAAE,OAAO,qBAAqB,GAAG,SAAS,CAAC;CACpE,CAgFA,CAAC"}
1
+ {"version":3,"file":"with-hints.d.ts","sourceRoot":"","sources":["../../../src/agent/master/with-hints.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,GAAG,EAAE,MAAM,mBAAmB,CAAC;AAExC,OAAO,MAAM,MAAM,QAAQ,CAAC;AAC5B,OAAO,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAIlC,OAAO,EAAE,oBAAoB,EAAE,MAAM,aAAa,CAAC;AAEnD,eAAO,MAAM,0BAA0B;iBAMxB,OAAO,8BAA8B;;oBAElC,MAAM;6BACG,MAAM;MAC7B,MAAM,GAAG,OAAO,yBAAyB,EAiC5C,CAAC;AAEF,eAAO,MAAM,gBAAgB;6BAMF;QACvB,MAAM,EAAE,MAAM,CAAC;QACf,iBAAiB,CAAC,EAAE,MAAM,CAAC;KAC5B;0BACqB,OAAO,MAAM,EAAE,GAAG,CAAC;UACnC,IAAI;SACL,GAAG;MACN,QAAQ;IACV,sBAAsB,EAAE,OAAO,CAAC;IAChC,wBAAwB,EAAE,OAAO,qBAAqB,GAAG,SAAS,CAAC;CACpE,CAgFA,CAAC"}
@@ -17,7 +17,7 @@ const getUserMessageWithForHints = ({ userMessage, options, pageScreenshot, anno
17
17
  {
18
18
  type: "image_url",
19
19
  image_url: {
20
- url: (0, vision_1.imageFormatForProvider)(options.modelProvider || constants_1.DEFAULT_MODEL_PROVIDER, pageScreenshot),
20
+ url: (0, vision_1.imageFormatForProvider)(options?.modelProvider || constants_1.DEFAULT_MODEL_PROVIDER, pageScreenshot),
21
21
  },
22
22
  },
23
23
  {
@@ -27,7 +27,7 @@ const getUserMessageWithForHints = ({ userMessage, options, pageScreenshot, anno
27
27
  {
28
28
  type: "image_url",
29
29
  image_url: {
30
- url: (0, vision_1.imageFormatForProvider)(options.modelProvider || constants_1.DEFAULT_MODEL_PROVIDER, annotatedPageScreenshot),
30
+ url: (0, vision_1.imageFormatForProvider)(options?.modelProvider || constants_1.DEFAULT_MODEL_PROVIDER, annotatedPageScreenshot),
31
31
  },
32
32
  },
33
33
  ];
@@ -1,9 +1,8 @@
1
- import { LLM, TraceClient } from "@empiricalrun/llm";
1
+ import { TraceClient } from "@empiricalrun/llm";
2
2
  /**
3
3
  * This agent is used to verify whether the task is done basis the conversation history
4
4
  */
5
- export declare function verificationAgent({ llm, trace, task, conversation, }: {
6
- llm: LLM;
5
+ export declare function verificationAgent({ trace, task, conversation, }: {
7
6
  trace?: TraceClient;
8
7
  conversation: string[];
9
8
  task: string;
@@ -1 +1 @@
1
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/agent/verification/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAa,GAAG,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAEhE;;GAEG;AACH,wBAAsB,iBAAiB,CAAC,EACtC,GAAG,EACH,KAAK,EACL,IAAI,EACJ,YAAY,GACb,EAAE;IACD,GAAG,EAAE,GAAG,CAAC;IACT,KAAK,CAAC,EAAE,WAAW,CAAC;IACpB,YAAY,EAAE,MAAM,EAAE,CAAC;IACvB,IAAI,EAAE,MAAM,CAAC;CACd;;;GAkDA"}
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/agent/verification/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAkB,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAEhE;;GAEG;AACH,wBAAsB,iBAAiB,CAAC,EACtC,KAAK,EACL,IAAI,EACJ,YAAY,GACb,EAAE;IACD,KAAK,CAAC,EAAE,WAAW,CAAC;IACpB,YAAY,EAAE,MAAM,EAAE,CAAC;IACvB,IAAI,EAAE,MAAM,CAAC;CACd;;;GA+EA"}
@@ -5,13 +5,21 @@ const llm_1 = require("@empiricalrun/llm");
5
5
  /**
6
6
  * This agent is used to verify whether the task is done basis the conversation history
7
7
  */
8
- async function verificationAgent({ llm, trace, task, conversation, }) {
8
+ async function verificationAgent({ trace, task, conversation, }) {
9
+ const verificationAgentSpan = trace?.span({
10
+ name: "verification-agent",
11
+ input: {
12
+ task,
13
+ conversation,
14
+ },
15
+ });
9
16
  const messages = await (0, llm_1.getPrompt)("agent-steps-verification", {
10
17
  task,
11
18
  conversation: conversation.join("\n"),
12
- });
19
+ }, 4);
20
+ const llm = new llm_1.LLM({ provider: "openai" });
13
21
  const response = await llm.createChatCompletion({
14
- trace,
22
+ trace: verificationAgentSpan,
15
23
  traceName: "verification-agent-llm",
16
24
  model: "gpt-4o",
17
25
  messages,
@@ -24,13 +32,21 @@ async function verificationAgent({ llm, trace, task, conversation, }) {
24
32
  parameters: {
25
33
  type: "object",
26
34
  properties: {
27
- isDone: {
28
- type: "boolean",
29
- description: "whether the task is done",
35
+ actions: {
36
+ type: "string",
37
+ description: "actions extracted from task",
38
+ },
39
+ successful_actions: {
40
+ type: "string",
41
+ description: "successful actions mentioned in the conversation",
30
42
  },
31
43
  reason: {
32
44
  type: "string",
33
- description: "reason for declaring the task is complete",
45
+ description: "reasoning for identification of task status",
46
+ },
47
+ isDone: {
48
+ type: "boolean",
49
+ description: "whether the task is done",
34
50
  },
35
51
  },
36
52
  required: ["isDone", "reason"],
@@ -40,19 +56,28 @@ async function verificationAgent({ llm, trace, task, conversation, }) {
40
56
  ],
41
57
  modelParameters: {
42
58
  tool_choice: "required",
59
+ temperature: 0.5,
43
60
  },
44
61
  });
45
62
  const toolCallResp = (response?.tool_calls || [])[0];
46
63
  if (toolCallResp) {
47
64
  const toolCall = JSON.parse(toolCallResp.function.arguments);
48
- return {
65
+ const output = {
49
66
  isDone: toolCall.isDone,
50
67
  reason: toolCall.reason,
51
68
  };
69
+ verificationAgentSpan?.end({
70
+ output,
71
+ });
72
+ return output;
52
73
  }
53
- return {
74
+ const output = {
54
75
  isDone: false,
55
76
  reason: "LLM failed to generate a valid response",
56
77
  };
78
+ verificationAgentSpan?.end({
79
+ output,
80
+ });
81
+ return output;
57
82
  }
58
83
  exports.verificationAgent = verificationAgent;
package/dist/bin/index.js CHANGED
@@ -46,7 +46,7 @@ async function runAgent(testGenConfig) {
46
46
  }
47
47
  let agent = testGenConfig.options?.agent;
48
48
  const session = (0, session_1.getSessionDetails)();
49
- const trace = llm_1.langfuseInstance.trace({
49
+ const trace = llm_1.langfuseInstance?.trace({
50
50
  name: "generate-test",
51
51
  id: crypto.randomUUID(),
52
52
  release: session.version,
@@ -55,11 +55,13 @@ async function runAgent(testGenConfig) {
55
55
  testGenConfig.options?.metadata.environment || "",
56
56
  ].filter((s) => !!s),
57
57
  });
58
- try {
59
- await new reporter_1.TestGenUpdatesReporter().sendAgentTraceUrl(trace.getTraceUrl());
60
- }
61
- catch (e) {
62
- console.warn("Failed to send trace url as test gen update", e);
58
+ if (trace) {
59
+ try {
60
+ await new reporter_1.TestGenUpdatesReporter().sendAgentTraceUrl(trace.getTraceUrl());
61
+ }
62
+ catch (e) {
63
+ console.warn("Failed to send trace url as test gen update", e);
64
+ }
63
65
  }
64
66
  if (!agent || agent === "auto") {
65
67
  agent = await resolveAgentUsingTask({
@@ -0,0 +1,4 @@
1
+ import { EvaluateFn } from "./type";
2
+ declare const fetchSkillsAgentEvaluator: EvaluateFn;
3
+ export default fetchSkillsAgentEvaluator;
4
+ //# sourceMappingURL=fetch-pom-skills-agent.evals.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"fetch-pom-skills-agent.evals.d.ts","sourceRoot":"","sources":["../../src/evals/fetch-pom-skills-agent.evals.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,UAAU,EAAE,MAAM,QAAQ,CAAC;AAEpC,QAAA,MAAM,yBAAyB,EAAE,UAiChC,CAAC;AAEF,eAAe,yBAAyB,CAAC"}
@@ -0,0 +1,36 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ const skills_retriever_1 = require("../agent/codegen/skills-retriever");
4
+ const fetchSkillsAgentEvaluator = async ({ item, trace }) => {
5
+ const { testCase, pomFiles } = item.input;
6
+ const output = await (0, skills_retriever_1.fetchPomSkills)({
7
+ testCase,
8
+ pomFiles,
9
+ trace,
10
+ });
11
+ if (item.expectedOutput.length === 0 && output.length === 0) {
12
+ return {
13
+ scores: [
14
+ {
15
+ name: "equality",
16
+ value: 1,
17
+ },
18
+ ],
19
+ output,
20
+ };
21
+ }
22
+ return {
23
+ scores: [
24
+ {
25
+ name: "equality",
26
+ value: output.some((o) => {
27
+ return item.expectedOutput.some((e) => e.usageExample === o.usageExample);
28
+ })
29
+ ? 1
30
+ : 0,
31
+ },
32
+ ],
33
+ output,
34
+ };
35
+ };
36
+ exports.default = fetchSkillsAgentEvaluator;
@@ -0,0 +1,4 @@
1
+ import { EvaluateFn } from "./type";
2
+ export declare const inferMasterOrCodeAgentEvaluate: EvaluateFn;
3
+ export default inferMasterOrCodeAgentEvaluate;
4
+ //# sourceMappingURL=infer-master-or-code-agent.evals.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"infer-master-or-code-agent.evals.d.ts","sourceRoot":"","sources":["../../src/evals/infer-master-or-code-agent.evals.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,UAAU,EAAE,MAAM,QAAQ,CAAC;AAEpC,eAAO,MAAM,8BAA8B,EAAE,UAkB5C,CAAC;AAEF,eAAe,8BAA8B,CAAC"}
@@ -0,0 +1,22 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.inferMasterOrCodeAgentEvaluate = void 0;
4
+ const infer_agent_1 = require("../agent/infer-agent");
5
+ const inferMasterOrCodeAgentEvaluate = async ({ item, trace, }) => {
6
+ const { task } = item.input;
7
+ const { response } = await (0, infer_agent_1.inferAgentBasedTask)({
8
+ task,
9
+ trace,
10
+ });
11
+ return {
12
+ scores: [
13
+ {
14
+ name: "equality",
15
+ value: item.expectedOutput === response ? 1 : 0,
16
+ },
17
+ ],
18
+ output: response,
19
+ };
20
+ };
21
+ exports.inferMasterOrCodeAgentEvaluate = inferMasterOrCodeAgentEvaluate;
22
+ exports.default = exports.inferMasterOrCodeAgentEvaluate;
@@ -0,0 +1,4 @@
1
+ import { EvaluateFn } from "./type";
2
+ export declare const masterGetNextActionEvaluator: EvaluateFn;
3
+ export default masterGetNextActionEvaluator;
4
+ //# sourceMappingURL=master-agent.evals.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"master-agent.evals.d.ts","sourceRoot":"","sources":["../../src/evals/master-agent.evals.ts"],"names":[],"mappings":"AAKA,OAAO,EAAE,UAAU,EAAE,MAAM,QAAQ,CAAC;AAEpC,eAAO,MAAM,4BAA4B,EAAE,UA0C1C,CAAC;AAEF,eAAe,4BAA4B,CAAC"}
@@ -0,0 +1,36 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.masterGetNextActionEvaluator = void 0;
4
+ const actions_1 = require("../actions");
5
+ const skill_1 = require("../actions/skill");
6
+ const run_1 = require("../agent/master/run");
7
+ const masterGetNextActionEvaluator = async ({ item, trace, }) => {
8
+ const { task, executedActions, failedActions, pageUrl, options, pageScreenshot, annotatedPageScreenshot, disableSkills, useHints, skills = [], } = item.input;
9
+ const page = {};
10
+ skill_1.testCaseSkills.updateSkills(skills);
11
+ const actions = new actions_1.PlaywrightActions(page);
12
+ const output = await (0, run_1.getNextAction)({
13
+ task,
14
+ executedActions,
15
+ failedActions,
16
+ pageUrl,
17
+ trace,
18
+ options,
19
+ pageScreenshot,
20
+ annotatedPageScreenshot,
21
+ actions,
22
+ disableSkills,
23
+ useHints,
24
+ });
25
+ return {
26
+ scores: [
27
+ {
28
+ name: "action_correctness",
29
+ value: item.expectedOutput.function.name === output?.function.name ? 1 : 0,
30
+ },
31
+ ],
32
+ output,
33
+ };
34
+ };
35
+ exports.masterGetNextActionEvaluator = masterGetNextActionEvaluator;
36
+ exports.default = exports.masterGetNextActionEvaluator;
@@ -0,0 +1,12 @@
1
+ import { TraceClient } from "@empiricalrun/llm";
2
+ export type EvaluateFn = ({ trace, item, }: {
3
+ trace: TraceClient;
4
+ item: any;
5
+ }) => Promise<{
6
+ output: any;
7
+ scores: {
8
+ name: string;
9
+ value: number;
10
+ }[];
11
+ }>;
12
+ //# sourceMappingURL=type.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"type.d.ts","sourceRoot":"","sources":["../../src/evals/type.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAEhD,MAAM,MAAM,UAAU,GAAG,CAAC,EACxB,KAAK,EACL,IAAI,GACL,EAAE;IACD,KAAK,EAAE,WAAW,CAAC;IACnB,IAAI,EAAE,GAAG,CAAC;CACX,KAAK,OAAO,CAAC;IACZ,MAAM,EAAE,GAAG,CAAC;IACZ,MAAM,EAAE;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAA;KAAE,EAAE,CAAC;CAC3C,CAAC,CAAC"}
@@ -0,0 +1,2 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
@@ -0,0 +1,4 @@
1
+ import { EvaluateFn } from "./type";
2
+ export declare const verifierAgentEvaluate: EvaluateFn;
3
+ export default verifierAgentEvaluate;
4
+ //# sourceMappingURL=verification-agent.evals.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"verification-agent.evals.d.ts","sourceRoot":"","sources":["../../src/evals/verification-agent.evals.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,UAAU,EAAE,MAAM,QAAQ,CAAC;AAEpC,eAAO,MAAM,qBAAqB,EAAE,UAgBnC,CAAC;AAEF,eAAe,qBAAqB,CAAC"}
@@ -0,0 +1,23 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.verifierAgentEvaluate = void 0;
4
+ const verification_1 = require("../agent/verification");
5
+ const verifierAgentEvaluate = async ({ item, trace }) => {
6
+ const { conversation = [], task = "" } = item.input;
7
+ const output = await (0, verification_1.verificationAgent)({
8
+ conversation,
9
+ trace,
10
+ task,
11
+ });
12
+ return {
13
+ scores: [
14
+ {
15
+ name: "equality",
16
+ value: item.expectedOutput.isDone === output.isDone ? 1 : 0,
17
+ },
18
+ ],
19
+ output,
20
+ };
21
+ };
22
+ exports.verifierAgentEvaluate = verifierAgentEvaluate;
23
+ exports.default = exports.verifierAgentEvaluate;
@@ -1 +1 @@
1
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/session/index.ts"],"names":[],"mappings":"AAeA,iBAAS,iBAAiB;;;;EAMzB;AAED,wBAAgB,iBAAiB,CAAC,EAChC,SAAS,EACT,YAAY,GACb,EAAE;IACD,SAAS,EAAE,MAAM,CAAC;IAClB,YAAY,EAAE,MAAM,CAAC;CACtB,QAGA;AAED,wBAAsB,iBAAiB,qBAGtC;AAED,wBAAsB,eAAe,0DAkBpC;AAED,wBAAsB,UAAU,kBAkB/B;AAED,OAAO,EAAE,iBAAiB,EAAE,CAAC"}
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/session/index.ts"],"names":[],"mappings":"AAgBA,iBAAS,iBAAiB;;;;EAMzB;AAED,wBAAgB,iBAAiB,CAAC,EAChC,SAAS,EACT,YAAY,GACb,EAAE;IACD,SAAS,EAAE,MAAM,CAAC;IAClB,YAAY,EAAE,MAAM,CAAC;CACtB,QAGA;AAED,wBAAsB,iBAAiB,qBAGtC;AAED,wBAAsB,eAAe,0DAqBpC;AAED,wBAAsB,UAAU,kBAqB/B;AAED,OAAO,EAAE,iBAAiB,EAAE,CAAC"}
@@ -10,7 +10,8 @@ const sessionDetails = {
10
10
  version: package_json_1.default.version,
11
11
  generationId: undefined,
12
12
  };
13
- const DASHBOARD_DOMAIN = process.env.DASHBOARD_DOMAIN || "https://dash.empirical.run";
13
+ const DASHBOARD_DOMAIN = process.env.DASHBOARD_DOMAIN ||
14
+ (process.env.CI === "true" ? "https://dash.empirical.run" : "");
14
15
  function getSessionDetails() {
15
16
  return {
16
17
  generationId: sessionDetails.generationId,
@@ -30,6 +31,9 @@ async function shouldStopSession() {
30
31
  }
31
32
  exports.shouldStopSession = shouldStopSession;
32
33
  async function getSessionState() {
34
+ if (!DASHBOARD_DOMAIN) {
35
+ return "started";
36
+ }
33
37
  const apiPath = `${DASHBOARD_DOMAIN}/api/sessions/${sessionDetails.sessionId}/generations/${sessionDetails.generationId}/state`;
34
38
  const response = await fetch(apiPath, {
35
39
  method: "GET",
@@ -44,6 +48,9 @@ async function getSessionState() {
44
48
  }
45
49
  exports.getSessionState = getSessionState;
46
50
  async function endSession() {
51
+ if (!DASHBOARD_DOMAIN) {
52
+ return;
53
+ }
47
54
  const apiPath = `${DASHBOARD_DOMAIN}/api/sessions/${sessionDetails.sessionId}/generations/${sessionDetails.generationId}/state`;
48
55
  try {
49
56
  await fetch(apiPath, {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@empiricalrun/test-gen",
3
- "version": "0.34.4",
3
+ "version": "0.35.0",
4
4
  "publishConfig": {
5
5
  "registry": "https://registry.npmjs.org/",
6
6
  "access": "public"
@@ -44,7 +44,7 @@
44
44
  "ts-morph": "^23.0.0",
45
45
  "tsx": "^4.16.2",
46
46
  "typescript": "^5.3.3",
47
- "@empiricalrun/llm": "^0.9.19",
47
+ "@empiricalrun/llm": "^0.9.21",
48
48
  "@empiricalrun/r2-uploader": "^0.3.6",
49
49
  "@empiricalrun/reporter": "^0.21.2"
50
50
  },
@@ -1,2 +0,0 @@
1
- export {};
2
- //# sourceMappingURL=infer-master-code.d.ts.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"infer-master-code.d.ts","sourceRoot":"","sources":["../../src/evals/infer-master-code.ts"],"names":[],"mappings":""}
@@ -1,18 +0,0 @@
1
- "use strict";
2
- Object.defineProperty(exports, "__esModule", { value: true });
3
- const llm_1 = require("@empiricalrun/llm");
4
- const infer_agent_1 = require("../agent/infer-agent");
5
- const datasetName = "infer-master-or-code-agent";
6
- (async function main() {
7
- const dataset = await llm_1.langfuseInstance.getDataset(datasetName);
8
- const runName = `${datasetName}-${Date.now()}`;
9
- for (const item of dataset.items) {
10
- const { response, trace } = await (0, infer_agent_1.inferAgentBasedTask)(item.input);
11
- await item.link(trace, runName, {});
12
- trace?.score({
13
- name: "equality",
14
- value: item.expectedOutput === response ? 1 : 0, // score value
15
- });
16
- }
17
- await llm_1.langfuseInstance.flushAsync();
18
- })();