@empiricalrun/test-gen 0.34.3 → 0.34.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,20 @@
1
1
  # @empiricalrun/test-gen
2
2
 
3
+ ## 0.34.5
4
+
5
+ ### Patch Changes
6
+
7
+ - dc5718a: feat: add support for evals
8
+ - Updated dependencies [06cf0d8]
9
+ - @empiricalrun/llm@0.9.20
10
+
11
+ ## 0.34.4
12
+
13
+ ### Patch Changes
14
+
15
+ - Updated dependencies [2dafa69]
16
+ - @empiricalrun/llm@0.9.19
17
+
3
18
  ## 0.34.3
4
19
 
5
20
  ### Patch Changes
@@ -1 +1 @@
1
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/agent/browsing/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,GAAG,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AACrD,OAAO,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAElC,OAAO,EAAE,iBAAiB,EAAE,MAAM,eAAe,CAAC;AAClD,OAAO,EAAE,YAAY,EAAE,MAAM,kBAAkB,CAAC;AAIhD,OAAO,EAAE,oBAAoB,EAAE,MAAM,aAAa,CAAC;AAMnD,MAAM,MAAM,oBAAoB,GAAG,OAAO,CAAC,oBAAoB,CAAC,GAAG;IACjE,YAAY,CAAC,EAAE;QACb,iBAAiB,CAAC,EAAE,MAAM,EAAE,CAAC;KAC9B,CAAC;CACH,CAAC;AAEF,wBAAsB,6BAA6B,CAAC,EAClD,KAAK,EACL,MAAM,EACN,MAAM,EACN,IAAI,EACJ,OAAO,EACP,GAAG,EACH,OAAO,GACR,EAAE;IACD,MAAM,EAAE,MAAM,CAAC;IACf,KAAK,EAAE,WAAW,CAAC;IACnB,MAAM,EAAE,YAAY,CAAC;IACrB,IAAI,EAAE,IAAI,CAAC;IACX,OAAO,EAAE,oBAAoB,CAAC;IAC9B,GAAG,EAAE,GAAG,CAAC;IACT,OAAO,EAAE,iBAAiB,CAAC;CAC5B,iBAyIA"}
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/agent/browsing/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,GAAG,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AACrD,OAAO,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAElC,OAAO,EAAE,iBAAiB,EAAE,MAAM,eAAe,CAAC;AAClD,OAAO,EAAE,YAAY,EAAE,MAAM,kBAAkB,CAAC;AAIhD,OAAO,EAAE,oBAAoB,EAAE,MAAM,aAAa,CAAC;AAMnD,MAAM,MAAM,oBAAoB,GAAG,OAAO,CAAC,oBAAoB,CAAC,GAAG;IACjE,YAAY,CAAC,EAAE;QACb,iBAAiB,CAAC,EAAE,MAAM,EAAE,CAAC;KAC9B,CAAC;CACH,CAAC;AAEF,wBAAsB,6BAA6B,CAAC,EAClD,KAAK,EACL,MAAM,EACN,MAAM,EACN,IAAI,EACJ,OAAO,EACP,GAAG,EACH,OAAO,GACR,EAAE;IACD,MAAM,EAAE,MAAM,CAAC;IACf,KAAK,EAAE,WAAW,CAAC;IACnB,MAAM,EAAE,YAAY,CAAC;IACrB,IAAI,EAAE,IAAI,CAAC;IACX,OAAO,EAAE,oBAAoB,CAAC;IAC9B,GAAG,EAAE,GAAG,CAAC;IACT,OAAO,EAAE,iBAAiB,CAAC;CAC5B,iBAwIA"}
@@ -38,7 +38,6 @@ async function executeTaskUsingBrowsingAgent({ trace, action, logger, page, opti
38
38
  .map((a) => a.action);
39
39
  if (successfulActions.length > 0) {
40
40
  const verificationAgentResp = await (0, verification_1.verificationAgent)({
41
- llm,
42
41
  trace: browsingAgentSpan,
43
42
  task: action,
44
43
  conversation: ["Successfully executed actions", ...successfulActions],
@@ -1 +1 @@
1
- {"version":3,"file":"run.d.ts","sourceRoot":"","sources":["../../../src/agent/master/run.ts"],"names":[],"mappings":"AAAA,OAAO,EAGL,GAAG,EACH,WAAW,EACZ,MAAM,mBAAmB,CAAC;AAG3B,OAAO,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAElC,OAAO,EAAE,iBAAiB,EAAE,MAAM,eAAe,CAAC;AAYlD,OAAO,EAAE,QAAQ,EAAE,MAAM,aAAa,CAAC;AACvC,OAAO,EACL,oBAAoB,EAErB,MAAM,aAAa,CAAC;AAQrB,wBAAsB,aAAa,CAAC,EAClC,IAAI,EACJ,eAAe,EACf,aAAa,EACb,IAAI,EACJ,KAAK,EACL,GAAG,EACH,OAAO,EACP,cAAc,EACd,uBAAuB,EACvB,OAAO,EACP,aAAa,EACb,QAAgB,GACjB,EAAE;IACD,IAAI,EAAE,MAAM,CAAC;IACb,eAAe,EAAE,MAAM,EAAE,CAAC;IAC1B,aAAa,EAAE,GAAG,EAAE,CAAC;IACrB,IAAI,EAAE,IAAI,CAAC;IACX,KAAK,CAAC,EAAE,WAAW,CAAC;IACpB,GAAG,EAAE,GAAG,CAAC;IACT,OAAO,EAAE,oBAAoB,CAAC;IAC9B,cAAc,EAAE,MAAM,CAAC;IACvB,uBAAuB,CAAC,EAAE,MAAM,CAAC;IACjC,OAAO,EAAE,iBAAiB,CAAC;IAC3B,aAAa,EAAE,OAAO,CAAC;IACvB,QAAQ,EAAE,OAAO,CAAC;CACnB,2FAgEA;AAGD,wBAAsB,0BAA0B,CAAC,EAC/C,IAAI,EACJ,IAAI,EACJ,QAAQ,EACR,OAAO,GACR,EAAE;IACD,IAAI,EAAE,MAAM,CAAC;IACb,IAAI,EAAE,IAAI,CAAC;IACX,QAAQ,EAAE,QAAQ,CAAC;IACnB,OAAO,EAAE,oBAAoB,CAAC;CAC/B;;;GA8QA"}
1
+ {"version":3,"file":"run.d.ts","sourceRoot":"","sources":["../../../src/agent/master/run.ts"],"names":[],"mappings":"AAAA,OAAO,EAGL,GAAG,EACH,WAAW,EACZ,MAAM,mBAAmB,CAAC;AAG3B,OAAO,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAElC,OAAO,EAAE,iBAAiB,EAAE,MAAM,eAAe,CAAC;AAYlD,OAAO,EAAE,QAAQ,EAAE,MAAM,aAAa,CAAC;AACvC,OAAO,EACL,oBAAoB,EAErB,MAAM,aAAa,CAAC;AAQrB,wBAAsB,aAAa,CAAC,EAClC,IAAI,EACJ,eAAe,EACf,aAAa,EACb,IAAI,EACJ,KAAK,EACL,GAAG,EACH,OAAO,EACP,cAAc,EACd,uBAAuB,EACvB,OAAO,EACP,aAAa,EACb,QAAgB,GACjB,EAAE;IACD,IAAI,EAAE,MAAM,CAAC;IACb,eAAe,EAAE,MAAM,EAAE,CAAC;IAC1B,aAAa,EAAE,GAAG,EAAE,CAAC;IACrB,IAAI,EAAE,IAAI,CAAC;IACX,KAAK,CAAC,EAAE,WAAW,CAAC;IACpB,GAAG,EAAE,GAAG,CAAC;IACT,OAAO,EAAE,oBAAoB,CAAC;IAC9B,cAAc,EAAE,MAAM,CAAC;IACvB,uBAAuB,CAAC,EAAE,MAAM,CAAC;IACjC,OAAO,EAAE,iBAAiB,CAAC;IAC3B,aAAa,EAAE,OAAO,CAAC;IACvB,QAAQ,EAAE,OAAO,CAAC;CACnB,2FAgEA;AAGD,wBAAsB,0BAA0B,CAAC,EAC/C,IAAI,EACJ,IAAI,EACJ,QAAQ,EACR,OAAO,GACR,EAAE;IACD,IAAI,EAAE,MAAM,CAAC;IACb,IAAI,EAAE,IAAI,CAAC;IACX,QAAQ,EAAE,QAAQ,CAAC;IACnB,OAAO,EAAE,oBAAoB,CAAC;CAC/B;;;GA6QA"}
@@ -132,7 +132,6 @@ async function createTestUsingMasterAgent({ task, page, testCase, options, }) {
132
132
  });
133
133
  if (masterAgentActions.length > 0) {
134
134
  const verificationAgentResp = await (0, verification_1.verificationAgent)({
135
- llm,
136
135
  trace: masterAgentSpan,
137
136
  task,
138
137
  conversation: ["Successfully executed actions", ...masterAgentActions],
@@ -1,9 +1,8 @@
1
- import { LLM, TraceClient } from "@empiricalrun/llm";
1
+ import { TraceClient } from "@empiricalrun/llm";
2
2
  /**
3
3
  * This agent is used to verify whether the task is done basis the conversation history
4
4
  */
5
- export declare function verificationAgent({ llm, trace, task, conversation, }: {
6
- llm: LLM;
5
+ export declare function verificationAgent({ trace, task, conversation, }: {
7
6
  trace?: TraceClient;
8
7
  conversation: string[];
9
8
  task: string;
@@ -1 +1 @@
1
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/agent/verification/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAa,GAAG,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAEhE;;GAEG;AACH,wBAAsB,iBAAiB,CAAC,EACtC,GAAG,EACH,KAAK,EACL,IAAI,EACJ,YAAY,GACb,EAAE;IACD,GAAG,EAAE,GAAG,CAAC;IACT,KAAK,CAAC,EAAE,WAAW,CAAC;IACpB,YAAY,EAAE,MAAM,EAAE,CAAC;IACvB,IAAI,EAAE,MAAM,CAAC;CACd;;;GAkDA"}
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/agent/verification/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAkB,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAEhE;;GAEG;AACH,wBAAsB,iBAAiB,CAAC,EACtC,KAAK,EACL,IAAI,EACJ,YAAY,GACb,EAAE;IACD,KAAK,CAAC,EAAE,WAAW,CAAC;IACpB,YAAY,EAAE,MAAM,EAAE,CAAC;IACvB,IAAI,EAAE,MAAM,CAAC;CACd;;;GA+EA"}
@@ -5,13 +5,21 @@ const llm_1 = require("@empiricalrun/llm");
5
5
  /**
6
6
  * This agent is used to verify whether the task is done basis the conversation history
7
7
  */
8
- async function verificationAgent({ llm, trace, task, conversation, }) {
8
+ async function verificationAgent({ trace, task, conversation, }) {
9
+ const verificationAgentSpan = trace?.span({
10
+ name: "verification-agent",
11
+ input: {
12
+ task,
13
+ conversation,
14
+ },
15
+ });
9
16
  const messages = await (0, llm_1.getPrompt)("agent-steps-verification", {
10
17
  task,
11
18
  conversation: conversation.join("\n"),
12
- });
19
+ }, 4);
20
+ const llm = new llm_1.LLM({ provider: "openai" });
13
21
  const response = await llm.createChatCompletion({
14
- trace,
22
+ trace: verificationAgentSpan,
15
23
  traceName: "verification-agent-llm",
16
24
  model: "gpt-4o",
17
25
  messages,
@@ -24,13 +32,21 @@ async function verificationAgent({ llm, trace, task, conversation, }) {
24
32
  parameters: {
25
33
  type: "object",
26
34
  properties: {
27
- isDone: {
28
- type: "boolean",
29
- description: "whether the task is done",
35
+ actions: {
36
+ type: "string",
37
+ description: "actions extracted from task",
38
+ },
39
+ successful_actions: {
40
+ type: "string",
41
+ description: "successful actions mentioned in the conversation",
30
42
  },
31
43
  reason: {
32
44
  type: "string",
33
- description: "reason for declaring the task is complete",
45
+ description: "reasoning for identification of task status",
46
+ },
47
+ isDone: {
48
+ type: "boolean",
49
+ description: "whether the task is done",
34
50
  },
35
51
  },
36
52
  required: ["isDone", "reason"],
@@ -40,19 +56,28 @@ async function verificationAgent({ llm, trace, task, conversation, }) {
40
56
  ],
41
57
  modelParameters: {
42
58
  tool_choice: "required",
59
+ temperature: 0.5,
43
60
  },
44
61
  });
45
62
  const toolCallResp = (response?.tool_calls || [])[0];
46
63
  if (toolCallResp) {
47
64
  const toolCall = JSON.parse(toolCallResp.function.arguments);
48
- return {
65
+ const output = {
49
66
  isDone: toolCall.isDone,
50
67
  reason: toolCall.reason,
51
68
  };
69
+ verificationAgentSpan?.end({
70
+ output,
71
+ });
72
+ return output;
52
73
  }
53
- return {
74
+ const output = {
54
75
  isDone: false,
55
76
  reason: "LLM failed to generate a valid response",
56
77
  };
78
+ verificationAgentSpan?.end({
79
+ output,
80
+ });
81
+ return output;
57
82
  }
58
83
  exports.verificationAgent = verificationAgent;
@@ -0,0 +1,4 @@
1
+ import { EvaluateFn } from "./type";
2
+ export declare const inferMasterOrCodeAgentEvaluate: EvaluateFn;
3
+ export default inferMasterOrCodeAgentEvaluate;
4
+ //# sourceMappingURL=infer-master-or-code-agent.evals.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"infer-master-or-code-agent.evals.d.ts","sourceRoot":"","sources":["../../src/evals/infer-master-or-code-agent.evals.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,UAAU,EAAE,MAAM,QAAQ,CAAC;AAEpC,eAAO,MAAM,8BAA8B,EAAE,UAkB5C,CAAC;AAEF,eAAe,8BAA8B,CAAC"}
@@ -0,0 +1,22 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.inferMasterOrCodeAgentEvaluate = void 0;
4
+ const infer_agent_1 = require("../agent/infer-agent");
5
+ const inferMasterOrCodeAgentEvaluate = async ({ item, trace, }) => {
6
+ const { task } = item.input;
7
+ const { response } = await (0, infer_agent_1.inferAgentBasedTask)({
8
+ task,
9
+ trace,
10
+ });
11
+ return {
12
+ scores: [
13
+ {
14
+ name: "equality",
15
+ value: item.expectedOutput === response ? 1 : 0,
16
+ },
17
+ ],
18
+ output: response,
19
+ };
20
+ };
21
+ exports.inferMasterOrCodeAgentEvaluate = inferMasterOrCodeAgentEvaluate;
22
+ exports.default = exports.inferMasterOrCodeAgentEvaluate;
@@ -0,0 +1,12 @@
1
+ import { TraceClient } from "@empiricalrun/llm";
2
+ export type EvaluateFn = ({ trace, item, }: {
3
+ trace: TraceClient;
4
+ item: any;
5
+ }) => Promise<{
6
+ output: any;
7
+ scores: {
8
+ name: string;
9
+ value: number;
10
+ }[];
11
+ }>;
12
+ //# sourceMappingURL=type.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"type.d.ts","sourceRoot":"","sources":["../../src/evals/type.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAEhD,MAAM,MAAM,UAAU,GAAG,CAAC,EACxB,KAAK,EACL,IAAI,GACL,EAAE;IACD,KAAK,EAAE,WAAW,CAAC;IACnB,IAAI,EAAE,GAAG,CAAC;CACX,KAAK,OAAO,CAAC;IACZ,MAAM,EAAE,GAAG,CAAC;IACZ,MAAM,EAAE;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAA;KAAE,EAAE,CAAC;CAC3C,CAAC,CAAC"}
@@ -0,0 +1,2 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
@@ -0,0 +1,4 @@
1
+ import { EvaluateFn } from "./type";
2
+ export declare const verifierAgentEvaluate: EvaluateFn;
3
+ export default verifierAgentEvaluate;
4
+ //# sourceMappingURL=verification-agent.evals.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"verification-agent.evals.d.ts","sourceRoot":"","sources":["../../src/evals/verification-agent.evals.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,UAAU,EAAE,MAAM,QAAQ,CAAC;AAEpC,eAAO,MAAM,qBAAqB,EAAE,UAgBnC,CAAC;AAEF,eAAe,qBAAqB,CAAC"}
@@ -0,0 +1,23 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.verifierAgentEvaluate = void 0;
4
+ const verification_1 = require("../agent/verification");
5
+ const verifierAgentEvaluate = async ({ item, trace }) => {
6
+ const { conversation = [], task = "" } = item.input;
7
+ const output = await (0, verification_1.verificationAgent)({
8
+ conversation,
9
+ trace,
10
+ task,
11
+ });
12
+ return {
13
+ scores: [
14
+ {
15
+ name: "equality",
16
+ value: item.expectedOutput.isDone === output.isDone ? 1 : 0,
17
+ },
18
+ ],
19
+ output,
20
+ };
21
+ };
22
+ exports.verifierAgentEvaluate = verifierAgentEvaluate;
23
+ exports.default = exports.verifierAgentEvaluate;
@@ -1 +1 @@
1
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/session/index.ts"],"names":[],"mappings":"AAeA,iBAAS,iBAAiB;;;;EAMzB;AAED,wBAAgB,iBAAiB,CAAC,EAChC,SAAS,EACT,YAAY,GACb,EAAE;IACD,SAAS,EAAE,MAAM,CAAC;IAClB,YAAY,EAAE,MAAM,CAAC;CACtB,QAGA;AAED,wBAAsB,iBAAiB,qBAGtC;AAED,wBAAsB,eAAe,0DAkBpC;AAED,wBAAsB,UAAU,kBAkB/B;AAED,OAAO,EAAE,iBAAiB,EAAE,CAAC"}
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/session/index.ts"],"names":[],"mappings":"AAgBA,iBAAS,iBAAiB;;;;EAMzB;AAED,wBAAgB,iBAAiB,CAAC,EAChC,SAAS,EACT,YAAY,GACb,EAAE;IACD,SAAS,EAAE,MAAM,CAAC;IAClB,YAAY,EAAE,MAAM,CAAC;CACtB,QAGA;AAED,wBAAsB,iBAAiB,qBAGtC;AAED,wBAAsB,eAAe,0DAqBpC;AAED,wBAAsB,UAAU,kBAqB/B;AAED,OAAO,EAAE,iBAAiB,EAAE,CAAC"}
@@ -10,7 +10,8 @@ const sessionDetails = {
10
10
  version: package_json_1.default.version,
11
11
  generationId: undefined,
12
12
  };
13
- const DASHBOARD_DOMAIN = process.env.DASHBOARD_DOMAIN || "https://dash.empirical.run";
13
+ const DASHBOARD_DOMAIN = process.env.DASHBOARD_DOMAIN ||
14
+ (process.env.CI === "true" ? "https://dash.empirical.run" : "");
14
15
  function getSessionDetails() {
15
16
  return {
16
17
  generationId: sessionDetails.generationId,
@@ -30,6 +31,9 @@ async function shouldStopSession() {
30
31
  }
31
32
  exports.shouldStopSession = shouldStopSession;
32
33
  async function getSessionState() {
34
+ if (!DASHBOARD_DOMAIN) {
35
+ return "started";
36
+ }
33
37
  const apiPath = `${DASHBOARD_DOMAIN}/api/sessions/${sessionDetails.sessionId}/generations/${sessionDetails.generationId}/state`;
34
38
  const response = await fetch(apiPath, {
35
39
  method: "GET",
@@ -44,6 +48,9 @@ async function getSessionState() {
44
48
  }
45
49
  exports.getSessionState = getSessionState;
46
50
  async function endSession() {
51
+ if (!DASHBOARD_DOMAIN) {
52
+ return;
53
+ }
47
54
  const apiPath = `${DASHBOARD_DOMAIN}/api/sessions/${sessionDetails.sessionId}/generations/${sessionDetails.generationId}/state`;
48
55
  try {
49
56
  await fetch(apiPath, {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@empiricalrun/test-gen",
3
- "version": "0.34.3",
3
+ "version": "0.34.5",
4
4
  "publishConfig": {
5
5
  "registry": "https://registry.npmjs.org/",
6
6
  "access": "public"
@@ -44,7 +44,7 @@
44
44
  "ts-morph": "^23.0.0",
45
45
  "tsx": "^4.16.2",
46
46
  "typescript": "^5.3.3",
47
- "@empiricalrun/llm": "^0.9.18",
47
+ "@empiricalrun/llm": "^0.9.20",
48
48
  "@empiricalrun/r2-uploader": "^0.3.6",
49
49
  "@empiricalrun/reporter": "^0.21.2"
50
50
  },