agentevals 0.0.2 → 0.0.3

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.
@@ -12,16 +12,20 @@ Your task is to grade the accuracy of an AI agent's internal trajectory.
12
12
  - Makes logical sense between steps
13
13
  - Shows clear progression
14
14
  - Is relatively efficient, though it does not need to be perfectly efficient
15
- - Is semantically equivalent to the provided reference trajectory, if present
15
+ - Is semantically equivalent to the provided reference trajectory
16
16
  </Rubric>
17
17
 
18
- Grade the following trajectory:
18
+ Based on the following reference trajectory:
19
+
20
+ <reference_trajectory>
21
+ {reference_outputs}
22
+ </reference_trajectory>
23
+
24
+ Grade this actual trajectory:
19
25
 
20
26
  <trajectory>
21
27
  {outputs}
22
28
  </trajectory>
23
- {inputs}
24
- {reference_outputs}
25
29
  `;
26
30
  exports.TRAJECTORY_ACCURACY_PROMPT = `You are an expert data labeler.
27
31
  Your task is to grade the accuracy of an AI agent's internal trajectory.
@@ -42,27 +46,16 @@ Grade the following trajectory:
42
46
 
43
47
  <trajectory>
44
48
  {outputs}
45
- </trajectory>
46
- {inputs}
47
- `;
49
+ </trajectory>`;
48
50
  function _formatInputs(params) {
49
- const { inputs, outputs, referenceOutputs } = params;
51
+ const { outputs, referenceOutputs } = params;
50
52
  const normalizedOutputs = (0, utils_js_1._normalizeToOpenAIMessagesList)(outputs);
51
53
  const normalizedReferenceOutputs = (0, utils_js_1._normalizeToOpenAIMessagesList)(referenceOutputs ?? []);
52
54
  const formattedReferenceOutputs = normalizedReferenceOutputs
53
- ? `\nUse the following trajectory as an example reference when grading:\n<reference_trajectory>\n${(0, utils_js_2._chatCompletionMessagesToString)(normalizedReferenceOutputs)}\n</reference_trajectory>\n`
55
+ ? (0, utils_js_2._chatCompletionMessagesToString)(normalizedReferenceOutputs)
54
56
  : "";
55
- const formattedInputs = inputs
56
- ? `\nThe agent generated the trajectory from the following input:\n<input>\n${JSON.stringify(inputs)}\n</input>\n`
57
- : "";
58
- const formattedOutputs = typeof outputs === "object" && !Array.isArray(outputs)
59
- ? outputs
60
- : (0, utils_js_2._chatCompletionMessagesToString)(normalizedOutputs);
61
- return [
62
- formattedOutputs,
63
- formattedReferenceOutputs,
64
- formattedInputs,
65
- ];
57
+ const formattedOutputs = (0, utils_js_2._chatCompletionMessagesToString)(normalizedOutputs);
58
+ return [formattedOutputs, formattedReferenceOutputs];
66
59
  }
67
60
  /**
68
61
  * Creates an evaluator that uses an LLM to judge agent trajectories.
@@ -97,18 +90,14 @@ const createTrajectoryLLMAsJudge = ({ prompt = exports.TRAJECTORY_ACCURACY_PROMP
97
90
  fewShotExamples,
98
91
  });
99
92
  const wrappedEvaluator = async ({ inputs, outputs, referenceOutputs, ...extra }) => {
100
- const [formattedOutputs, formattedReferenceOutputs, formattedInputs] = prompt === exports.TRAJECTORY_ACCURACY_PROMPT ||
101
- prompt === exports.TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE
102
- ? _formatInputs({ inputs, outputs, referenceOutputs })
103
- : [
104
- inputs ? JSON.stringify(inputs) : "",
105
- (0, utils_js_2._chatCompletionMessagesToString)((0, utils_js_1._normalizeToOpenAIMessagesList)(outputs)),
106
- (0, utils_js_2._chatCompletionMessagesToString)((0, utils_js_1._normalizeToOpenAIMessagesList)(referenceOutputs)),
107
- ];
93
+ const [formattedOutputs, formattedReferenceOutputs] = _formatInputs({
94
+ outputs,
95
+ referenceOutputs,
96
+ });
108
97
  return (0, utils_js_1._runEvaluator)(`llm_as_${feedbackKey}_judge`, scorer, feedbackKey, {
98
+ inputs,
109
99
  outputs: formattedOutputs,
110
100
  referenceOutputs: formattedReferenceOutputs,
111
- inputs: formattedInputs,
112
101
  ...extra,
113
102
  });
114
103
  };
@@ -1,7 +1,7 @@
1
1
  import { BaseMessage } from "@langchain/core/messages";
2
2
  import { ChatCompletionMessage, EvaluatorResult, TrajectoryLLMAsJudgeParams } from "../types.js";
3
- export declare const TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE = "You are an expert data labeler.\nYour task is to grade the accuracy of an AI agent's internal trajectory.\n\n<Rubric>\n An accurate trajectory:\n - Makes logical sense between steps\n - Shows clear progression\n - Is relatively efficient, though it does not need to be perfectly efficient\n - Is semantically equivalent to the provided reference trajectory, if present\n</Rubric>\n\nGrade the following trajectory:\n\n<trajectory>\n{outputs}\n</trajectory>\n{inputs}\n{reference_outputs}\n";
4
- export declare const TRAJECTORY_ACCURACY_PROMPT = "You are an expert data labeler.\nYour task is to grade the accuracy of an AI agent's internal trajectory.\n\n<Rubric>\n An accurate trajectory:\n - Makes logical sense between steps\n - Shows clear progression\n - Is relatively efficient, though it does not need to be perfectly efficient\n</Rubric>\n\nFirst, try to understand the goal of the trajectory by looking at the input\n(if the input is not present try to infer it from the content of the first message),\nas well as the output of the final message. Once you understand the goal, grade the trajectory\nas it relates to achieving that goal.\n\nGrade the following trajectory:\n\n<trajectory>\n{outputs}\n</trajectory>\n{inputs}\n";
3
+ export declare const TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE = "You are an expert data labeler.\nYour task is to grade the accuracy of an AI agent's internal trajectory.\n\n<Rubric>\n An accurate trajectory:\n - Makes logical sense between steps\n - Shows clear progression\n - Is relatively efficient, though it does not need to be perfectly efficient\n - Is semantically equivalent to the provided reference trajectory\n</Rubric>\n\nBased on the following reference trajectory:\n\n<reference_trajectory>\n{reference_outputs}\n</reference_trajectory>\n\nGrade this actual trajectory:\n\n<trajectory>\n{outputs}\n</trajectory>\n";
4
+ export declare const TRAJECTORY_ACCURACY_PROMPT = "You are an expert data labeler.\nYour task is to grade the accuracy of an AI agent's internal trajectory.\n\n<Rubric>\n An accurate trajectory:\n - Makes logical sense between steps\n - Shows clear progression\n - Is relatively efficient, though it does not need to be perfectly efficient\n</Rubric>\n\nFirst, try to understand the goal of the trajectory by looking at the input\n(if the input is not present try to infer it from the content of the first message),\nas well as the output of the final message. Once you understand the goal, grade the trajectory\nas it relates to achieving that goal.\n\nGrade the following trajectory:\n\n<trajectory>\n{outputs}\n</trajectory>";
5
5
  /**
6
6
  * Creates an evaluator that uses an LLM to judge agent trajectories.
7
7
  *
@@ -25,7 +25,6 @@ export declare const TRAJECTORY_ACCURACY_PROMPT = "You are an expert data labele
25
25
  */
26
26
  export declare const createTrajectoryLLMAsJudge: ({ prompt, feedbackKey, model, system, judge, continuous, choices, useReasoning, fewShotExamples, }: TrajectoryLLMAsJudgeParams) => ({ inputs, outputs, referenceOutputs, ...extra }: {
27
27
  [key: string]: unknown;
28
- inputs?: Record<string, any> | undefined;
29
28
  outputs: ChatCompletionMessage[] | BaseMessage[] | {
30
29
  messages: (BaseMessage | ChatCompletionMessage)[];
31
30
  };
@@ -9,16 +9,20 @@ Your task is to grade the accuracy of an AI agent's internal trajectory.
9
9
  - Makes logical sense between steps
10
10
  - Shows clear progression
11
11
  - Is relatively efficient, though it does not need to be perfectly efficient
12
- - Is semantically equivalent to the provided reference trajectory, if present
12
+ - Is semantically equivalent to the provided reference trajectory
13
13
  </Rubric>
14
14
 
15
- Grade the following trajectory:
15
+ Based on the following reference trajectory:
16
+
17
+ <reference_trajectory>
18
+ {reference_outputs}
19
+ </reference_trajectory>
20
+
21
+ Grade this actual trajectory:
16
22
 
17
23
  <trajectory>
18
24
  {outputs}
19
25
  </trajectory>
20
- {inputs}
21
- {reference_outputs}
22
26
  `;
23
27
  export const TRAJECTORY_ACCURACY_PROMPT = `You are an expert data labeler.
24
28
  Your task is to grade the accuracy of an AI agent's internal trajectory.
@@ -39,27 +43,16 @@ Grade the following trajectory:
39
43
 
40
44
  <trajectory>
41
45
  {outputs}
42
- </trajectory>
43
- {inputs}
44
- `;
46
+ </trajectory>`;
45
47
  function _formatInputs(params) {
46
- const { inputs, outputs, referenceOutputs } = params;
48
+ const { outputs, referenceOutputs } = params;
47
49
  const normalizedOutputs = _normalizeToOpenAIMessagesList(outputs);
48
50
  const normalizedReferenceOutputs = _normalizeToOpenAIMessagesList(referenceOutputs ?? []);
49
51
  const formattedReferenceOutputs = normalizedReferenceOutputs
50
- ? `\nUse the following trajectory as an example reference when grading:\n<reference_trajectory>\n${_chatCompletionMessagesToString(normalizedReferenceOutputs)}\n</reference_trajectory>\n`
52
+ ? _chatCompletionMessagesToString(normalizedReferenceOutputs)
51
53
  : "";
52
- const formattedInputs = inputs
53
- ? `\nThe agent generated the trajectory from the following input:\n<input>\n${JSON.stringify(inputs)}\n</input>\n`
54
- : "";
55
- const formattedOutputs = typeof outputs === "object" && !Array.isArray(outputs)
56
- ? outputs
57
- : _chatCompletionMessagesToString(normalizedOutputs);
58
- return [
59
- formattedOutputs,
60
- formattedReferenceOutputs,
61
- formattedInputs,
62
- ];
54
+ const formattedOutputs = _chatCompletionMessagesToString(normalizedOutputs);
55
+ return [formattedOutputs, formattedReferenceOutputs];
63
56
  }
64
57
  /**
65
58
  * Creates an evaluator that uses an LLM to judge agent trajectories.
@@ -94,18 +87,14 @@ export const createTrajectoryLLMAsJudge = ({ prompt = TRAJECTORY_ACCURACY_PROMPT
94
87
  fewShotExamples,
95
88
  });
96
89
  const wrappedEvaluator = async ({ inputs, outputs, referenceOutputs, ...extra }) => {
97
- const [formattedOutputs, formattedReferenceOutputs, formattedInputs] = prompt === TRAJECTORY_ACCURACY_PROMPT ||
98
- prompt === TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE
99
- ? _formatInputs({ inputs, outputs, referenceOutputs })
100
- : [
101
- inputs ? JSON.stringify(inputs) : "",
102
- _chatCompletionMessagesToString(_normalizeToOpenAIMessagesList(outputs)),
103
- _chatCompletionMessagesToString(_normalizeToOpenAIMessagesList(referenceOutputs)),
104
- ];
90
+ const [formattedOutputs, formattedReferenceOutputs] = _formatInputs({
91
+ outputs,
92
+ referenceOutputs,
93
+ });
105
94
  return _runEvaluator(`llm_as_${feedbackKey}_judge`, scorer, feedbackKey, {
95
+ inputs,
106
96
  outputs: formattedOutputs,
107
97
  referenceOutputs: formattedReferenceOutputs,
108
- inputs: formattedInputs,
109
98
  ...extra,
110
99
  });
111
100
  };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "agentevals",
3
- "version": "0.0.2",
3
+ "version": "0.0.3",
4
4
  "packageManager": "yarn@3.5.1",
5
5
  "type": "module",
6
6
  "scripts": {