agentevals 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/trajectory/llm.cjs +18 -29
- package/dist/trajectory/llm.d.ts +2 -3
- package/dist/trajectory/llm.js +18 -29
- package/package.json +1 -1
package/dist/trajectory/llm.cjs
CHANGED
|
@@ -12,16 +12,20 @@ Your task is to grade the accuracy of an AI agent's internal trajectory.
|
|
|
12
12
|
- Makes logical sense between steps
|
|
13
13
|
- Shows clear progression
|
|
14
14
|
- Is relatively efficient, though it does not need to be perfectly efficient
|
|
15
|
-
- Is semantically equivalent to the provided reference trajectory
|
|
15
|
+
- Is semantically equivalent to the provided reference trajectory
|
|
16
16
|
</Rubric>
|
|
17
17
|
|
|
18
|
-
|
|
18
|
+
Based on the following reference trajectory:
|
|
19
|
+
|
|
20
|
+
<reference_trajectory>
|
|
21
|
+
{reference_outputs}
|
|
22
|
+
</reference_trajectory>
|
|
23
|
+
|
|
24
|
+
Grade this actual trajectory:
|
|
19
25
|
|
|
20
26
|
<trajectory>
|
|
21
27
|
{outputs}
|
|
22
28
|
</trajectory>
|
|
23
|
-
{inputs}
|
|
24
|
-
{reference_outputs}
|
|
25
29
|
`;
|
|
26
30
|
exports.TRAJECTORY_ACCURACY_PROMPT = `You are an expert data labeler.
|
|
27
31
|
Your task is to grade the accuracy of an AI agent's internal trajectory.
|
|
@@ -42,27 +46,16 @@ Grade the following trajectory:
|
|
|
42
46
|
|
|
43
47
|
<trajectory>
|
|
44
48
|
{outputs}
|
|
45
|
-
</trajectory
|
|
46
|
-
{inputs}
|
|
47
|
-
`;
|
|
49
|
+
</trajectory>`;
|
|
48
50
|
function _formatInputs(params) {
|
|
49
|
-
const {
|
|
51
|
+
const { outputs, referenceOutputs } = params;
|
|
50
52
|
const normalizedOutputs = (0, utils_js_1._normalizeToOpenAIMessagesList)(outputs);
|
|
51
53
|
const normalizedReferenceOutputs = (0, utils_js_1._normalizeToOpenAIMessagesList)(referenceOutputs ?? []);
|
|
52
54
|
const formattedReferenceOutputs = normalizedReferenceOutputs
|
|
53
|
-
?
|
|
55
|
+
? (0, utils_js_2._chatCompletionMessagesToString)(normalizedReferenceOutputs)
|
|
54
56
|
: "";
|
|
55
|
-
const
|
|
56
|
-
|
|
57
|
-
: "";
|
|
58
|
-
const formattedOutputs = typeof outputs === "object" && !Array.isArray(outputs)
|
|
59
|
-
? outputs
|
|
60
|
-
: (0, utils_js_2._chatCompletionMessagesToString)(normalizedOutputs);
|
|
61
|
-
return [
|
|
62
|
-
formattedOutputs,
|
|
63
|
-
formattedReferenceOutputs,
|
|
64
|
-
formattedInputs,
|
|
65
|
-
];
|
|
57
|
+
const formattedOutputs = (0, utils_js_2._chatCompletionMessagesToString)(normalizedOutputs);
|
|
58
|
+
return [formattedOutputs, formattedReferenceOutputs];
|
|
66
59
|
}
|
|
67
60
|
/**
|
|
68
61
|
* Creates an evaluator that uses an LLM to judge agent trajectories.
|
|
@@ -97,18 +90,14 @@ const createTrajectoryLLMAsJudge = ({ prompt = exports.TRAJECTORY_ACCURACY_PROMP
|
|
|
97
90
|
fewShotExamples,
|
|
98
91
|
});
|
|
99
92
|
const wrappedEvaluator = async ({ inputs, outputs, referenceOutputs, ...extra }) => {
|
|
100
|
-
const [formattedOutputs, formattedReferenceOutputs
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
inputs ? JSON.stringify(inputs) : "",
|
|
105
|
-
(0, utils_js_2._chatCompletionMessagesToString)((0, utils_js_1._normalizeToOpenAIMessagesList)(outputs)),
|
|
106
|
-
(0, utils_js_2._chatCompletionMessagesToString)((0, utils_js_1._normalizeToOpenAIMessagesList)(referenceOutputs)),
|
|
107
|
-
];
|
|
93
|
+
const [formattedOutputs, formattedReferenceOutputs] = _formatInputs({
|
|
94
|
+
outputs,
|
|
95
|
+
referenceOutputs,
|
|
96
|
+
});
|
|
108
97
|
return (0, utils_js_1._runEvaluator)(`llm_as_${feedbackKey}_judge`, scorer, feedbackKey, {
|
|
98
|
+
inputs,
|
|
109
99
|
outputs: formattedOutputs,
|
|
110
100
|
referenceOutputs: formattedReferenceOutputs,
|
|
111
|
-
inputs: formattedInputs,
|
|
112
101
|
...extra,
|
|
113
102
|
});
|
|
114
103
|
};
|
package/dist/trajectory/llm.d.ts
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import { BaseMessage } from "@langchain/core/messages";
|
|
2
2
|
import { ChatCompletionMessage, EvaluatorResult, TrajectoryLLMAsJudgeParams } from "../types.js";
|
|
3
|
-
export declare const TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE = "You are an expert data labeler.\nYour task is to grade the accuracy of an AI agent's internal trajectory.\n\n<Rubric>\n An accurate trajectory:\n - Makes logical sense between steps\n - Shows clear progression\n - Is relatively efficient, though it does not need to be perfectly efficient\n - Is semantically equivalent to the provided reference trajectory
|
|
4
|
-
export declare const TRAJECTORY_ACCURACY_PROMPT = "You are an expert data labeler.\nYour task is to grade the accuracy of an AI agent's internal trajectory.\n\n<Rubric>\n An accurate trajectory:\n - Makes logical sense between steps\n - Shows clear progression\n - Is relatively efficient, though it does not need to be perfectly efficient\n</Rubric>\n\nFirst, try to understand the goal of the trajectory by looking at the input\n(if the input is not present try to infer it from the content of the first message),\nas well as the output of the final message. Once you understand the goal, grade the trajectory\nas it relates to achieving that goal.\n\nGrade the following trajectory:\n\n<trajectory>\n{outputs}\n</trajectory
|
|
3
|
+
export declare const TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE = "You are an expert data labeler.\nYour task is to grade the accuracy of an AI agent's internal trajectory.\n\n<Rubric>\n An accurate trajectory:\n - Makes logical sense between steps\n - Shows clear progression\n - Is relatively efficient, though it does not need to be perfectly efficient\n - Is semantically equivalent to the provided reference trajectory\n</Rubric>\n\nBased on the following reference trajectory:\n\n<reference_trajectory>\n{reference_outputs}\n</reference_trajectory>\n\nGrade this actual trajectory:\n\n<trajectory>\n{outputs}\n</trajectory>\n";
|
|
4
|
+
export declare const TRAJECTORY_ACCURACY_PROMPT = "You are an expert data labeler.\nYour task is to grade the accuracy of an AI agent's internal trajectory.\n\n<Rubric>\n An accurate trajectory:\n - Makes logical sense between steps\n - Shows clear progression\n - Is relatively efficient, though it does not need to be perfectly efficient\n</Rubric>\n\nFirst, try to understand the goal of the trajectory by looking at the input\n(if the input is not present try to infer it from the content of the first message),\nas well as the output of the final message. Once you understand the goal, grade the trajectory\nas it relates to achieving that goal.\n\nGrade the following trajectory:\n\n<trajectory>\n{outputs}\n</trajectory>";
|
|
5
5
|
/**
|
|
6
6
|
* Creates an evaluator that uses an LLM to judge agent trajectories.
|
|
7
7
|
*
|
|
@@ -25,7 +25,6 @@ export declare const TRAJECTORY_ACCURACY_PROMPT = "You are an expert data labele
|
|
|
25
25
|
*/
|
|
26
26
|
export declare const createTrajectoryLLMAsJudge: ({ prompt, feedbackKey, model, system, judge, continuous, choices, useReasoning, fewShotExamples, }: TrajectoryLLMAsJudgeParams) => ({ inputs, outputs, referenceOutputs, ...extra }: {
|
|
27
27
|
[key: string]: unknown;
|
|
28
|
-
inputs?: Record<string, any> | undefined;
|
|
29
28
|
outputs: ChatCompletionMessage[] | BaseMessage[] | {
|
|
30
29
|
messages: (BaseMessage | ChatCompletionMessage)[];
|
|
31
30
|
};
|
package/dist/trajectory/llm.js
CHANGED
|
@@ -9,16 +9,20 @@ Your task is to grade the accuracy of an AI agent's internal trajectory.
|
|
|
9
9
|
- Makes logical sense between steps
|
|
10
10
|
- Shows clear progression
|
|
11
11
|
- Is relatively efficient, though it does not need to be perfectly efficient
|
|
12
|
-
- Is semantically equivalent to the provided reference trajectory
|
|
12
|
+
- Is semantically equivalent to the provided reference trajectory
|
|
13
13
|
</Rubric>
|
|
14
14
|
|
|
15
|
-
|
|
15
|
+
Based on the following reference trajectory:
|
|
16
|
+
|
|
17
|
+
<reference_trajectory>
|
|
18
|
+
{reference_outputs}
|
|
19
|
+
</reference_trajectory>
|
|
20
|
+
|
|
21
|
+
Grade this actual trajectory:
|
|
16
22
|
|
|
17
23
|
<trajectory>
|
|
18
24
|
{outputs}
|
|
19
25
|
</trajectory>
|
|
20
|
-
{inputs}
|
|
21
|
-
{reference_outputs}
|
|
22
26
|
`;
|
|
23
27
|
export const TRAJECTORY_ACCURACY_PROMPT = `You are an expert data labeler.
|
|
24
28
|
Your task is to grade the accuracy of an AI agent's internal trajectory.
|
|
@@ -39,27 +43,16 @@ Grade the following trajectory:
|
|
|
39
43
|
|
|
40
44
|
<trajectory>
|
|
41
45
|
{outputs}
|
|
42
|
-
</trajectory
|
|
43
|
-
{inputs}
|
|
44
|
-
`;
|
|
46
|
+
</trajectory>`;
|
|
45
47
|
function _formatInputs(params) {
|
|
46
|
-
const {
|
|
48
|
+
const { outputs, referenceOutputs } = params;
|
|
47
49
|
const normalizedOutputs = _normalizeToOpenAIMessagesList(outputs);
|
|
48
50
|
const normalizedReferenceOutputs = _normalizeToOpenAIMessagesList(referenceOutputs ?? []);
|
|
49
51
|
const formattedReferenceOutputs = normalizedReferenceOutputs
|
|
50
|
-
?
|
|
52
|
+
? _chatCompletionMessagesToString(normalizedReferenceOutputs)
|
|
51
53
|
: "";
|
|
52
|
-
const
|
|
53
|
-
|
|
54
|
-
: "";
|
|
55
|
-
const formattedOutputs = typeof outputs === "object" && !Array.isArray(outputs)
|
|
56
|
-
? outputs
|
|
57
|
-
: _chatCompletionMessagesToString(normalizedOutputs);
|
|
58
|
-
return [
|
|
59
|
-
formattedOutputs,
|
|
60
|
-
formattedReferenceOutputs,
|
|
61
|
-
formattedInputs,
|
|
62
|
-
];
|
|
54
|
+
const formattedOutputs = _chatCompletionMessagesToString(normalizedOutputs);
|
|
55
|
+
return [formattedOutputs, formattedReferenceOutputs];
|
|
63
56
|
}
|
|
64
57
|
/**
|
|
65
58
|
* Creates an evaluator that uses an LLM to judge agent trajectories.
|
|
@@ -94,18 +87,14 @@ export const createTrajectoryLLMAsJudge = ({ prompt = TRAJECTORY_ACCURACY_PROMPT
|
|
|
94
87
|
fewShotExamples,
|
|
95
88
|
});
|
|
96
89
|
const wrappedEvaluator = async ({ inputs, outputs, referenceOutputs, ...extra }) => {
|
|
97
|
-
const [formattedOutputs, formattedReferenceOutputs
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
inputs ? JSON.stringify(inputs) : "",
|
|
102
|
-
_chatCompletionMessagesToString(_normalizeToOpenAIMessagesList(outputs)),
|
|
103
|
-
_chatCompletionMessagesToString(_normalizeToOpenAIMessagesList(referenceOutputs)),
|
|
104
|
-
];
|
|
90
|
+
const [formattedOutputs, formattedReferenceOutputs] = _formatInputs({
|
|
91
|
+
outputs,
|
|
92
|
+
referenceOutputs,
|
|
93
|
+
});
|
|
105
94
|
return _runEvaluator(`llm_as_${feedbackKey}_judge`, scorer, feedbackKey, {
|
|
95
|
+
inputs,
|
|
106
96
|
outputs: formattedOutputs,
|
|
107
97
|
referenceOutputs: formattedReferenceOutputs,
|
|
108
|
-
inputs: formattedInputs,
|
|
109
98
|
...extra,
|
|
110
99
|
});
|
|
111
100
|
};
|