agentevals 0.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +19 -0
- package/README.md +1 -0
- package/dist/evaluators/exact.cjs +23 -0
- package/dist/evaluators/exact.d.ts +10 -0
- package/dist/evaluators/exact.js +19 -0
- package/dist/evaluators/llm.cjs +284 -0
- package/dist/evaluators/llm.d.ts +73 -0
- package/dist/evaluators/llm.js +279 -0
- package/dist/evaluators/prompts/conciseness.cjs +42 -0
- package/dist/evaluators/prompts/conciseness.d.ts +1 -0
- package/dist/evaluators/prompts/conciseness.js +39 -0
- package/dist/evaluators/prompts/correctness.cjs +46 -0
- package/dist/evaluators/prompts/correctness.d.ts +1 -0
- package/dist/evaluators/prompts/correctness.js +43 -0
- package/dist/evaluators/prompts/hallucination.cjs +46 -0
- package/dist/evaluators/prompts/hallucination.d.ts +1 -0
- package/dist/evaluators/prompts/hallucination.js +43 -0
- package/dist/evaluators/string/embedding_similarity.cjs +49 -0
- package/dist/evaluators/string/embedding_similarity.d.ts +18 -0
- package/dist/evaluators/string/embedding_similarity.js +45 -0
- package/dist/evaluators/string/levenshtein.cjs +57 -0
- package/dist/evaluators/string/levenshtein.d.ts +11 -0
- package/dist/evaluators/string/levenshtein.js +53 -0
- package/dist/evaluators/trajectory/llm.cjs +86 -0
- package/dist/evaluators/trajectory/llm.d.ts +49 -0
- package/dist/evaluators/trajectory/llm.js +82 -0
- package/dist/evaluators/trajectory/strict.cjs +58 -0
- package/dist/evaluators/trajectory/strict.d.ts +10 -0
- package/dist/evaluators/trajectory/strict.js +54 -0
- package/dist/evaluators/trajectory/subset.cjs +32 -0
- package/dist/evaluators/trajectory/subset.d.ts +23 -0
- package/dist/evaluators/trajectory/subset.js +28 -0
- package/dist/evaluators/trajectory/superset.cjs +32 -0
- package/dist/evaluators/trajectory/superset.d.ts +23 -0
- package/dist/evaluators/trajectory/superset.js +28 -0
- package/dist/evaluators/trajectory/unordered.cjs +33 -0
- package/dist/evaluators/trajectory/unordered.d.ts +23 -0
- package/dist/evaluators/trajectory/unordered.js +29 -0
- package/dist/evaluators/trajectory/utils.cjs +68 -0
- package/dist/evaluators/trajectory/utils.d.ts +3 -0
- package/dist/evaluators/trajectory/utils.js +63 -0
- package/dist/evaluators/types.cjs +2 -0
- package/dist/evaluators/types.d.ts +44 -0
- package/dist/evaluators/types.js +1 -0
- package/dist/evaluators/utils.cjs +85 -0
- package/dist/evaluators/utils.d.ts +13 -0
- package/dist/evaluators/utils.js +78 -0
- package/dist/index.cjs +43 -0
- package/dist/index.d.ts +13 -0
- package/dist/index.js +13 -0
- package/index.cjs +1 -0
- package/index.d.cts +1 -0
- package/index.d.ts +1 -0
- package/index.js +1 -0
- package/package.json +60 -0
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.levenshteinDistance = void 0;
const utils_js_1 = require("../utils.cjs");
/**
 * Computes a normalized Levenshtein similarity score between two values.
 * Non-string inputs are serialized with JSON.stringify before comparison.
 * @param {unknown} outputs - Actual output value.
 * @param {unknown} referenceOutputs - Reference output value.
 * @returns {number} Score in [0, 1]; 1.0 means the strings are identical.
 * @throws {Error} If either input is null or undefined.
 */
function scorer(outputs, referenceOutputs) {
    // == null rejects undefined as well as null; previously undefined slipped
    // past the === null check and crashed on JSON.stringify(undefined).length
    // with a TypeError instead of the intended Error below.
    if (outputs == null || referenceOutputs == null) {
        throw new Error("Levenshtein distance requires both outputs and reference_outputs");
    }
    // Convert to strings if needed
    const outputStr = typeof outputs === "string" ? outputs : JSON.stringify(outputs);
    const referenceStr = typeof referenceOutputs === "string"
        ? referenceOutputs
        : JSON.stringify(referenceOutputs);
    // Standard Wagner-Fischer DP matrix of size (m+1)x(n+1).
    const m = outputStr.length;
    const n = referenceStr.length;
    const dp = Array(m + 1)
        .fill(null)
        .map(() => Array(n + 1).fill(0));
    // First row/column: distance from the empty string is the prefix length.
    for (let i = 0; i <= m; i++) {
        dp[i][0] = i;
    }
    for (let j = 0; j <= n; j++) {
        dp[0][j] = j;
    }
    // Fill the matrix
    for (let i = 1; i <= m; i++) {
        for (let j = 1; j <= n; j++) {
            if (outputStr[i - 1] === referenceStr[j - 1]) {
                dp[i][j] = dp[i - 1][j - 1];
            }
            else {
                dp[i][j] = Math.min(dp[i - 1][j] + 1, // deletion
                dp[i][j - 1] + 1, // insertion
                dp[i - 1][j - 1] + 1 // substitution
                );
            }
        }
    }
    // Normalize the raw edit distance to a similarity score between 0 and 1.
    const distance = dp[m][n];
    const maxLength = Math.max(m, n);
    return maxLength > 0 ? 1.0 - distance / maxLength : 1.0;
}
/**
 * Evaluates the actual output and reference output for similarity by Levenshtein distance.
 * @param params - The options object containing outputs and reference outputs
 * @returns EvaluatorResult containing match result with score between 0.0 and 1.0,
 * where 1.0 indicates an exact match and lower values indicate greater differences
 */
async function levenshteinDistance(params) {
    const { outputs, referenceOutputs } = params;
    const getScore = () => scorer(outputs, referenceOutputs);
    return (0, utils_js_1._runEvaluator)("levenshtein_distance", getScore, "levenshtein_distance");
}
exports.levenshteinDistance = levenshteinDistance;
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import { EvaluatorResult } from "../types.js";
/**
 * Evaluates the actual output and reference output for similarity by Levenshtein distance.
 * Non-string values are JSON-serialized before comparison.
 * @param params - The options object containing `outputs` and `referenceOutputs`
 * @returns EvaluatorResult containing match result with score between 0.0 and 1.0,
 * where 1.0 indicates an exact match and lower values indicate greater differences
 */
export declare function levenshteinDistance(params: {
    outputs: unknown;
    referenceOutputs: unknown;
}): Promise<EvaluatorResult>;
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
import { _runEvaluator } from "../utils.js";
|
|
2
|
+
/**
 * Computes a normalized Levenshtein similarity score between two values.
 * Non-string inputs are serialized with JSON.stringify before comparison.
 * @param {unknown} outputs - Actual output value.
 * @param {unknown} referenceOutputs - Reference output value.
 * @returns {number} Score in [0, 1]; 1.0 means the strings are identical.
 * @throws {Error} If either input is null or undefined.
 */
function scorer(outputs, referenceOutputs) {
    // == null rejects undefined as well as null; previously undefined slipped
    // past the === null check and crashed on JSON.stringify(undefined).length
    // with a TypeError instead of the intended Error below.
    if (outputs == null || referenceOutputs == null) {
        throw new Error("Levenshtein distance requires both outputs and reference_outputs");
    }
    // Convert to strings if needed
    const outputStr = typeof outputs === "string" ? outputs : JSON.stringify(outputs);
    const referenceStr = typeof referenceOutputs === "string"
        ? referenceOutputs
        : JSON.stringify(referenceOutputs);
    // Standard Wagner-Fischer DP matrix of size (m+1)x(n+1).
    const m = outputStr.length;
    const n = referenceStr.length;
    const dp = Array(m + 1)
        .fill(null)
        .map(() => Array(n + 1).fill(0));
    // First row/column: distance from the empty string is the prefix length.
    for (let i = 0; i <= m; i++) {
        dp[i][0] = i;
    }
    for (let j = 0; j <= n; j++) {
        dp[0][j] = j;
    }
    // Fill the matrix
    for (let i = 1; i <= m; i++) {
        for (let j = 1; j <= n; j++) {
            if (outputStr[i - 1] === referenceStr[j - 1]) {
                dp[i][j] = dp[i - 1][j - 1];
            }
            else {
                dp[i][j] = Math.min(dp[i - 1][j] + 1, // deletion
                dp[i][j - 1] + 1, // insertion
                dp[i - 1][j - 1] + 1 // substitution
                );
            }
        }
    }
    // Normalize the raw edit distance to a similarity score between 0 and 1.
    const distance = dp[m][n];
    const maxLength = Math.max(m, n);
    return maxLength > 0 ? 1.0 - distance / maxLength : 1.0;
}
|
|
43
|
+
/**
 * Evaluates the actual output and reference output for similarity by Levenshtein distance.
 * @param params - The options object containing outputs and reference outputs
 * @returns EvaluatorResult containing match result with score between 0.0 and 1.0,
 * where 1.0 indicates an exact match and lower values indicate greater differences
 */
export async function levenshteinDistance(params) {
    // Defer scoring to the evaluator runner; the closure captures both values.
    const computeScore = () => scorer(params.outputs, params.referenceOutputs);
    return _runEvaluator("levenshtein_distance", computeScore, "levenshtein_distance");
}
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.createTrajectoryLLMAsJudge = exports.DEFAULT_PROMPT = void 0;
|
|
4
|
+
const llm_js_1 = require("../llm.cjs");
|
|
5
|
+
const utils_js_1 = require("../utils.cjs");
|
|
6
|
+
const utils_js_2 = require("../utils.cjs");
|
|
7
|
+
const utils_js_3 = require("./utils.cjs");
|
|
8
|
+
exports.DEFAULT_PROMPT = `Grade the following agent trajectory:
|
|
9
|
+
|
|
10
|
+
<trajectory>
|
|
11
|
+
{outputs}
|
|
12
|
+
</trajectory>
|
|
13
|
+
{inputs}
|
|
14
|
+
{reference_outputs}
|
|
15
|
+
{rubric}
|
|
16
|
+
`;
|
|
17
|
+
function _formatInputs(params) {
|
|
18
|
+
const { inputs, outputs, referenceOutputs, rubric } = params;
|
|
19
|
+
const normalizedOutputs = (0, utils_js_2._normalizeToOpenAIMessagesList)(outputs);
|
|
20
|
+
const normalizedReferenceOutputs = (0, utils_js_2._normalizeToOpenAIMessagesList)(referenceOutputs ?? []);
|
|
21
|
+
const formattedReferenceOutputs = normalizedReferenceOutputs
|
|
22
|
+
? `\nUse the following trajectory as an example reference when grading:\n<reference_trajectory>\n${(0, utils_js_3._chatCompletionMessagesToString)(normalizedReferenceOutputs)}\n</reference_trajectory>\n`
|
|
23
|
+
: "";
|
|
24
|
+
const formattedInputs = inputs
|
|
25
|
+
? `\nThe agent generated the trajectory from the following input:\n<input>\n${JSON.stringify(inputs)}\n</input>\n`
|
|
26
|
+
: "";
|
|
27
|
+
const formattedOutputs = typeof outputs === "object" && !Array.isArray(outputs)
|
|
28
|
+
? outputs
|
|
29
|
+
: (0, utils_js_3._chatCompletionMessagesToString)(normalizedOutputs);
|
|
30
|
+
const formattedRubric = rubric
|
|
31
|
+
? `\nGrade the agent trajectory along the following rubric:\n<rubric>\n${rubric}\n</rubric>\n`
|
|
32
|
+
: "";
|
|
33
|
+
return [
|
|
34
|
+
formattedOutputs,
|
|
35
|
+
formattedReferenceOutputs,
|
|
36
|
+
formattedInputs,
|
|
37
|
+
formattedRubric,
|
|
38
|
+
];
|
|
39
|
+
}
|
|
40
|
+
/**
|
|
41
|
+
* Creates an evaluator that uses an LLM to judge agent trajectories.
|
|
42
|
+
*
|
|
43
|
+
* @param options - Configuration options
|
|
44
|
+
* @param options.prompt - The evaluation prompt. Can be a string template, LangChain prompt template,
|
|
45
|
+
* or callable that returns a list of chat messages. Note that the default prompt
|
|
46
|
+
* allows a rubric in addition to the typical "inputs", "outputs", and
|
|
47
|
+
* "reference_outputs" parameters.
|
|
48
|
+
* @param options.feedbackKey - Key used to store the evaluation result. Defaults to "trajectory_accuracy".
|
|
49
|
+
* @param options.model - Model identifier to use. Defaults to "openai:o3-mini". If judge is an OpenAI client,
|
|
50
|
+
* this should be a model name directly. If judge is omitted, must be a valid
|
|
51
|
+
* LangChain model identifier.
|
|
52
|
+
* @param options.system - Optional system message to prepend to the prompt.
|
|
53
|
+
* @param options.judge - The LLM used for evaluation. Can be an OpenAI client or a LangChainLikeModel.
|
|
54
|
+
* If an OpenAI client, must specify "model" as well. If omitted, "model" will be
|
|
55
|
+
* used to instantiate a LangChain model instance by model string.
|
|
56
|
+
* @param options.continuous - If true, score will be a float between 0 and 1. If false, score will be boolean.
|
|
57
|
+
* Defaults to false.
|
|
58
|
+
* @param options.choices - Optional list of specific float values the score must be chosen from.
|
|
59
|
+
* @param options.useReasoning - If true, includes explanation for the score in the output. Defaults to true.
|
|
60
|
+
* @param options.fewShotExamples - Optional list of example evaluations to append to the prompt.
|
|
61
|
+
* @returns A function that evaluates agent trajectories using the configured LLM judge.
|
|
62
|
+
*/
|
|
63
|
+
const createTrajectoryLLMAsJudge = ({ prompt = exports.DEFAULT_PROMPT, feedbackKey = "trajectory_accuracy", model = "openai:o3-mini", system, judge, continuous = false, choices, useReasoning = true, fewShotExamples, }) => {
|
|
64
|
+
const scorer = (0, llm_js_1._createLLMAsJudgeScorer)({
|
|
65
|
+
prompt,
|
|
66
|
+
judge,
|
|
67
|
+
model,
|
|
68
|
+
system,
|
|
69
|
+
continuous,
|
|
70
|
+
choices,
|
|
71
|
+
useReasoning,
|
|
72
|
+
fewShotExamples,
|
|
73
|
+
});
|
|
74
|
+
const wrappedEvaluator = async ({ inputs, outputs, referenceOutputs, rubric, ...extra }) => {
|
|
75
|
+
const [formattedOutputs, formattedReferenceOutputs, formattedInputs, formattedRubric,] = _formatInputs({ inputs, outputs, referenceOutputs, rubric });
|
|
76
|
+
return (0, utils_js_1._runEvaluator)(`llm_as_${feedbackKey}_judge`, scorer, feedbackKey, {
|
|
77
|
+
outputs: formattedOutputs,
|
|
78
|
+
referenceOutputs: formattedReferenceOutputs,
|
|
79
|
+
inputs: formattedInputs,
|
|
80
|
+
rubric: formattedRubric,
|
|
81
|
+
...extra,
|
|
82
|
+
});
|
|
83
|
+
};
|
|
84
|
+
return wrappedEvaluator;
|
|
85
|
+
};
|
|
86
|
+
exports.createTrajectoryLLMAsJudge = createTrajectoryLLMAsJudge;
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
import { ChatCompletionMessage, EvaluatorResult, FewShotExample, ModelClient } from "../types.js";
import { BaseMessage } from "@langchain/core/messages";
import { BaseChatModel } from "@langchain/core/language_models/chat_models";
import { RunnableInterface } from "@langchain/core/runnables";
/** Default judge prompt; {outputs}, {inputs}, {reference_outputs}, and {rubric} are substituted at evaluation time. */
export declare const DEFAULT_PROMPT = "Grade the following agent trajectory:\n\n<trajectory>\n{outputs}\n</trajectory>\n{inputs}\n{reference_outputs}\n{rubric}\n";
/**
 * Creates an evaluator that uses an LLM to judge agent trajectories.
 *
 * @param options - Configuration options
 * @param options.prompt - The evaluation prompt. Can be a string template, LangChain prompt template,
 *   or callable that returns a list of chat messages. Note that the default prompt
 *   allows a rubric in addition to the typical "inputs", "outputs", and
 *   "reference_outputs" parameters.
 * @param options.feedbackKey - Key used to store the evaluation result. Defaults to "trajectory_accuracy".
 * @param options.model - Model identifier to use. Defaults to "openai:o3-mini". If judge is an OpenAI client,
 *   this should be a model name directly. If judge is omitted, must be a valid
 *   LangChain model identifier.
 * @param options.system - Optional system message to prepend to the prompt.
 * @param options.judge - The LLM used for evaluation. Can be an OpenAI client or a LangChainLikeModel.
 *   If an OpenAI client, must specify "model" as well. If omitted, "model" will be
 *   used to instantiate a LangChain model instance by model string.
 * @param options.continuous - If true, score will be a float between 0 and 1. If false, score will be boolean.
 *   Defaults to false.
 * @param options.choices - Optional list of specific float values the score must be chosen from.
 * @param options.useReasoning - If true, includes explanation for the score in the output. Defaults to true.
 * @param options.fewShotExamples - Optional list of example evaluations to append to the prompt.
 * @returns A function that evaluates agent trajectories using the configured LLM judge.
 */
export declare const createTrajectoryLLMAsJudge: ({ prompt, feedbackKey, model, system, judge, continuous, choices, useReasoning, fewShotExamples, }: {
    prompt?: string | RunnableInterface<any, any, import("@langchain/core/runnables").RunnableConfig<Record<string, any>>> | ((...args: unknown[]) => ChatCompletionMessage[] | Promise<ChatCompletionMessage[]>) | undefined;
    feedbackKey?: string | undefined;
    model?: string | undefined;
    system?: string | undefined;
    judge?: ModelClient | BaseChatModel<import("@langchain/core/language_models/chat_models").BaseChatModelCallOptions, import("@langchain/core/messages").AIMessageChunk> | undefined;
    continuous?: boolean | undefined;
    choices?: number[] | undefined;
    useReasoning?: boolean | undefined;
    fewShotExamples?: FewShotExample[] | undefined;
}) => ({ inputs, outputs, referenceOutputs, rubric, ...extra }: {
    [key: string]: unknown;
    inputs?: Record<string, any> | undefined;
    outputs: ChatCompletionMessage[] | BaseMessage[] | {
        messages: (BaseMessage | ChatCompletionMessage)[];
    };
    referenceOutputs?: BaseMessage[] | ChatCompletionMessage[] | {
        messages: (BaseMessage | ChatCompletionMessage)[];
    } | undefined;
    rubric?: string | undefined;
}) => Promise<EvaluatorResult>;
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
import { _createLLMAsJudgeScorer } from "../llm.js";
|
|
2
|
+
import { _runEvaluator } from "../utils.js";
|
|
3
|
+
import { _normalizeToOpenAIMessagesList } from "../utils.js";
|
|
4
|
+
import { _chatCompletionMessagesToString } from "./utils.js";
|
|
5
|
+
export const DEFAULT_PROMPT = `Grade the following agent trajectory:

<trajectory>
{outputs}
</trajectory>
{inputs}
{reference_outputs}
{rubric}
`;
/**
 * Formats evaluator params into the [outputs, referenceOutputs, inputs, rubric]
 * fragments substituted into the judge prompt template.
 * @param params - Object with optional inputs, outputs, referenceOutputs, and rubric.
 * @returns Tuple of [formattedOutputs, formattedReferenceOutputs, formattedInputs, formattedRubric].
 */
function _formatInputs(params) {
    const { inputs, outputs, referenceOutputs, rubric } = params;
    const normalizedOutputs = _normalizeToOpenAIMessagesList(outputs);
    const normalizedReferenceOutputs = _normalizeToOpenAIMessagesList(referenceOutputs ?? []);
    // Test .length rather than the array itself: an empty array is truthy, so
    // checking the array alone emitted an empty <reference_trajectory> section
    // whenever no reference trajectory was supplied.
    const formattedReferenceOutputs = normalizedReferenceOutputs?.length
        ? `\nUse the following trajectory as an example reference when grading:\n<reference_trajectory>\n${_chatCompletionMessagesToString(normalizedReferenceOutputs)}\n</reference_trajectory>\n`
        : "";
    const formattedInputs = inputs
        ? `\nThe agent generated the trajectory from the following input:\n<input>\n${JSON.stringify(inputs)}\n</input>\n`
        : "";
    // Non-array objects are passed through untouched; message lists are
    // flattened to a single string for the prompt.
    const formattedOutputs = typeof outputs === "object" && !Array.isArray(outputs)
        ? outputs
        : _chatCompletionMessagesToString(normalizedOutputs);
    const formattedRubric = rubric
        ? `\nGrade the agent trajectory along the following rubric:\n<rubric>\n${rubric}\n</rubric>\n`
        : "";
    return [
        formattedOutputs,
        formattedReferenceOutputs,
        formattedInputs,
        formattedRubric,
    ];
}
|
|
37
|
+
/**
 * Creates an evaluator that uses an LLM to judge agent trajectories.
 *
 * @param options - Configuration options
 * @param options.prompt - The evaluation prompt. Can be a string template, LangChain prompt template,
 *   or callable that returns a list of chat messages. Note that the default prompt
 *   allows a rubric in addition to the typical "inputs", "outputs", and
 *   "reference_outputs" parameters.
 * @param options.feedbackKey - Key used to store the evaluation result. Defaults to "trajectory_accuracy".
 * @param options.model - Model identifier to use. Defaults to "openai:o3-mini". If judge is an OpenAI client,
 *   this should be a model name directly. If judge is omitted, must be a valid
 *   LangChain model identifier.
 * @param options.system - Optional system message to prepend to the prompt.
 * @param options.judge - The LLM used for evaluation. Can be an OpenAI client or a LangChainLikeModel.
 *   If an OpenAI client, must specify "model" as well. If omitted, "model" will be
 *   used to instantiate a LangChain model instance by model string.
 * @param options.continuous - If true, score will be a float between 0 and 1. If false, score will be boolean.
 *   Defaults to false.
 * @param options.choices - Optional list of specific float values the score must be chosen from.
 * @param options.useReasoning - If true, includes explanation for the score in the output. Defaults to true.
 * @param options.fewShotExamples - Optional list of example evaluations to append to the prompt.
 * @returns A function that evaluates agent trajectories using the configured LLM judge.
 */
export const createTrajectoryLLMAsJudge = ({ prompt = DEFAULT_PROMPT, feedbackKey = "trajectory_accuracy", model = "openai:o3-mini", system, judge, continuous = false, choices, useReasoning = true, fewShotExamples, }) => {
    // Build the underlying LLM scorer once; it is shared by every invocation
    // of the returned evaluator.
    const scorer = _createLLMAsJudgeScorer({
        prompt,
        judge,
        model,
        system,
        continuous,
        choices,
        useReasoning,
        fewShotExamples,
    });
    // The returned evaluator formats the trajectory into prompt fragments and
    // delegates scoring to the shared runner.
    return async ({ inputs, outputs, referenceOutputs, rubric, ...extra }) => {
        const [formattedOutputs, formattedReferenceOutputs, formattedInputs, formattedRubric] = _formatInputs({ inputs, outputs, referenceOutputs, rubric });
        return _runEvaluator(`llm_as_${feedbackKey}_judge`, scorer, feedbackKey, {
            outputs: formattedOutputs,
            referenceOutputs: formattedReferenceOutputs,
            inputs: formattedInputs,
            rubric: formattedRubric,
            ...extra,
        });
    };
};
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.trajectoryStrictMatch = void 0;
const utils_js_1 = require("../utils.cjs");
/**
 * Strict trajectory comparison: true only if both message lists have the same
 * length and every aligned pair agrees on role, on whether tool calls are
 * present, and on the ordered tool-call function names.
 * @param params - Object with outputs and referenceOutputs trajectories.
 * @returns {boolean} true when the trajectories match strictly.
 * @throws {Error} If either normalized trajectory is missing.
 */
function _scorer(params) {
    const { outputs, referenceOutputs } = params;
    const normalizedOutputs = (0, utils_js_1._normalizeToOpenAIMessagesList)(outputs);
    const normalizedReferenceOutputs = (0, utils_js_1._normalizeToOpenAIMessagesList)(referenceOutputs);
    if (!normalizedOutputs || !normalizedReferenceOutputs) {
        throw new Error("Strict trajectory match requires both outputs and reference_outputs");
    }
    if (normalizedOutputs.length !== normalizedReferenceOutputs.length) {
        return false;
    }
    // Return false at the first mismatch. The previous version set a flag and
    // used bare `break`, which only exited the inner tool-call loop and kept
    // scanning the remaining messages after the result was already decided.
    for (let i = 0; i < normalizedOutputs.length; i++) {
        const output = normalizedOutputs[i];
        const referenceOutput = normalizedReferenceOutputs[i];
        if (output.role !== referenceOutput.role) {
            return false;
        }
        const outputHasToolCalls = output.tool_calls != null;
        const referenceHasToolCalls = referenceOutput.tool_calls != null;
        if (outputHasToolCalls !== referenceHasToolCalls) {
            return false;
        }
        if (outputHasToolCalls) {
            if (output.tool_calls.length !== referenceOutput.tool_calls.length) {
                return false;
            }
            for (let j = 0; j < output.tool_calls.length; j++) {
                if (output.tool_calls[j].function.name !==
                    referenceOutput.tool_calls[j].function.name) {
                    return false;
                }
            }
        }
    }
    return true;
}
/**
 * Evaluate whether an input agent trajectory and called tools strictly matches a reference trajectory.
 * This means that at each step, the agent called the same tools in the same order as specified in the reference trajectory.
 *
 * @param outputs - Actual trajectory the agent followed. May be a list of OpenAI messages,
 * a list of LangChain messages, or a dictionary containing a "messages" key with one of the above.
 * @param referenceOutputs - Ideal reference trajectory the agent should have followed. May be a list of OpenAI messages,
 * a list of LangChain messages, or a dictionary containing a "messages" key with one of the above.
 * @returns EvaluatorResult containing a score of true if trajectory (including called tools) matches, false otherwise
 */
async function trajectoryStrictMatch(params) {
    return (0, utils_js_1._runEvaluator)("trajectory_strict_match", _scorer, "trajectory_strict_match", params);
}
exports.trajectoryStrictMatch = trajectoryStrictMatch;
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
import { BaseMessage } from "@langchain/core/messages";
import { ChatCompletionMessage, EvaluatorResult } from "../types.js";
/**
 * Evaluate whether an input agent trajectory and called tools strictly matches a reference trajectory.
 * At each step the agent must have the same message role and have called the same
 * tools, in the same order, as the reference trajectory.
 * @param params.outputs - Actual trajectory the agent followed. May be a list of OpenAI messages,
 * a list of LangChain messages, or a dictionary containing a "messages" key with one of the above.
 * @param params.referenceOutputs - Ideal reference trajectory the agent should have followed.
 * May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing
 * a "messages" key with one of the above.
 * @returns EvaluatorResult containing a score of true if the trajectory (including called tools) matches, false otherwise.
 */
export declare function trajectoryStrictMatch(params: {
    outputs: ChatCompletionMessage[] | BaseMessage[] | {
        messages: (BaseMessage | ChatCompletionMessage)[];
    };
    referenceOutputs: ChatCompletionMessage[] | BaseMessage[] | {
        messages: (BaseMessage | ChatCompletionMessage)[];
    };
}): Promise<EvaluatorResult>;
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
import { _normalizeToOpenAIMessagesList, _runEvaluator } from "../utils.js";
|
|
2
|
+
/**
 * Strict trajectory comparison: true only if both message lists have the same
 * length and every aligned pair agrees on role, on whether tool calls are
 * present, and on the ordered tool-call function names.
 * @param params - Object with outputs and referenceOutputs trajectories.
 * @returns {boolean} true when the trajectories match strictly.
 * @throws {Error} If either normalized trajectory is missing.
 */
function _scorer(params) {
    const { outputs, referenceOutputs } = params;
    const normalizedOutputs = _normalizeToOpenAIMessagesList(outputs);
    const normalizedReferenceOutputs = _normalizeToOpenAIMessagesList(referenceOutputs);
    if (!normalizedOutputs || !normalizedReferenceOutputs) {
        throw new Error("Strict trajectory match requires both outputs and reference_outputs");
    }
    if (normalizedOutputs.length !== normalizedReferenceOutputs.length) {
        return false;
    }
    // Return false at the first mismatch. The previous version set a flag and
    // used bare `break`, which only exited the inner tool-call loop and kept
    // scanning the remaining messages after the result was already decided.
    for (let i = 0; i < normalizedOutputs.length; i++) {
        const output = normalizedOutputs[i];
        const referenceOutput = normalizedReferenceOutputs[i];
        if (output.role !== referenceOutput.role) {
            return false;
        }
        const outputHasToolCalls = output.tool_calls != null;
        const referenceHasToolCalls = referenceOutput.tool_calls != null;
        if (outputHasToolCalls !== referenceHasToolCalls) {
            return false;
        }
        if (outputHasToolCalls) {
            if (output.tool_calls.length !== referenceOutput.tool_calls.length) {
                return false;
            }
            for (let j = 0; j < output.tool_calls.length; j++) {
                if (output.tool_calls[j].function.name !==
                    referenceOutput.tool_calls[j].function.name) {
                    return false;
                }
            }
        }
    }
    return true;
}
|
|
42
|
+
/**
 * Evaluate whether an input agent trajectory and called tools strictly matches a reference trajectory.
 * This means that at each step, the agent called the same tools in the same order as specified in the reference trajectory.
 *
 * @param outputs - Actual trajectory the agent followed. May be a list of OpenAI messages,
 * a list of LangChain messages, or a dictionary containing a "messages" key with one of the above.
 * @param referenceOutputs - Ideal reference trajectory the agent should have followed. May be a list of OpenAI messages,
 * a list of LangChain messages, or a dictionary containing a "messages" key with one of the above.
 * @returns EvaluatorResult containing a score of true if trajectory (including called tools) matches, false otherwise
 */
export async function trajectoryStrictMatch(params) {
    // JSDoc moved out of the function body so documentation tooling can see it.
    return _runEvaluator("trajectory_strict_match", _scorer, "trajectory_strict_match", params);
}
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.trajectorySubset = void 0;
const utils_js_1 = require("../utils.cjs");
const utils_js_2 = require("./utils.cjs");
/**
 * Evaluate whether an agent trajectory and called tools is a subset of a reference trajectory and called tools.
 * This means the agent called a subset of the tools specified in the reference trajectory.
 *
 * @param params - The parameters for trajectory subset evaluation
 * @param params.outputs - Actual trajectory the agent followed.
 *        May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing
 *        a "messages" key with one of the above.
 * @param params.referenceOutputs - Ideal reference trajectory the agent should have followed.
 *        May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing
 *        a "messages" key with one of the above.
 * @returns EvaluatorResult containing a score of true if trajectory (including called tools) matches, false otherwise
 */
async function trajectorySubset(params) {
    const outputsList = (0, utils_js_1._normalizeToOpenAIMessagesList)(params.outputs);
    const referenceOutputsList = (0, utils_js_1._normalizeToOpenAIMessagesList)(params.referenceOutputs);
    const computeScore = async () => {
        if (outputsList == null || referenceOutputsList == null) {
            throw new Error("Trajectory subset match requires both outputs and reference_outputs");
        }
        // outputs is a subset of reference exactly when reference is a superset
        // of outputs, so reuse the superset helper with swapped arguments.
        return (0, utils_js_2._isTrajectorySuperset)(referenceOutputsList, outputsList);
    };
    return (0, utils_js_1._runEvaluator)("trajectory_subset", computeScore, "trajectory_subset", params);
}
exports.trajectorySubset = trajectorySubset;
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
import { BaseMessage } from "@langchain/core/messages";
import { ChatCompletionMessage, EvaluatorResult } from "../types.js";
/**
 * Evaluate whether an agent trajectory and called tools is a subset of a reference trajectory and called tools.
 * This means the agent called a subset of the tools specified in the reference trajectory.
 *
 * @param params - The parameters for trajectory subset evaluation
 * @param params.outputs - Actual trajectory the agent followed.
 * May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing
 * a "messages" key with one of the above.
 * @param params.referenceOutputs - Ideal reference trajectory the agent should have followed.
 * May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing
 * a "messages" key with one of the above.
 * @returns EvaluatorResult containing a score of true if the trajectory (including called tools) is a subset, false otherwise
 */
export declare function trajectorySubset(params: {
    outputs: ChatCompletionMessage[] | BaseMessage[] | {
        messages: (BaseMessage | ChatCompletionMessage)[];
    };
    referenceOutputs: ChatCompletionMessage[] | BaseMessage[] | {
        messages: (BaseMessage | ChatCompletionMessage)[];
    };
}): Promise<EvaluatorResult>;
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
import { _normalizeToOpenAIMessagesList, _runEvaluator } from "../utils.js";
|
|
2
|
+
import { _isTrajectorySuperset } from "./utils.js";
|
|
3
|
+
/**
 * Evaluate whether an agent trajectory and called tools is a subset of a reference trajectory and called tools.
 * This means the agent called a subset of the tools specified in the reference trajectory.
 *
 * @param params - The parameters for trajectory subset evaluation
 * @param params.outputs - Actual trajectory the agent followed.
 *   May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing
 *   a "messages" key with one of the above.
 * @param params.referenceOutputs - Ideal reference trajectory the agent should have followed.
 *   May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing
 *   a "messages" key with one of the above.
 * @returns EvaluatorResult containing a score of true if the trajectory (including called tools)
 *   is a subset of the reference trajectory, false otherwise
 */
export async function trajectorySubset(params) {
    const { outputs, referenceOutputs } = params;
    const outputsList = _normalizeToOpenAIMessagesList(outputs);
    const referenceOutputsList = _normalizeToOpenAIMessagesList(referenceOutputs);
    const getScore = async () => {
        if (outputsList == null || referenceOutputsList == null) {
            throw new Error("Trajectory subset match requires both outputs and reference_outputs");
        }
        // "outputs ⊆ reference" is expressed by checking that the reference
        // trajectory is a superset of the actual trajectory.
        const isSubset = _isTrajectorySuperset(referenceOutputsList, outputsList);
        return isSubset;
    };
    return _runEvaluator("trajectory_subset", getScore, "trajectory_subset", params);
}
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.trajectorySuperset = void 0;
|
|
4
|
+
const utils_js_1 = require("../utils.cjs");
|
|
5
|
+
const utils_js_2 = require("./utils.cjs");
|
|
6
|
+
/**
 * Evaluate whether an agent trajectory and called tools is a superset of a reference trajectory and called tools.
 * This means the agent called a superset of the tools specified in the reference trajectory.
 *
 * @param params - The parameters for trajectory superset evaluation
 * @param params.outputs - Actual trajectory the agent followed.
 *   May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing
 *   a "messages" key with one of the above.
 * @param params.referenceOutputs - Ideal reference trajectory the agent should have followed.
 *   May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing
 *   a "messages" key with one of the above.
 * @returns EvaluatorResult containing a score of true if the trajectory (including called tools)
 *   is a superset of the reference trajectory, false otherwise
 */
async function trajectorySuperset(params) {
    const { outputs, referenceOutputs } = params;
    const outputsList = (0, utils_js_1._normalizeToOpenAIMessagesList)(outputs);
    const referenceOutputsList = (0, utils_js_1._normalizeToOpenAIMessagesList)(referenceOutputs);
    const getScore = async () => {
        if (outputsList == null || referenceOutputsList == null) {
            throw new Error("Trajectory superset match requires both outputs and reference_outputs");
        }
        // Direct superset check: the actual trajectory must contain at least
        // everything the reference trajectory contains.
        const isSuperset = (0, utils_js_2._isTrajectorySuperset)(outputsList, referenceOutputsList);
        return isSuperset;
    };
    return (0, utils_js_1._runEvaluator)("trajectory_superset", getScore, "trajectory_superset", params);
}
exports.trajectorySuperset = trajectorySuperset;
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
import { BaseMessage } from "@langchain/core/messages";
|
|
2
|
+
import { ChatCompletionMessage, EvaluatorResult } from "../types.js";
|
|
3
|
+
/**
|
|
4
|
+
* Evaluate whether an agent trajectory and called tools is a superset of a reference trajectory and called tools.
|
|
5
|
+
* This means the agent called a superset of the tools specified in the reference trajectory.
|
|
6
|
+
*
|
|
7
|
+
* @param params - The parameters for trajectory superset evaluation
|
|
8
|
+
* @param params.outputs - Actual trajectory the agent followed.
|
|
9
|
+
* May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing
|
|
10
|
+
* a "messages" key with one of the above.
|
|
11
|
+
* @param params.reference_outputs - Ideal reference trajectory the agent should have followed.
|
|
12
|
+
* May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing
|
|
13
|
+
* a "messages" key with one of the above.
|
|
14
|
+
* @returns EvaluatorResult containing a score of true if trajectory (including called tools) matches, false otherwise
|
|
15
|
+
*/
|
|
16
|
+
export declare function trajectorySuperset(params: {
|
|
17
|
+
outputs: ChatCompletionMessage[] | BaseMessage[] | {
|
|
18
|
+
messages: (BaseMessage | ChatCompletionMessage)[];
|
|
19
|
+
};
|
|
20
|
+
referenceOutputs: ChatCompletionMessage[] | BaseMessage[] | {
|
|
21
|
+
messages: (BaseMessage | ChatCompletionMessage)[];
|
|
22
|
+
};
|
|
23
|
+
}): Promise<EvaluatorResult>;
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
import { _normalizeToOpenAIMessagesList, _runEvaluator } from "../utils.js";
|
|
2
|
+
import { _isTrajectorySuperset } from "./utils.js";
|
|
3
|
+
/**
 * Evaluate whether an agent trajectory and called tools is a superset of a reference trajectory and called tools.
 * This means the agent called a superset of the tools specified in the reference trajectory.
 *
 * @param params - The parameters for trajectory superset evaluation
 * @param params.outputs - Actual trajectory the agent followed.
 *   May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing
 *   a "messages" key with one of the above.
 * @param params.referenceOutputs - Ideal reference trajectory the agent should have followed.
 *   May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing
 *   a "messages" key with one of the above.
 * @returns EvaluatorResult containing a score of true if the trajectory (including called tools)
 *   is a superset of the reference trajectory, false otherwise
 */
export async function trajectorySuperset(params) {
    const { outputs, referenceOutputs } = params;
    const outputsList = _normalizeToOpenAIMessagesList(outputs);
    const referenceOutputsList = _normalizeToOpenAIMessagesList(referenceOutputs);
    const getScore = async () => {
        if (outputsList == null || referenceOutputsList == null) {
            throw new Error("Trajectory superset match requires both outputs and reference_outputs");
        }
        // Direct superset check: the actual trajectory must contain at least
        // everything the reference trajectory contains.
        const isSuperset = _isTrajectorySuperset(outputsList, referenceOutputsList);
        return isSuperset;
    };
    return _runEvaluator("trajectory_superset", getScore, "trajectory_superset", params);
}
|