agentevals 0.0.2 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +141 -36
- package/dist/index.cjs +3 -1
- package/dist/index.d.ts +1 -0
- package/dist/index.js +1 -0
- package/dist/trajectory/llm.cjs +18 -29
- package/dist/trajectory/llm.d.ts +2 -3
- package/dist/trajectory/llm.js +18 -29
- package/dist/trajectory/match.cjs +84 -0
- package/dist/trajectory/match.d.ts +61 -0
- package/dist/trajectory/match.js +80 -0
- package/dist/trajectory/strict.cjs +42 -42
- package/dist/trajectory/strict.d.ts +23 -2
- package/dist/trajectory/strict.js +40 -41
- package/dist/trajectory/subset.cjs +13 -9
- package/dist/trajectory/subset.d.ts +8 -1
- package/dist/trajectory/subset.js +11 -8
- package/dist/trajectory/superset.cjs +13 -9
- package/dist/trajectory/superset.d.ts +8 -1
- package/dist/trajectory/superset.js +11 -8
- package/dist/trajectory/unordered.cjs +14 -10
- package/dist/trajectory/unordered.d.ts +8 -1
- package/dist/trajectory/unordered.js +12 -9
- package/dist/trajectory/utils.cjs +107 -18
- package/dist/trajectory/utils.d.ts +3 -2
- package/dist/trajectory/utils.js +105 -17
- package/dist/types.d.ts +3 -0
- package/package.json +1 -1
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
import { BaseMessage } from "@langchain/core/messages";
|
|
2
|
+
import { ChatCompletionMessage, ToolArgsMatchMode, ToolArgsMatchOverrides } from "../types.js";
|
|
3
|
+
export type TrajectoryMatchMode = "strict" | "unordered" | "subset" | "superset";
|
|
4
|
+
/**
|
|
5
|
+
* Creates an evaluator that compares trajectories between model outputs and reference outputs.
|
|
6
|
+
*
|
|
7
|
+
* @param options - The configuration options
|
|
8
|
+
* @param options.trajectoryMatchMode - The mode for matching trajectories:
|
|
9
|
+
* - `"strict"`: Requires exact match in order and content
|
|
10
|
+
* - `"unordered"`: Allows matching in any order
|
|
11
|
+
* - `"subset"`: Accepts if output trajectory is a subset of reference
|
|
12
|
+
* - `"superset"`: Accepts if output trajectory is a superset of reference
|
|
13
|
+
* @param options.toolArgsMatchMode - Mode for matching tool arguments ("exact" by default, can be "ignore")
|
|
14
|
+
* @param options.toolArgsMatchOverrides - Object containing custom overrides for tool argument matching.
|
|
15
|
+
* Each key should be a tool name, and each value should be either a match mode or a matcher function.
|
|
16
|
+
* Matchers should be a function that takes two sets of tool call args and returns whether they are equal.
|
|
17
|
+
*
|
|
18
|
+
* @returns An async function that evaluates trajectory matches between outputs and references.
|
|
19
|
+
* The returned evaluator accepts:
|
|
20
|
+
* - outputs: List of messages or dict representing the model output trajectory
|
|
21
|
+
* - referenceOutputs: List of messages or dict representing the reference trajectory
|
|
22
|
+
* - Additional arguments passed to the underlying evaluator
|
|
23
|
+
*
|
|
24
|
+
* @example
|
|
25
|
+
* ```typescript
|
|
26
|
+
* const matcher = (
|
|
27
|
+
* outputToolCallArgs: Record<string, any>,
|
|
28
|
+
* referenceToolCallArgs: Record<string, any>
|
|
29
|
+
* ): boolean => {
|
|
30
|
+
* const outputArgs = (outputToolCallArgs.query ?? "").toLowerCase();
|
|
31
|
+
* const referenceArgs = (referenceToolCallArgs.query ?? "").toLowerCase();
|
|
32
|
+
* return outputArgs === referenceArgs;
|
|
33
|
+
* };
|
|
34
|
+
*
|
|
35
|
+
* const evaluator = createTrajectoryMatchEvaluator({
|
|
36
|
+
* trajectoryMatchMode: "strict",
|
|
37
|
+
* toolArgsMatchMode: "exact",
|
|
38
|
+
* toolArgsMatchOverrides: {
|
|
39
|
+
* myToolName: matcher,
|
|
40
|
+
* },
|
|
41
|
+
* });
|
|
42
|
+
*
|
|
43
|
+
* const result = await evaluator({
|
|
44
|
+
* outputs: [...],
|
|
45
|
+
* referenceOutputs: [...],
|
|
46
|
+
* });
|
|
47
|
+
* ```
|
|
48
|
+
*/
|
|
49
|
+
export declare function createTrajectoryMatchEvaluator({ trajectoryMatchMode, toolArgsMatchMode, toolArgsMatchOverrides, }: {
|
|
50
|
+
trajectoryMatchMode?: TrajectoryMatchMode;
|
|
51
|
+
toolArgsMatchMode?: ToolArgsMatchMode;
|
|
52
|
+
toolArgsMatchOverrides?: ToolArgsMatchOverrides;
|
|
53
|
+
}): ({ outputs, referenceOutputs, ...extra }: {
|
|
54
|
+
[key: string]: unknown;
|
|
55
|
+
outputs: ChatCompletionMessage[] | BaseMessage[] | {
|
|
56
|
+
messages: (BaseMessage | ChatCompletionMessage)[];
|
|
57
|
+
};
|
|
58
|
+
referenceOutputs: ChatCompletionMessage[] | BaseMessage[] | {
|
|
59
|
+
messages: (BaseMessage | ChatCompletionMessage)[];
|
|
60
|
+
};
|
|
61
|
+
}) => Promise<import("langsmith/vitest").SimpleEvaluationResult>;
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
import { _normalizeToOpenAIMessagesList, _runEvaluator } from "../utils.js";
|
|
2
|
+
import { _scorer as trajectoryStrictScorer } from "./strict.js";
|
|
3
|
+
import { _scorer as trajectoryUnorderedScorer } from "./unordered.js";
|
|
4
|
+
import { _scorer as trajectorySubsetScorer } from "./subset.js";
|
|
5
|
+
import { _scorer as trajectorySuperstScorer } from "./superset.js";
|
|
6
|
+
/**
 * Builds an evaluator that scores an agent's output trajectory against a reference trajectory.
 *
 * The scorer is chosen once, when this factory is called; the returned evaluator
 * normalizes both trajectories with `_normalizeToOpenAIMessagesList` and hands them,
 * together with the tool-argument matching settings, to `_runEvaluator`.
 *
 * @param options - The configuration options
 * @param options.trajectoryMatchMode - The mode for matching trajectories (`"strict"` when omitted):
 * - `"strict"`: Requires exact match in order and content
 * - `"unordered"`: Allows matching in any order
 * - `"subset"`: Accepts if output trajectory is a subset of reference
 * - `"superset"`: Accepts if output trajectory is a superset of reference
 * @param options.toolArgsMatchMode - Mode for matching tool arguments ("exact" by default, can be "ignore")
 * @param options.toolArgsMatchOverrides - Object containing custom overrides for tool argument matching.
 * Each key should be a tool name, and each value should be either a match mode or a matcher function.
 * Matchers should be a function that takes two sets of tool call args and returns whether they are equal.
 *
 * @returns An async function that evaluates trajectory matches between outputs and references.
 * The returned evaluator accepts:
 * - outputs: List of messages or dict representing the model output trajectory
 * - referenceOutputs: List of messages or dict representing the reference trajectory
 * - Additional arguments passed to the underlying evaluator
 *
 * @throws Error with the message `Invalid trajectory match type: <mode>` when
 * `trajectoryMatchMode` is not one of the four supported values.
 *
 * @example
 * ```typescript
 * const matcher = (
 *   outputToolCallArgs: Record<string, any>,
 *   referenceToolCallArgs: Record<string, any>
 * ): boolean => {
 *   const outputArgs = (outputToolCallArgs.query ?? "").toLowerCase();
 *   const referenceArgs = (referenceToolCallArgs.query ?? "").toLowerCase();
 *   return outputArgs === referenceArgs;
 * };
 *
 * const evaluator = createTrajectoryMatchEvaluator({
 *   trajectoryMatchMode: "strict",
 *   toolArgsMatchMode: "exact",
 *   toolArgsMatchOverrides: {
 *     myToolName: matcher,
 *   },
 * });
 *
 * const result = await evaluator({
 *   outputs: [...],
 *   referenceOutputs: [...],
 * });
 * ```
 */
export function createTrajectoryMatchEvaluator({ trajectoryMatchMode = "strict", toolArgsMatchMode = "exact", toolArgsMatchOverrides, }) {
    // Mode -> scorer lookup. A Map (rather than a plain object) keeps inherited
    // keys such as "constructor" from being mistaken for valid modes.
    const scorersByMode = new Map([
        ["strict", trajectoryStrictScorer],
        ["unordered", trajectoryUnorderedScorer],
        ["subset", trajectorySubsetScorer],
        ["superset", trajectorySuperstScorer],
    ]);
    // Reject unsupported modes when the evaluator is created, not when it is first run.
    if (!scorersByMode.has(trajectoryMatchMode)) {
        throw new Error(`Invalid trajectory match type: ${trajectoryMatchMode}`);
    }
    const scorer = scorersByMode.get(trajectoryMatchMode);
    // The same string is passed as both the first and third argument of `_runEvaluator`.
    const evaluatorKey = `trajectory_${trajectoryMatchMode}_match`;
    async function _wrappedEvaluator({ outputs, referenceOutputs, ...extra }) {
        return _runEvaluator(evaluatorKey, scorer, evaluatorKey, {
            outputs: _normalizeToOpenAIMessagesList(outputs),
            referenceOutputs: _normalizeToOpenAIMessagesList(referenceOutputs),
            toolArgsMatchMode,
            toolArgsMatchOverrides,
            // Spread last, so caller-supplied extras win over the keys listed above.
            ...extra,
        });
    }
    return _wrappedEvaluator;
}
|
|
@@ -1,9 +1,10 @@
|
|
|
1
1
|
"use strict";
|
|
2
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
-
exports.trajectoryStrictMatch = void 0;
|
|
3
|
+
exports.trajectoryStrictMatch = exports._scorer = void 0;
|
|
4
4
|
const utils_js_1 = require("../utils.cjs");
|
|
5
|
-
|
|
6
|
-
|
|
5
|
+
const utils_js_2 = require("./utils.cjs");
|
|
6
|
+
async function _scorer(params) {
|
|
7
|
+
const { outputs, referenceOutputs, toolArgsMatchMode, toolArgsMatchOverrides, } = params;
|
|
7
8
|
const normalizedOutputs = (0, utils_js_1._normalizeToOpenAIMessagesList)(outputs);
|
|
8
9
|
const normalizedReferenceOutputs = (0, utils_js_1._normalizeToOpenAIMessagesList)(referenceOutputs);
|
|
9
10
|
if (!normalizedOutputs || !normalizedReferenceOutputs) {
|
|
@@ -12,63 +13,62 @@ function _scorer(params) {
|
|
|
12
13
|
if (normalizedOutputs.length !== normalizedReferenceOutputs.length) {
|
|
13
14
|
return false;
|
|
14
15
|
}
|
|
15
|
-
let exactMatch = true;
|
|
16
16
|
for (let i = 0; i < normalizedOutputs.length; i++) {
|
|
17
17
|
const output = normalizedOutputs[i];
|
|
18
18
|
const referenceOutput = normalizedReferenceOutputs[i];
|
|
19
19
|
if (output.role !== referenceOutput.role) {
|
|
20
|
-
|
|
21
|
-
break;
|
|
20
|
+
return false;
|
|
22
21
|
}
|
|
23
22
|
const outputHasToolCalls = output.tool_calls != null;
|
|
24
23
|
const referenceHasToolCalls = referenceOutput.tool_calls != null;
|
|
25
24
|
if (outputHasToolCalls !== referenceHasToolCalls) {
|
|
26
|
-
|
|
27
|
-
break;
|
|
25
|
+
return false;
|
|
28
26
|
}
|
|
29
27
|
if (outputHasToolCalls) {
|
|
30
28
|
if (output.tool_calls.length !== referenceOutput.tool_calls.length) {
|
|
31
|
-
|
|
32
|
-
break;
|
|
29
|
+
return false;
|
|
33
30
|
}
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
31
|
+
const referenceCalls = referenceOutput.tool_calls ?? [];
|
|
32
|
+
const seen = new Array(referenceCalls.length).fill(false);
|
|
33
|
+
for (const outputCall of output.tool_calls ?? []) {
|
|
34
|
+
let foundMatch = false;
|
|
35
|
+
for (let i = 0; i < referenceCalls.length; i++) {
|
|
36
|
+
const referenceCall = referenceCalls[i];
|
|
37
|
+
if (!seen[i] &&
|
|
38
|
+
outputCall.function?.name === referenceCall.function?.name) {
|
|
39
|
+
const matcher = (0, utils_js_2._getMatcherForToolName)(outputCall.function?.name ?? "", toolArgsMatchMode, toolArgsMatchOverrides);
|
|
40
|
+
if (await matcher(JSON.parse(outputCall.function?.arguments ?? "{}"), JSON.parse(referenceCall.function?.arguments ?? "{}"))) {
|
|
41
|
+
foundMatch = true;
|
|
42
|
+
seen[i] = true;
|
|
43
|
+
break;
|
|
44
|
+
}
|
|
45
|
+
}
|
|
39
46
|
}
|
|
40
|
-
if (
|
|
41
|
-
|
|
42
|
-
referenceOutput.tool_calls[j].function.arguments) {
|
|
43
|
-
exactMatch = false;
|
|
44
|
-
break;
|
|
47
|
+
if (!foundMatch) {
|
|
48
|
+
return false;
|
|
45
49
|
}
|
|
46
50
|
}
|
|
47
51
|
}
|
|
48
|
-
if (messageContentExactMatch &&
|
|
49
|
-
output.content !== referenceOutput.content) {
|
|
50
|
-
exactMatch = false;
|
|
51
|
-
break;
|
|
52
|
-
}
|
|
53
52
|
}
|
|
54
|
-
return
|
|
53
|
+
return true;
|
|
55
54
|
}
|
|
55
|
+
exports._scorer = _scorer;
|
|
56
|
+
/**
|
|
57
|
+
* @deprecated Use `createTrajectoryMatchEvaluator` with `trajectoryMatchMode: "strict"` instead.
|
|
58
|
+
* Evaluate whether an input agent trajectory and called tools strictly matches a reference trajectory.
|
|
59
|
+
* This means that at each step, the agent called the same tools in the same order as specified in the reference trajectory.
|
|
60
|
+
*
|
|
61
|
+
* @param outputs - Actual trajectory the agent followed. May be a list of OpenAI messages,
|
|
62
|
+
* a list of LangChain messages, or a dictionary containing a "messages" key with one of the above.
|
|
63
|
+
* @param referenceOutputs - Ideal reference trajectory the agent should have followed. May be a list of OpenAI messages,
|
|
64
|
+
* a list of LangChain messages, or a dictionary containing a "messages" key with one of the above.
|
|
65
|
+
* @param toolCallArgsExactMatch - Whether to require exact matches for tool call arguments
|
|
66
|
+
* @returns EvaluatorResult containing a score of true if trajectory (including called tools) matches, false otherwise
|
|
67
|
+
*/
|
|
56
68
|
async function trajectoryStrictMatch(params) {
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
* @param outputs - Actual trajectory the agent followed. May be a list of OpenAI messages,
|
|
62
|
-
* a list of LangChain messages, or a dictionary containing a "messages" key with one of the above.
|
|
63
|
-
* @param referenceOutputs - Ideal reference trajectory the agent should have followed. May be a list of OpenAI messages,
|
|
64
|
-
* a list of LangChain messages, or a dictionary containing a "messages" key with one of the above.
|
|
65
|
-
* @param toolCallArgsExactMatch - Whether to require exact matches for tool call arguments
|
|
66
|
-
* @param messageContentExactMatch - Whether to require exact matches for message content
|
|
67
|
-
* @returns EvaluatorResult containing a score of true if trajectory (including called tools) matches, false otherwise
|
|
68
|
-
*/
|
|
69
|
-
function _wrapper() {
|
|
70
|
-
return _scorer(params);
|
|
71
|
-
}
|
|
72
|
-
return (0, utils_js_1._runEvaluator)("trajectory_strict_match", _wrapper, "trajectory_strict_match", params);
|
|
69
|
+
return (0, utils_js_1._runEvaluator)("trajectory_strict_match", _scorer, "trajectory_strict_match", {
|
|
70
|
+
...params,
|
|
71
|
+
toolArgsMatchMode: params.toolCallArgsExactMatch ? "exact" : "ignore",
|
|
72
|
+
});
|
|
73
73
|
}
|
|
74
74
|
exports.trajectoryStrictMatch = trajectoryStrictMatch;
|
|
@@ -1,5 +1,27 @@
|
|
|
1
1
|
import { BaseMessage } from "@langchain/core/messages";
|
|
2
|
-
import { ChatCompletionMessage, EvaluatorResult } from "../types.js";
|
|
2
|
+
import { ChatCompletionMessage, EvaluatorResult, ToolArgsMatchMode, ToolArgsMatchOverrides } from "../types.js";
|
|
3
|
+
export declare function _scorer(params: {
|
|
4
|
+
outputs: ChatCompletionMessage[] | BaseMessage[] | {
|
|
5
|
+
messages: (BaseMessage | ChatCompletionMessage)[];
|
|
6
|
+
};
|
|
7
|
+
referenceOutputs: ChatCompletionMessage[] | BaseMessage[] | {
|
|
8
|
+
messages: (BaseMessage | ChatCompletionMessage)[];
|
|
9
|
+
};
|
|
10
|
+
toolArgsMatchMode: ToolArgsMatchMode;
|
|
11
|
+
toolArgsMatchOverrides?: ToolArgsMatchOverrides;
|
|
12
|
+
}): Promise<boolean>;
|
|
13
|
+
/**
|
|
14
|
+
* @deprecated Use `createTrajectoryMatchEvaluator` with `trajectoryMatchMode: "strict"` instead.
|
|
15
|
+
* Evaluate whether an input agent trajectory and called tools strictly matches a reference trajectory.
|
|
16
|
+
* This means that at each step, the agent called the same tools in the same order as specified in the reference trajectory.
|
|
17
|
+
*
|
|
18
|
+
* @param outputs - Actual trajectory the agent followed. May be a list of OpenAI messages,
|
|
19
|
+
* a list of LangChain messages, or a dictionary containing a "messages" key with one of the above.
|
|
20
|
+
* @param referenceOutputs - Ideal reference trajectory the agent should have followed. May be a list of OpenAI messages,
|
|
21
|
+
* a list of LangChain messages, or a dictionary containing a "messages" key with one of the above.
|
|
22
|
+
* @param toolCallArgsExactMatch - Whether to require exact matches for tool call arguments
|
|
23
|
+
* @returns EvaluatorResult containing a score of true if trajectory (including called tools) matches, false otherwise
|
|
24
|
+
*/
|
|
3
25
|
export declare function trajectoryStrictMatch(params: {
|
|
4
26
|
outputs: ChatCompletionMessage[] | BaseMessage[] | {
|
|
5
27
|
messages: (BaseMessage | ChatCompletionMessage)[];
|
|
@@ -8,5 +30,4 @@ export declare function trajectoryStrictMatch(params: {
|
|
|
8
30
|
messages: (BaseMessage | ChatCompletionMessage)[];
|
|
9
31
|
};
|
|
10
32
|
toolCallArgsExactMatch: boolean;
|
|
11
|
-
messageContentExactMatch: boolean;
|
|
12
33
|
}): Promise<EvaluatorResult>;
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import { _normalizeToOpenAIMessagesList, _runEvaluator } from "../utils.js";
|
|
2
|
-
|
|
3
|
-
|
|
2
|
+
import { _getMatcherForToolName } from "./utils.js";
|
|
3
|
+
export async function _scorer(params) {
|
|
4
|
+
const { outputs, referenceOutputs, toolArgsMatchMode, toolArgsMatchOverrides, } = params;
|
|
4
5
|
const normalizedOutputs = _normalizeToOpenAIMessagesList(outputs);
|
|
5
6
|
const normalizedReferenceOutputs = _normalizeToOpenAIMessagesList(referenceOutputs);
|
|
6
7
|
if (!normalizedOutputs || !normalizedReferenceOutputs) {
|
|
@@ -9,62 +10,60 @@ function _scorer(params) {
|
|
|
9
10
|
if (normalizedOutputs.length !== normalizedReferenceOutputs.length) {
|
|
10
11
|
return false;
|
|
11
12
|
}
|
|
12
|
-
let exactMatch = true;
|
|
13
13
|
for (let i = 0; i < normalizedOutputs.length; i++) {
|
|
14
14
|
const output = normalizedOutputs[i];
|
|
15
15
|
const referenceOutput = normalizedReferenceOutputs[i];
|
|
16
16
|
if (output.role !== referenceOutput.role) {
|
|
17
|
-
|
|
18
|
-
break;
|
|
17
|
+
return false;
|
|
19
18
|
}
|
|
20
19
|
const outputHasToolCalls = output.tool_calls != null;
|
|
21
20
|
const referenceHasToolCalls = referenceOutput.tool_calls != null;
|
|
22
21
|
if (outputHasToolCalls !== referenceHasToolCalls) {
|
|
23
|
-
|
|
24
|
-
break;
|
|
22
|
+
return false;
|
|
25
23
|
}
|
|
26
24
|
if (outputHasToolCalls) {
|
|
27
25
|
if (output.tool_calls.length !== referenceOutput.tool_calls.length) {
|
|
28
|
-
|
|
29
|
-
break;
|
|
26
|
+
return false;
|
|
30
27
|
}
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
28
|
+
const referenceCalls = referenceOutput.tool_calls ?? [];
|
|
29
|
+
const seen = new Array(referenceCalls.length).fill(false);
|
|
30
|
+
for (const outputCall of output.tool_calls ?? []) {
|
|
31
|
+
let foundMatch = false;
|
|
32
|
+
for (let i = 0; i < referenceCalls.length; i++) {
|
|
33
|
+
const referenceCall = referenceCalls[i];
|
|
34
|
+
if (!seen[i] &&
|
|
35
|
+
outputCall.function?.name === referenceCall.function?.name) {
|
|
36
|
+
const matcher = _getMatcherForToolName(outputCall.function?.name ?? "", toolArgsMatchMode, toolArgsMatchOverrides);
|
|
37
|
+
if (await matcher(JSON.parse(outputCall.function?.arguments ?? "{}"), JSON.parse(referenceCall.function?.arguments ?? "{}"))) {
|
|
38
|
+
foundMatch = true;
|
|
39
|
+
seen[i] = true;
|
|
40
|
+
break;
|
|
41
|
+
}
|
|
42
|
+
}
|
|
36
43
|
}
|
|
37
|
-
if (
|
|
38
|
-
|
|
39
|
-
referenceOutput.tool_calls[j].function.arguments) {
|
|
40
|
-
exactMatch = false;
|
|
41
|
-
break;
|
|
44
|
+
if (!foundMatch) {
|
|
45
|
+
return false;
|
|
42
46
|
}
|
|
43
47
|
}
|
|
44
48
|
}
|
|
45
|
-
if (messageContentExactMatch &&
|
|
46
|
-
output.content !== referenceOutput.content) {
|
|
47
|
-
exactMatch = false;
|
|
48
|
-
break;
|
|
49
|
-
}
|
|
50
49
|
}
|
|
51
|
-
return
|
|
50
|
+
return true;
|
|
52
51
|
}
|
|
52
|
+
/**
|
|
53
|
+
* @deprecated Use `createTrajectoryMatchEvaluator` with `trajectoryMatchMode: "strict"` instead.
|
|
54
|
+
* Evaluate whether an input agent trajectory and called tools strictly matches a reference trajectory.
|
|
55
|
+
* This means that at each step, the agent called the same tools in the same order as specified in the reference trajectory.
|
|
56
|
+
*
|
|
57
|
+
* @param outputs - Actual trajectory the agent followed. May be a list of OpenAI messages,
|
|
58
|
+
* a list of LangChain messages, or a dictionary containing a "messages" key with one of the above.
|
|
59
|
+
* @param referenceOutputs - Ideal reference trajectory the agent should have followed. May be a list of OpenAI messages,
|
|
60
|
+
* a list of LangChain messages, or a dictionary containing a "messages" key with one of the above.
|
|
61
|
+
* @param toolCallArgsExactMatch - Whether to require exact matches for tool call arguments
|
|
62
|
+
* @returns EvaluatorResult containing a score of true if trajectory (including called tools) matches, false otherwise
|
|
63
|
+
*/
|
|
53
64
|
export async function trajectoryStrictMatch(params) {
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
* @param outputs - Actual trajectory the agent followed. May be a list of OpenAI messages,
|
|
59
|
-
* a list of LangChain messages, or a dictionary containing a "messages" key with one of the above.
|
|
60
|
-
* @param referenceOutputs - Ideal reference trajectory the agent should have followed. May be a list of OpenAI messages,
|
|
61
|
-
* a list of LangChain messages, or a dictionary containing a "messages" key with one of the above.
|
|
62
|
-
* @param toolCallArgsExactMatch - Whether to require exact matches for tool call arguments
|
|
63
|
-
* @param messageContentExactMatch - Whether to require exact matches for message content
|
|
64
|
-
* @returns EvaluatorResult containing a score of true if trajectory (including called tools) matches, false otherwise
|
|
65
|
-
*/
|
|
66
|
-
function _wrapper() {
|
|
67
|
-
return _scorer(params);
|
|
68
|
-
}
|
|
69
|
-
return _runEvaluator("trajectory_strict_match", _wrapper, "trajectory_strict_match", params);
|
|
65
|
+
return _runEvaluator("trajectory_strict_match", _scorer, "trajectory_strict_match", {
|
|
66
|
+
...params,
|
|
67
|
+
toolArgsMatchMode: params.toolCallArgsExactMatch ? "exact" : "ignore",
|
|
68
|
+
});
|
|
70
69
|
}
|
|
@@ -1,9 +1,15 @@
|
|
|
1
1
|
"use strict";
|
|
2
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
-
exports.trajectorySubset = void 0;
|
|
3
|
+
exports.trajectorySubset = exports._scorer = void 0;
|
|
4
4
|
const utils_js_1 = require("../utils.cjs");
|
|
5
5
|
const utils_js_2 = require("./utils.cjs");
|
|
6
|
+
/**
 * Scorer for "subset" matching.
 *
 * Delegates to `_isTrajectorySuperset` with the two trajectories swapped: the
 * reference trajectory is passed first and the outputs second, so the result
 * answers "is the reference a superset of the outputs?".
 *
 * @param params - Object carrying `outputs`, `referenceOutputs`, `toolArgsMatchMode`
 * and the optional `toolArgsMatchOverrides`.
 * @returns Whatever `_isTrajectorySuperset` resolves to for the swapped arguments.
 */
const _scorer = async (params) => {
    const { referenceOutputs, outputs, toolArgsMatchMode, toolArgsMatchOverrides } = params;
    return await (0, utils_js_2._isTrajectorySuperset)(referenceOutputs, outputs, toolArgsMatchMode, toolArgsMatchOverrides);
};
|
|
10
|
+
exports._scorer = _scorer;
|
|
6
11
|
/**
|
|
12
|
+
* @deprecated Use `createTrajectoryMatchEvaluator` with `trajectoryMatchMode: "subset"` instead.
|
|
7
13
|
* Evaluate whether an agent trajectory and called tools is a subset of a reference trajectory and called tools.
|
|
8
14
|
* This means the agent called a subset of the tools specified in the reference trajectory.
|
|
9
15
|
*
|
|
@@ -20,13 +26,11 @@ async function trajectorySubset(params) {
|
|
|
20
26
|
const { outputs, referenceOutputs } = params;
|
|
21
27
|
const outputsList = (0, utils_js_1._normalizeToOpenAIMessagesList)(outputs);
|
|
22
28
|
const referenceOutputsList = (0, utils_js_1._normalizeToOpenAIMessagesList)(referenceOutputs);
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
};
|
|
30
|
-
return (0, utils_js_1._runEvaluator)("trajectory_subset", getScore, "trajectory_subset", params);
|
|
29
|
+
return (0, utils_js_1._runEvaluator)("trajectory_subset", exports._scorer, "trajectory_subset", {
|
|
30
|
+
...params,
|
|
31
|
+
outputs: outputsList,
|
|
32
|
+
referenceOutputs: referenceOutputsList,
|
|
33
|
+
toolArgsMatchMode: "ignore",
|
|
34
|
+
});
|
|
31
35
|
}
|
|
32
36
|
exports.trajectorySubset = trajectorySubset;
|
|
@@ -1,6 +1,13 @@
|
|
|
1
1
|
import { BaseMessage } from "@langchain/core/messages";
|
|
2
|
-
import { ChatCompletionMessage, EvaluatorResult } from "../types.js";
|
|
2
|
+
import { ChatCompletionMessage, EvaluatorResult, ToolArgsMatchMode, ToolArgsMatchOverrides } from "../types.js";
|
|
3
|
+
export declare const _scorer: (params: {
|
|
4
|
+
outputs: ChatCompletionMessage[];
|
|
5
|
+
referenceOutputs: ChatCompletionMessage[];
|
|
6
|
+
toolArgsMatchMode: ToolArgsMatchMode;
|
|
7
|
+
toolArgsMatchOverrides?: ToolArgsMatchOverrides;
|
|
8
|
+
}) => Promise<boolean>;
|
|
3
9
|
/**
|
|
10
|
+
* @deprecated Use `createTrajectoryMatchEvaluator` with `trajectoryMatchMode: "subset"` instead.
|
|
4
11
|
* Evaluate whether an agent trajectory and called tools is a subset of a reference trajectory and called tools.
|
|
5
12
|
* This means the agent called a subset of the tools specified in the reference trajectory.
|
|
6
13
|
*
|
|
@@ -1,6 +1,11 @@
|
|
|
1
1
|
import { _normalizeToOpenAIMessagesList, _runEvaluator } from "../utils.js";
|
|
2
2
|
import { _isTrajectorySuperset } from "./utils.js";
|
|
3
|
+
/**
 * Scorer for "subset" matching.
 *
 * Delegates to `_isTrajectorySuperset` with the two trajectories swapped: the
 * reference trajectory is passed first and the outputs second, so the result
 * answers "is the reference a superset of the outputs?".
 *
 * @param params - Object carrying `outputs`, `referenceOutputs`, `toolArgsMatchMode`
 * and the optional `toolArgsMatchOverrides`.
 * @returns Whatever `_isTrajectorySuperset` resolves to for the swapped arguments.
 */
export const _scorer = async (params) => {
    const { referenceOutputs, outputs, toolArgsMatchMode, toolArgsMatchOverrides } = params;
    return await _isTrajectorySuperset(referenceOutputs, outputs, toolArgsMatchMode, toolArgsMatchOverrides);
};
|
|
3
7
|
/**
|
|
8
|
+
* @deprecated Use `createTrajectoryMatchEvaluator` with `trajectoryMatchMode: "subset"` instead.
|
|
4
9
|
* Evaluate whether an agent trajectory and called tools is a subset of a reference trajectory and called tools.
|
|
5
10
|
* This means the agent called a subset of the tools specified in the reference trajectory.
|
|
6
11
|
*
|
|
@@ -17,12 +22,10 @@ export async function trajectorySubset(params) {
|
|
|
17
22
|
const { outputs, referenceOutputs } = params;
|
|
18
23
|
const outputsList = _normalizeToOpenAIMessagesList(outputs);
|
|
19
24
|
const referenceOutputsList = _normalizeToOpenAIMessagesList(referenceOutputs);
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
};
|
|
27
|
-
return _runEvaluator("trajectory_subset", getScore, "trajectory_subset", params);
|
|
25
|
+
return _runEvaluator("trajectory_subset", _scorer, "trajectory_subset", {
|
|
26
|
+
...params,
|
|
27
|
+
outputs: outputsList,
|
|
28
|
+
referenceOutputs: referenceOutputsList,
|
|
29
|
+
toolArgsMatchMode: "ignore",
|
|
30
|
+
});
|
|
28
31
|
}
|
|
@@ -1,9 +1,15 @@
|
|
|
1
1
|
"use strict";
|
|
2
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
-
exports.trajectorySuperset = void 0;
|
|
3
|
+
exports.trajectorySuperset = exports._scorer = void 0;
|
|
4
4
|
const utils_js_1 = require("../utils.cjs");
|
|
5
5
|
const utils_js_2 = require("./utils.cjs");
|
|
6
|
+
/**
 * Scorer for "superset" matching.
 *
 * Forwards the outputs (first) and the reference trajectory (second) to
 * `_isTrajectorySuperset`, along with the tool-argument matching settings.
 *
 * @param params - Object carrying `outputs`, `referenceOutputs`, `toolArgsMatchMode`
 * and the optional `toolArgsMatchOverrides`.
 * @returns Whatever `_isTrajectorySuperset` resolves to.
 */
const _scorer = async (params) => {
    const { outputs, referenceOutputs, toolArgsMatchMode, toolArgsMatchOverrides } = params;
    return await (0, utils_js_2._isTrajectorySuperset)(outputs, referenceOutputs, toolArgsMatchMode, toolArgsMatchOverrides);
};
|
|
10
|
+
exports._scorer = _scorer;
|
|
6
11
|
/**
|
|
12
|
+
* @deprecated Use `createTrajectoryMatchEvaluator` with `trajectoryMatchMode: "superset"` instead.
|
|
7
13
|
* Evaluate whether an agent trajectory and called tools is a superset of a reference trajectory and called tools.
|
|
8
14
|
* This means the agent called a superset of the tools specified in the reference trajectory.
|
|
9
15
|
*
|
|
@@ -20,13 +26,11 @@ async function trajectorySuperset(params) {
|
|
|
20
26
|
const { outputs, referenceOutputs } = params;
|
|
21
27
|
const outputsList = (0, utils_js_1._normalizeToOpenAIMessagesList)(outputs);
|
|
22
28
|
const referenceOutputsList = (0, utils_js_1._normalizeToOpenAIMessagesList)(referenceOutputs);
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
};
|
|
30
|
-
return (0, utils_js_1._runEvaluator)("trajectory_superset", getScore, "trajectory_superset", params);
|
|
29
|
+
return (0, utils_js_1._runEvaluator)("trajectory_superset", exports._scorer, "trajectory_superset", {
|
|
30
|
+
...params,
|
|
31
|
+
outputs: outputsList,
|
|
32
|
+
referenceOutputs: referenceOutputsList,
|
|
33
|
+
toolArgsMatchMode: "ignore",
|
|
34
|
+
});
|
|
31
35
|
}
|
|
32
36
|
exports.trajectorySuperset = trajectorySuperset;
|
|
@@ -1,6 +1,13 @@
|
|
|
1
1
|
import { BaseMessage } from "@langchain/core/messages";
|
|
2
|
-
import { ChatCompletionMessage, EvaluatorResult } from "../types.js";
|
|
2
|
+
import { ChatCompletionMessage, EvaluatorResult, ToolArgsMatchMode, ToolArgsMatchOverrides } from "../types.js";
|
|
3
|
+
export declare const _scorer: (params: {
|
|
4
|
+
outputs: ChatCompletionMessage[];
|
|
5
|
+
referenceOutputs: ChatCompletionMessage[];
|
|
6
|
+
toolArgsMatchMode: ToolArgsMatchMode;
|
|
7
|
+
toolArgsMatchOverrides?: ToolArgsMatchOverrides;
|
|
8
|
+
}) => Promise<boolean>;
|
|
3
9
|
/**
|
|
10
|
+
* @deprecated Use `createTrajectoryMatchEvaluator` with `trajectoryMatchMode: "superset"` instead.
|
|
4
11
|
* Evaluate whether an agent trajectory and called tools is a superset of a reference trajectory and called tools.
|
|
5
12
|
* This means the agent called a superset of the tools specified in the reference trajectory.
|
|
6
13
|
*
|
|
@@ -1,6 +1,11 @@
|
|
|
1
1
|
import { _normalizeToOpenAIMessagesList, _runEvaluator } from "../utils.js";
|
|
2
2
|
import { _isTrajectorySuperset } from "./utils.js";
|
|
3
|
+
/**
 * Scorer for "superset" matching.
 *
 * Forwards the outputs (first) and the reference trajectory (second) to
 * `_isTrajectorySuperset`, along with the tool-argument matching settings.
 *
 * @param params - Object carrying `outputs`, `referenceOutputs`, `toolArgsMatchMode`
 * and the optional `toolArgsMatchOverrides`.
 * @returns Whatever `_isTrajectorySuperset` resolves to.
 */
export const _scorer = async (params) => {
    const { outputs, referenceOutputs, toolArgsMatchMode, toolArgsMatchOverrides } = params;
    return await _isTrajectorySuperset(outputs, referenceOutputs, toolArgsMatchMode, toolArgsMatchOverrides);
};
|
|
3
7
|
/**
|
|
8
|
+
* @deprecated Use `createTrajectoryMatchEvaluator` with `trajectoryMatchMode: "superset"` instead.
|
|
4
9
|
* Evaluate whether an agent trajectory and called tools is a superset of a reference trajectory and called tools.
|
|
5
10
|
* This means the agent called a superset of the tools specified in the reference trajectory.
|
|
6
11
|
*
|
|
@@ -17,12 +22,10 @@ export async function trajectorySuperset(params) {
|
|
|
17
22
|
const { outputs, referenceOutputs } = params;
|
|
18
23
|
const outputsList = _normalizeToOpenAIMessagesList(outputs);
|
|
19
24
|
const referenceOutputsList = _normalizeToOpenAIMessagesList(referenceOutputs);
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
};
|
|
27
|
-
return _runEvaluator("trajectory_superset", getScore, "trajectory_superset", params);
|
|
25
|
+
return _runEvaluator("trajectory_superset", _scorer, "trajectory_superset", {
|
|
26
|
+
...params,
|
|
27
|
+
outputs: outputsList,
|
|
28
|
+
referenceOutputs: referenceOutputsList,
|
|
29
|
+
toolArgsMatchMode: "ignore",
|
|
30
|
+
});
|
|
28
31
|
}
|
|
@@ -1,9 +1,16 @@
|
|
|
1
1
|
"use strict";
|
|
2
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
-
exports.trajectoryUnorderedMatch = void 0;
|
|
3
|
+
exports.trajectoryUnorderedMatch = exports._scorer = void 0;
|
|
4
4
|
const utils_js_1 = require("../utils.cjs");
|
|
5
5
|
const utils_js_2 = require("./utils.cjs");
|
|
6
|
+
/**
 * Scorer for "unordered" matching.
 *
 * Runs `_isTrajectorySuperset` in both directions: outputs against reference
 * first, then reference against outputs. The second check only runs when the
 * first one yields a truthy result.
 *
 * @param params - Object carrying `outputs`, `referenceOutputs`, `toolArgsMatchMode`
 * and the optional `toolArgsMatchOverrides`.
 * @returns The first check's result when it is falsy, otherwise the second check's result.
 */
const _scorer = async (params) => {
    const { outputs, referenceOutputs, toolArgsMatchMode, toolArgsMatchOverrides } = params;
    const outputsCoverReference = await (0, utils_js_2._isTrajectorySuperset)(outputs, referenceOutputs, toolArgsMatchMode, toolArgsMatchOverrides);
    if (!outputsCoverReference) {
        // Same value `a && b` would yield when `a` is falsy; the reverse check is skipped.
        return outputsCoverReference;
    }
    return await (0, utils_js_2._isTrajectorySuperset)(referenceOutputs, outputs, toolArgsMatchMode, toolArgsMatchOverrides);
};
|
|
11
|
+
exports._scorer = _scorer;
|
|
6
12
|
/**
|
|
13
|
+
* @deprecated Use `createTrajectoryMatchEvaluator` with `trajectoryMatchMode: "unordered"` instead.
|
|
7
14
|
* Evaluate whether an input agent trajectory and called tools contains all the tools used in a reference trajectory.
|
|
8
15
|
* This accounts for some differences in an LLM's reasoning process in a case-by-case basis.
|
|
9
16
|
*
|
|
@@ -20,14 +27,11 @@ async function trajectoryUnorderedMatch(params) {
|
|
|
20
27
|
const { outputs, referenceOutputs } = params;
|
|
21
28
|
const outputsList = (0, utils_js_1._normalizeToOpenAIMessagesList)(outputs);
|
|
22
29
|
const referenceOutputsList = (0, utils_js_1._normalizeToOpenAIMessagesList)(referenceOutputs);
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
return unorderedMatch;
|
|
30
|
-
};
|
|
31
|
-
return (0, utils_js_1._runEvaluator)("trajectory_unordered_match", getScore, "trajectory_unordered_match", params);
|
|
30
|
+
return (0, utils_js_1._runEvaluator)("trajectory_unordered_match", exports._scorer, "trajectory_unordered_match", {
|
|
31
|
+
...params,
|
|
32
|
+
outputs: outputsList,
|
|
33
|
+
referenceOutputs: referenceOutputsList,
|
|
34
|
+
toolArgsMatchMode: "ignore",
|
|
35
|
+
});
|
|
32
36
|
}
|
|
33
37
|
exports.trajectoryUnorderedMatch = trajectoryUnorderedMatch;
|
|
@@ -1,6 +1,13 @@
|
|
|
1
1
|
import { BaseMessage } from "@langchain/core/messages";
|
|
2
|
-
import { ChatCompletionMessage, EvaluatorResult } from "../types.js";
|
|
2
|
+
import { ChatCompletionMessage, EvaluatorResult, ToolArgsMatchMode, ToolArgsMatchOverrides } from "../types.js";
|
|
3
|
+
export declare const _scorer: (params: {
|
|
4
|
+
outputs: ChatCompletionMessage[];
|
|
5
|
+
referenceOutputs: ChatCompletionMessage[];
|
|
6
|
+
toolArgsMatchMode: ToolArgsMatchMode;
|
|
7
|
+
toolArgsMatchOverrides?: ToolArgsMatchOverrides;
|
|
8
|
+
}) => Promise<boolean>;
|
|
3
9
|
/**
|
|
10
|
+
* @deprecated Use `createTrajectoryMatchEvaluator` with `trajectoryMatchMode: "unordered"` instead.
|
|
4
11
|
* Evaluate whether an input agent trajectory and called tools contains all the tools used in a reference trajectory.
|
|
5
12
|
* This accounts for some differences in an LLM's reasoning process in a case-by-case basis.
|
|
6
13
|
*
|