agentevals 0.0.2 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +141 -36
- package/dist/index.cjs +3 -1
- package/dist/index.d.ts +1 -0
- package/dist/index.js +1 -0
- package/dist/trajectory/llm.cjs +18 -29
- package/dist/trajectory/llm.d.ts +2 -3
- package/dist/trajectory/llm.js +18 -29
- package/dist/trajectory/match.cjs +84 -0
- package/dist/trajectory/match.d.ts +61 -0
- package/dist/trajectory/match.js +80 -0
- package/dist/trajectory/strict.cjs +42 -42
- package/dist/trajectory/strict.d.ts +23 -2
- package/dist/trajectory/strict.js +40 -41
- package/dist/trajectory/subset.cjs +13 -9
- package/dist/trajectory/subset.d.ts +8 -1
- package/dist/trajectory/subset.js +11 -8
- package/dist/trajectory/superset.cjs +13 -9
- package/dist/trajectory/superset.d.ts +8 -1
- package/dist/trajectory/superset.js +11 -8
- package/dist/trajectory/unordered.cjs +14 -10
- package/dist/trajectory/unordered.d.ts +8 -1
- package/dist/trajectory/unordered.js +12 -9
- package/dist/trajectory/utils.cjs +107 -18
- package/dist/trajectory/utils.d.ts +3 -2
- package/dist/trajectory/utils.js +105 -17
- package/dist/types.d.ts +3 -0
- package/package.json +1 -1
|
@@ -1,6 +1,12 @@
|
|
|
1
1
|
import { _normalizeToOpenAIMessagesList, _runEvaluator } from "../utils.js";
|
|
2
2
|
import { _isTrajectorySuperset } from "./utils.js";
|
|
3
|
+
export const _scorer = async (params) => {
|
|
4
|
+
const isUnorderedMatch = (await _isTrajectorySuperset(params.outputs, params.referenceOutputs, params.toolArgsMatchMode, params.toolArgsMatchOverrides)) &&
|
|
5
|
+
(await _isTrajectorySuperset(params.referenceOutputs, params.outputs, params.toolArgsMatchMode, params.toolArgsMatchOverrides));
|
|
6
|
+
return isUnorderedMatch;
|
|
7
|
+
};
|
|
3
8
|
/**
|
|
9
|
+
* @deprecated Use `createTrajectoryMatchEvaluator` with `trajectoryMatchMode: "unordered"` instead.
|
|
4
10
|
* Evaluate whether an input agent trajectory and called tools contains all the tools used in a reference trajectory.
|
|
5
11
|
* This accounts for some differences in an LLM's reasoning process in a case-by-case basis.
|
|
6
12
|
*
|
|
@@ -17,13 +23,10 @@ export async function trajectoryUnorderedMatch(params) {
|
|
|
17
23
|
const { outputs, referenceOutputs } = params;
|
|
18
24
|
const outputsList = _normalizeToOpenAIMessagesList(outputs);
|
|
19
25
|
const referenceOutputsList = _normalizeToOpenAIMessagesList(referenceOutputs);
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
return unorderedMatch;
|
|
27
|
-
};
|
|
28
|
-
return _runEvaluator("trajectory_unordered_match", getScore, "trajectory_unordered_match", params);
|
|
26
|
+
return _runEvaluator("trajectory_unordered_match", _scorer, "trajectory_unordered_match", {
|
|
27
|
+
...params,
|
|
28
|
+
outputs: outputsList,
|
|
29
|
+
referenceOutputs: referenceOutputsList,
|
|
30
|
+
toolArgsMatchMode: "ignore",
|
|
31
|
+
});
|
|
29
32
|
}
|
|
@@ -1,14 +1,15 @@
|
|
|
1
1
|
"use strict";
|
|
2
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
-
exports._chatCompletionMessagesToString = exports._isTrajectorySuperset = void 0;
|
|
3
|
+
exports._chatCompletionMessagesToString = exports._getMatcherForToolName = exports._isTrajectorySuperset = void 0;
|
|
4
4
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
5
5
|
function _normalizeToolCall(toolCall) {
|
|
6
6
|
if ("function" in toolCall &&
|
|
7
7
|
toolCall.function != null &&
|
|
8
|
-
typeof toolCall.function === "object"
|
|
8
|
+
typeof toolCall.function === "object" &&
|
|
9
|
+
typeof toolCall.function.arguments === "string") {
|
|
9
10
|
return {
|
|
10
11
|
name: toolCall.function.name,
|
|
11
|
-
args: toolCall.function.arguments,
|
|
12
|
+
args: JSON.parse(toolCall.function.arguments),
|
|
12
13
|
};
|
|
13
14
|
}
|
|
14
15
|
return toolCall;
|
|
@@ -22,29 +23,117 @@ function _extractToolCalls(messages) {
|
|
|
22
23
|
}
|
|
23
24
|
return toolCalls;
|
|
24
25
|
}
|
|
25
|
-
function _isTrajectorySuperset(outputs, referenceOutputs) {
|
|
26
|
+
async function _isTrajectorySuperset(outputs, referenceOutputs, toolArgsMatchMode, toolArgsMatchOverrides) {
|
|
26
27
|
const outputToolCalls = _extractToolCalls(outputs);
|
|
27
28
|
const referenceToolCalls = _extractToolCalls(referenceOutputs);
|
|
28
|
-
|
|
29
|
-
const
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
29
|
+
// Keep track of which reference tool calls have been matched
|
|
30
|
+
const matchedReferenceCalls = new Set();
|
|
31
|
+
// For each reference tool call, find a matching output tool call
|
|
32
|
+
for (const refCall of referenceToolCalls) {
|
|
33
|
+
const refName = refCall.name;
|
|
34
|
+
const refArgs = refCall.args;
|
|
35
|
+
let foundMatch = false;
|
|
36
|
+
for (let outIdx = 0; outIdx < outputToolCalls.length; outIdx++) {
|
|
37
|
+
const outCall = outputToolCalls[outIdx];
|
|
38
|
+
const outName = outCall.name;
|
|
39
|
+
// Names must match
|
|
40
|
+
if (refName !== outName) {
|
|
41
|
+
continue;
|
|
42
|
+
}
|
|
43
|
+
// If we're already using this output call for a different match, skip
|
|
44
|
+
if (matchedReferenceCalls.has(outIdx)) {
|
|
45
|
+
continue;
|
|
46
|
+
}
|
|
47
|
+
// Check tool args according to match mode
|
|
48
|
+
const matcher = _getMatcherForToolName(refName, toolArgsMatchMode, toolArgsMatchOverrides);
|
|
49
|
+
const outArgs = outCall.args;
|
|
50
|
+
if (await matcher(outArgs, refArgs)) {
|
|
51
|
+
matchedReferenceCalls.add(outIdx);
|
|
52
|
+
foundMatch = true;
|
|
53
|
+
break;
|
|
54
|
+
}
|
|
55
|
+
}
|
|
56
|
+
// If we didn't find a match for this reference call, we're not a superset
|
|
57
|
+
if (!foundMatch) {
|
|
42
58
|
return false;
|
|
43
59
|
}
|
|
44
60
|
}
|
|
45
61
|
return true;
|
|
46
62
|
}
|
|
47
63
|
exports._isTrajectorySuperset = _isTrajectorySuperset;
|
|
64
|
+
// Deep equality check function
|
|
65
|
+
function _deepEqual(a, b) {
|
|
66
|
+
if (a == null && b == null)
|
|
67
|
+
return true;
|
|
68
|
+
if (a === b)
|
|
69
|
+
return true;
|
|
70
|
+
if (typeof a !== "object" || typeof b !== "object" || !a || !b)
|
|
71
|
+
return false;
|
|
72
|
+
if (Array.isArray(a) && Array.isArray(b)) {
|
|
73
|
+
if (a.length !== b.length)
|
|
74
|
+
return false;
|
|
75
|
+
return a.every((val, index) => _deepEqual(val, b[index]));
|
|
76
|
+
}
|
|
77
|
+
const keysA = Object.keys(a);
|
|
78
|
+
const keysB = Object.keys(b);
|
|
79
|
+
if (keysA.length !== keysB.length)
|
|
80
|
+
return false;
|
|
81
|
+
return (keysA.every((key) => keysB.includes(key)) &&
|
|
82
|
+
keysB.every((key) => keysA.includes(key)) &&
|
|
83
|
+
keysA.every((key) => _deepEqual(a[key], b[key])));
|
|
84
|
+
}
|
|
85
|
+
function _exactMatch(toolCall, referenceToolCall) {
|
|
86
|
+
return _deepEqual(toolCall, referenceToolCall);
|
|
87
|
+
}
|
|
88
|
+
function _ignoreMatch(_toolCall, _referenceToolCall) {
|
|
89
|
+
return true;
|
|
90
|
+
}
|
|
91
|
+
function _getMatcherForComparisonMode(mode) {
|
|
92
|
+
if (mode === "exact") {
|
|
93
|
+
return _exactMatch;
|
|
94
|
+
}
|
|
95
|
+
else {
|
|
96
|
+
return _ignoreMatch;
|
|
97
|
+
}
|
|
98
|
+
}
|
|
99
|
+
function _getPartialMatcherOnKeys(keys) {
|
|
100
|
+
const getNestedValue = (d, keyPath) => {
|
|
101
|
+
let current = d;
|
|
102
|
+
for (const part of keyPath.split(".")) {
|
|
103
|
+
if (current && typeof current === "object" && part in current) {
|
|
104
|
+
current = current[part];
|
|
105
|
+
}
|
|
106
|
+
else {
|
|
107
|
+
return undefined;
|
|
108
|
+
}
|
|
109
|
+
}
|
|
110
|
+
return current;
|
|
111
|
+
};
|
|
112
|
+
return (outputCall, referenceCall) => {
|
|
113
|
+
return keys.every((key) => {
|
|
114
|
+
const nestedOutputValue = getNestedValue(outputCall, key);
|
|
115
|
+
const nestedReferenceValue = getNestedValue(referenceCall, key);
|
|
116
|
+
return _deepEqual(nestedOutputValue, nestedReferenceValue);
|
|
117
|
+
});
|
|
118
|
+
};
|
|
119
|
+
}
|
|
120
|
+
function _getMatcherForToolName(toolCallName, toolArgsMatchMode, toolArgsMatchOverrides) {
|
|
121
|
+
let matcher = _getMatcherForComparisonMode(toolArgsMatchMode);
|
|
122
|
+
if (toolArgsMatchOverrides && toolCallName in toolArgsMatchOverrides) {
|
|
123
|
+
const override = toolArgsMatchOverrides[toolCallName];
|
|
124
|
+
if (typeof override === "string") {
|
|
125
|
+
matcher = _getMatcherForComparisonMode(override);
|
|
126
|
+
}
|
|
127
|
+
else if (typeof override === "function") {
|
|
128
|
+
matcher = override;
|
|
129
|
+
}
|
|
130
|
+
else if (Array.isArray(override)) {
|
|
131
|
+
matcher = _getPartialMatcherOnKeys(override);
|
|
132
|
+
}
|
|
133
|
+
}
|
|
134
|
+
return matcher;
|
|
135
|
+
}
|
|
136
|
+
exports._getMatcherForToolName = _getMatcherForToolName;
|
|
48
137
|
function _chatCompletionMessagesToString(messages) {
|
|
49
138
|
function formatMessage(message) {
|
|
50
139
|
let content = message.content ?? "";
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
-
import { ChatCompletionMessage } from "../types.js";
|
|
2
|
-
export declare function _isTrajectorySuperset(outputs: ChatCompletionMessage[], referenceOutputs: ChatCompletionMessage[]): boolean
|
|
1
|
+
import { ChatCompletionMessage, ToolArgsMatchMode, ToolArgsMatchOverrides, ToolArgsMatcher } from "../types.js";
|
|
2
|
+
export declare function _isTrajectorySuperset(outputs: ChatCompletionMessage[], referenceOutputs: ChatCompletionMessage[], toolArgsMatchMode: ToolArgsMatchMode, toolArgsMatchOverrides?: ToolArgsMatchOverrides): Promise<boolean>;
|
|
3
|
+
export declare function _getMatcherForToolName(toolCallName: string, toolArgsMatchMode: ToolArgsMatchMode, toolArgsMatchOverrides?: ToolArgsMatchOverrides): ToolArgsMatcher;
|
|
3
4
|
export declare function _chatCompletionMessagesToString(messages: ChatCompletionMessage[]): string;
|
package/dist/trajectory/utils.js
CHANGED
|
@@ -2,10 +2,11 @@
|
|
|
2
2
|
function _normalizeToolCall(toolCall) {
|
|
3
3
|
if ("function" in toolCall &&
|
|
4
4
|
toolCall.function != null &&
|
|
5
|
-
typeof toolCall.function === "object"
|
|
5
|
+
typeof toolCall.function === "object" &&
|
|
6
|
+
typeof toolCall.function.arguments === "string") {
|
|
6
7
|
return {
|
|
7
8
|
name: toolCall.function.name,
|
|
8
|
-
args: toolCall.function.arguments,
|
|
9
|
+
args: JSON.parse(toolCall.function.arguments),
|
|
9
10
|
};
|
|
10
11
|
}
|
|
11
12
|
return toolCall;
|
|
@@ -19,28 +20,115 @@ function _extractToolCalls(messages) {
|
|
|
19
20
|
}
|
|
20
21
|
return toolCalls;
|
|
21
22
|
}
|
|
22
|
-
export function _isTrajectorySuperset(outputs, referenceOutputs) {
|
|
23
|
+
export async function _isTrajectorySuperset(outputs, referenceOutputs, toolArgsMatchMode, toolArgsMatchOverrides) {
|
|
23
24
|
const outputToolCalls = _extractToolCalls(outputs);
|
|
24
25
|
const referenceToolCalls = _extractToolCalls(referenceOutputs);
|
|
25
|
-
|
|
26
|
-
const
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
26
|
+
// Keep track of which reference tool calls have been matched
|
|
27
|
+
const matchedReferenceCalls = new Set();
|
|
28
|
+
// For each reference tool call, find a matching output tool call
|
|
29
|
+
for (const refCall of referenceToolCalls) {
|
|
30
|
+
const refName = refCall.name;
|
|
31
|
+
const refArgs = refCall.args;
|
|
32
|
+
let foundMatch = false;
|
|
33
|
+
for (let outIdx = 0; outIdx < outputToolCalls.length; outIdx++) {
|
|
34
|
+
const outCall = outputToolCalls[outIdx];
|
|
35
|
+
const outName = outCall.name;
|
|
36
|
+
// Names must match
|
|
37
|
+
if (refName !== outName) {
|
|
38
|
+
continue;
|
|
39
|
+
}
|
|
40
|
+
// If we're already using this output call for a different match, skip
|
|
41
|
+
if (matchedReferenceCalls.has(outIdx)) {
|
|
42
|
+
continue;
|
|
43
|
+
}
|
|
44
|
+
// Check tool args according to match mode
|
|
45
|
+
const matcher = _getMatcherForToolName(refName, toolArgsMatchMode, toolArgsMatchOverrides);
|
|
46
|
+
const outArgs = outCall.args;
|
|
47
|
+
if (await matcher(outArgs, refArgs)) {
|
|
48
|
+
matchedReferenceCalls.add(outIdx);
|
|
49
|
+
foundMatch = true;
|
|
50
|
+
break;
|
|
51
|
+
}
|
|
52
|
+
}
|
|
53
|
+
// If we didn't find a match for this reference call, we're not a superset
|
|
54
|
+
if (!foundMatch) {
|
|
39
55
|
return false;
|
|
40
56
|
}
|
|
41
57
|
}
|
|
42
58
|
return true;
|
|
43
59
|
}
|
|
60
|
+
// Deep equality check function
|
|
61
|
+
function _deepEqual(a, b) {
|
|
62
|
+
if (a == null && b == null)
|
|
63
|
+
return true;
|
|
64
|
+
if (a === b)
|
|
65
|
+
return true;
|
|
66
|
+
if (typeof a !== "object" || typeof b !== "object" || !a || !b)
|
|
67
|
+
return false;
|
|
68
|
+
if (Array.isArray(a) && Array.isArray(b)) {
|
|
69
|
+
if (a.length !== b.length)
|
|
70
|
+
return false;
|
|
71
|
+
return a.every((val, index) => _deepEqual(val, b[index]));
|
|
72
|
+
}
|
|
73
|
+
const keysA = Object.keys(a);
|
|
74
|
+
const keysB = Object.keys(b);
|
|
75
|
+
if (keysA.length !== keysB.length)
|
|
76
|
+
return false;
|
|
77
|
+
return (keysA.every((key) => keysB.includes(key)) &&
|
|
78
|
+
keysB.every((key) => keysA.includes(key)) &&
|
|
79
|
+
keysA.every((key) => _deepEqual(a[key], b[key])));
|
|
80
|
+
}
|
|
81
|
+
function _exactMatch(toolCall, referenceToolCall) {
|
|
82
|
+
return _deepEqual(toolCall, referenceToolCall);
|
|
83
|
+
}
|
|
84
|
+
function _ignoreMatch(_toolCall, _referenceToolCall) {
|
|
85
|
+
return true;
|
|
86
|
+
}
|
|
87
|
+
function _getMatcherForComparisonMode(mode) {
|
|
88
|
+
if (mode === "exact") {
|
|
89
|
+
return _exactMatch;
|
|
90
|
+
}
|
|
91
|
+
else {
|
|
92
|
+
return _ignoreMatch;
|
|
93
|
+
}
|
|
94
|
+
}
|
|
95
|
+
function _getPartialMatcherOnKeys(keys) {
|
|
96
|
+
const getNestedValue = (d, keyPath) => {
|
|
97
|
+
let current = d;
|
|
98
|
+
for (const part of keyPath.split(".")) {
|
|
99
|
+
if (current && typeof current === "object" && part in current) {
|
|
100
|
+
current = current[part];
|
|
101
|
+
}
|
|
102
|
+
else {
|
|
103
|
+
return undefined;
|
|
104
|
+
}
|
|
105
|
+
}
|
|
106
|
+
return current;
|
|
107
|
+
};
|
|
108
|
+
return (outputCall, referenceCall) => {
|
|
109
|
+
return keys.every((key) => {
|
|
110
|
+
const nestedOutputValue = getNestedValue(outputCall, key);
|
|
111
|
+
const nestedReferenceValue = getNestedValue(referenceCall, key);
|
|
112
|
+
return _deepEqual(nestedOutputValue, nestedReferenceValue);
|
|
113
|
+
});
|
|
114
|
+
};
|
|
115
|
+
}
|
|
116
|
+
export function _getMatcherForToolName(toolCallName, toolArgsMatchMode, toolArgsMatchOverrides) {
|
|
117
|
+
let matcher = _getMatcherForComparisonMode(toolArgsMatchMode);
|
|
118
|
+
if (toolArgsMatchOverrides && toolCallName in toolArgsMatchOverrides) {
|
|
119
|
+
const override = toolArgsMatchOverrides[toolCallName];
|
|
120
|
+
if (typeof override === "string") {
|
|
121
|
+
matcher = _getMatcherForComparisonMode(override);
|
|
122
|
+
}
|
|
123
|
+
else if (typeof override === "function") {
|
|
124
|
+
matcher = override;
|
|
125
|
+
}
|
|
126
|
+
else if (Array.isArray(override)) {
|
|
127
|
+
matcher = _getPartialMatcherOnKeys(override);
|
|
128
|
+
}
|
|
129
|
+
}
|
|
130
|
+
return matcher;
|
|
131
|
+
}
|
|
44
132
|
export function _chatCompletionMessagesToString(messages) {
|
|
45
133
|
function formatMessage(message) {
|
|
46
134
|
let content = message.content ?? "";
|
package/dist/types.d.ts
CHANGED
|
@@ -12,3 +12,6 @@ export type ExtractedLangGraphThreadTrajectory = {
|
|
|
12
12
|
export type TrajectoryLLMAsJudgeParams = Omit<Parameters<typeof createLLMAsJudge>[0], "prompt"> & {
|
|
13
13
|
prompt?: string;
|
|
14
14
|
};
|
|
15
|
+
export type ToolArgsMatchMode = "exact" | "ignore";
|
|
16
|
+
export type ToolArgsMatcher = (toolCall: Record<string, unknown>, referenceToolCall: Record<string, unknown>) => boolean | Promise<boolean>;
|
|
17
|
+
export type ToolArgsMatchOverrides = Record<string, ToolArgsMatchMode | string[] | ToolArgsMatcher>;
|