agentevals 0.0.2 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,12 @@
1
1
  import { _normalizeToOpenAIMessagesList, _runEvaluator } from "../utils.js";
2
2
  import { _isTrajectorySuperset } from "./utils.js";
3
+ export const _scorer = async (params) => {
4
+ const isUnorderedMatch = (await _isTrajectorySuperset(params.outputs, params.referenceOutputs, params.toolArgsMatchMode, params.toolArgsMatchOverrides)) &&
5
+ (await _isTrajectorySuperset(params.referenceOutputs, params.outputs, params.toolArgsMatchMode, params.toolArgsMatchOverrides));
6
+ return isUnorderedMatch;
7
+ };
3
8
  /**
9
+ * @deprecated Use `createTrajectoryMatchEvaluator` with `trajectoryMatchMode: "unordered"` instead.
4
10
  * Evaluate whether an input agent trajectory and called tools contains all the tools used in a reference trajectory.
5
11
  * This accounts for some differences in an LLM's reasoning process in a case-by-case basis.
6
12
  *
@@ -17,13 +23,10 @@ export async function trajectoryUnorderedMatch(params) {
17
23
  const { outputs, referenceOutputs } = params;
18
24
  const outputsList = _normalizeToOpenAIMessagesList(outputs);
19
25
  const referenceOutputsList = _normalizeToOpenAIMessagesList(referenceOutputs);
20
- const getScore = async () => {
21
- if (outputsList == null || referenceOutputsList == null) {
22
- throw new Error("Trajectory unordered match requires both outputs and reference_outputs");
23
- }
24
- const unorderedMatch = _isTrajectorySuperset(outputsList, referenceOutputsList) &&
25
- _isTrajectorySuperset(referenceOutputsList, outputsList);
26
- return unorderedMatch;
27
- };
28
- return _runEvaluator("trajectory_unordered_match", getScore, "trajectory_unordered_match", params);
26
+ return _runEvaluator("trajectory_unordered_match", _scorer, "trajectory_unordered_match", {
27
+ ...params,
28
+ outputs: outputsList,
29
+ referenceOutputs: referenceOutputsList,
30
+ toolArgsMatchMode: "ignore",
31
+ });
29
32
  }
@@ -1,14 +1,15 @@
1
1
  "use strict";
2
2
  Object.defineProperty(exports, "__esModule", { value: true });
3
- exports._chatCompletionMessagesToString = exports._isTrajectorySuperset = void 0;
3
+ exports._chatCompletionMessagesToString = exports._getMatcherForToolName = exports._isTrajectorySuperset = void 0;
4
4
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
5
5
  function _normalizeToolCall(toolCall) {
6
6
  if ("function" in toolCall &&
7
7
  toolCall.function != null &&
8
- typeof toolCall.function === "object") {
8
+ typeof toolCall.function === "object" &&
9
+ typeof toolCall.function.arguments === "string") {
9
10
  return {
10
11
  name: toolCall.function.name,
11
- args: toolCall.function.arguments,
12
+ args: JSON.parse(toolCall.function.arguments),
12
13
  };
13
14
  }
14
15
  return toolCall;
@@ -22,29 +23,117 @@ function _extractToolCalls(messages) {
22
23
  }
23
24
  return toolCalls;
24
25
  }
25
- function _isTrajectorySuperset(outputs, referenceOutputs) {
26
+ async function _isTrajectorySuperset(outputs, referenceOutputs, toolArgsMatchMode, toolArgsMatchOverrides) {
26
27
  const outputToolCalls = _extractToolCalls(outputs);
27
28
  const referenceToolCalls = _extractToolCalls(referenceOutputs);
28
- const outputToolCounts = new Map();
29
- const referenceToolCounts = new Map();
30
- for (const call of outputToolCalls) {
31
- outputToolCounts.set(call.name, (outputToolCounts.get(call.name) ?? 0) + 1);
32
- }
33
- for (const call of referenceToolCalls) {
34
- referenceToolCounts.set(call.name, (referenceToolCounts.get(call.name) ?? 0) + 1);
35
- }
36
- const allTools = new Set([
37
- ...outputToolCounts.keys(),
38
- ...referenceToolCounts.keys(),
39
- ]);
40
- for (const name of allTools) {
41
- if ((outputToolCounts.get(name) ?? 0) < (referenceToolCounts.get(name) ?? 0)) {
29
+ // Keep track of which output tool calls (by index) have already been paired with a reference call
30
+ const matchedReferenceCalls = new Set();
31
+ // For each reference tool call, find a matching output tool call
32
+ for (const refCall of referenceToolCalls) {
33
+ const refName = refCall.name;
34
+ const refArgs = refCall.args;
35
+ let foundMatch = false;
36
+ for (let outIdx = 0; outIdx < outputToolCalls.length; outIdx++) {
37
+ const outCall = outputToolCalls[outIdx];
38
+ const outName = outCall.name;
39
+ // Names must match
40
+ if (refName !== outName) {
41
+ continue;
42
+ }
43
+ // If we're already using this output call for a different match, skip
44
+ if (matchedReferenceCalls.has(outIdx)) {
45
+ continue;
46
+ }
47
+ // Check tool args according to match mode
48
+ const matcher = _getMatcherForToolName(refName, toolArgsMatchMode, toolArgsMatchOverrides);
49
+ const outArgs = outCall.args;
50
+ if (await matcher(outArgs, refArgs)) {
51
+ matchedReferenceCalls.add(outIdx);
52
+ foundMatch = true;
53
+ break;
54
+ }
55
+ }
56
+ // If we didn't find a match for this reference call, we're not a superset
57
+ if (!foundMatch) {
42
58
  return false;
43
59
  }
44
60
  }
45
61
  return true;
46
62
  }
47
63
  exports._isTrajectorySuperset = _isTrajectorySuperset;
64
+ // Deep equality check function
65
+ function _deepEqual(a, b) {
66
+ if (a == null && b == null)
67
+ return true;
68
+ if (a === b)
69
+ return true;
70
+ if (typeof a !== "object" || typeof b !== "object" || !a || !b)
71
+ return false;
72
+ if (Array.isArray(a) && Array.isArray(b)) {
73
+ if (a.length !== b.length)
74
+ return false;
75
+ return a.every((val, index) => _deepEqual(val, b[index]));
76
+ }
77
+ const keysA = Object.keys(a);
78
+ const keysB = Object.keys(b);
79
+ if (keysA.length !== keysB.length)
80
+ return false;
81
+ return (keysA.every((key) => keysB.includes(key)) &&
82
+ keysB.every((key) => keysA.includes(key)) &&
83
+ keysA.every((key) => _deepEqual(a[key], b[key])));
84
+ }
85
+ function _exactMatch(toolCall, referenceToolCall) {
86
+ return _deepEqual(toolCall, referenceToolCall);
87
+ }
88
+ function _ignoreMatch(_toolCall, _referenceToolCall) {
89
+ return true;
90
+ }
91
+ function _getMatcherForComparisonMode(mode) {
92
+ if (mode === "exact") {
93
+ return _exactMatch;
94
+ }
95
+ else {
96
+ return _ignoreMatch;
97
+ }
98
+ }
99
+ function _getPartialMatcherOnKeys(keys) {
100
+ const getNestedValue = (d, keyPath) => {
101
+ let current = d;
102
+ for (const part of keyPath.split(".")) {
103
+ if (current && typeof current === "object" && part in current) {
104
+ current = current[part];
105
+ }
106
+ else {
107
+ return undefined;
108
+ }
109
+ }
110
+ return current;
111
+ };
112
+ return (outputCall, referenceCall) => {
113
+ return keys.every((key) => {
114
+ const nestedOutputValue = getNestedValue(outputCall, key);
115
+ const nestedReferenceValue = getNestedValue(referenceCall, key);
116
+ return _deepEqual(nestedOutputValue, nestedReferenceValue);
117
+ });
118
+ };
119
+ }
120
+ function _getMatcherForToolName(toolCallName, toolArgsMatchMode, toolArgsMatchOverrides) {
121
+ let matcher = _getMatcherForComparisonMode(toolArgsMatchMode);
122
+ if (toolArgsMatchOverrides && toolCallName in toolArgsMatchOverrides) {
123
+ const override = toolArgsMatchOverrides[toolCallName];
124
+ if (typeof override === "string") {
125
+ matcher = _getMatcherForComparisonMode(override);
126
+ }
127
+ else if (typeof override === "function") {
128
+ matcher = override;
129
+ }
130
+ else if (Array.isArray(override)) {
131
+ matcher = _getPartialMatcherOnKeys(override);
132
+ }
133
+ }
134
+ return matcher;
135
+ }
136
+ exports._getMatcherForToolName = _getMatcherForToolName;
48
137
  function _chatCompletionMessagesToString(messages) {
49
138
  function formatMessage(message) {
50
139
  let content = message.content ?? "";
@@ -1,3 +1,4 @@
1
- import { ChatCompletionMessage } from "../types.js";
2
- export declare function _isTrajectorySuperset(outputs: ChatCompletionMessage[], referenceOutputs: ChatCompletionMessage[]): boolean;
1
+ import { ChatCompletionMessage, ToolArgsMatchMode, ToolArgsMatchOverrides, ToolArgsMatcher } from "../types.js";
2
+ export declare function _isTrajectorySuperset(outputs: ChatCompletionMessage[], referenceOutputs: ChatCompletionMessage[], toolArgsMatchMode: ToolArgsMatchMode, toolArgsMatchOverrides?: ToolArgsMatchOverrides): Promise<boolean>;
3
+ export declare function _getMatcherForToolName(toolCallName: string, toolArgsMatchMode: ToolArgsMatchMode, toolArgsMatchOverrides?: ToolArgsMatchOverrides): ToolArgsMatcher;
3
4
  export declare function _chatCompletionMessagesToString(messages: ChatCompletionMessage[]): string;
@@ -2,10 +2,11 @@
2
2
  function _normalizeToolCall(toolCall) {
3
3
  if ("function" in toolCall &&
4
4
  toolCall.function != null &&
5
- typeof toolCall.function === "object") {
5
+ typeof toolCall.function === "object" &&
6
+ typeof toolCall.function.arguments === "string") {
6
7
  return {
7
8
  name: toolCall.function.name,
8
- args: toolCall.function.arguments,
9
+ args: JSON.parse(toolCall.function.arguments),
9
10
  };
10
11
  }
11
12
  return toolCall;
@@ -19,28 +20,115 @@ function _extractToolCalls(messages) {
19
20
  }
20
21
  return toolCalls;
21
22
  }
22
- export function _isTrajectorySuperset(outputs, referenceOutputs) {
23
+ export async function _isTrajectorySuperset(outputs, referenceOutputs, toolArgsMatchMode, toolArgsMatchOverrides) {
23
24
  const outputToolCalls = _extractToolCalls(outputs);
24
25
  const referenceToolCalls = _extractToolCalls(referenceOutputs);
25
- const outputToolCounts = new Map();
26
- const referenceToolCounts = new Map();
27
- for (const call of outputToolCalls) {
28
- outputToolCounts.set(call.name, (outputToolCounts.get(call.name) ?? 0) + 1);
29
- }
30
- for (const call of referenceToolCalls) {
31
- referenceToolCounts.set(call.name, (referenceToolCounts.get(call.name) ?? 0) + 1);
32
- }
33
- const allTools = new Set([
34
- ...outputToolCounts.keys(),
35
- ...referenceToolCounts.keys(),
36
- ]);
37
- for (const name of allTools) {
38
- if ((outputToolCounts.get(name) ?? 0) < (referenceToolCounts.get(name) ?? 0)) {
26
+ // Keep track of which output tool calls (by index) have already been paired with a reference call
27
+ const matchedReferenceCalls = new Set();
28
+ // For each reference tool call, find a matching output tool call
29
+ for (const refCall of referenceToolCalls) {
30
+ const refName = refCall.name;
31
+ const refArgs = refCall.args;
32
+ let foundMatch = false;
33
+ for (let outIdx = 0; outIdx < outputToolCalls.length; outIdx++) {
34
+ const outCall = outputToolCalls[outIdx];
35
+ const outName = outCall.name;
36
+ // Names must match
37
+ if (refName !== outName) {
38
+ continue;
39
+ }
40
+ // If we're already using this output call for a different match, skip
41
+ if (matchedReferenceCalls.has(outIdx)) {
42
+ continue;
43
+ }
44
+ // Check tool args according to match mode
45
+ const matcher = _getMatcherForToolName(refName, toolArgsMatchMode, toolArgsMatchOverrides);
46
+ const outArgs = outCall.args;
47
+ if (await matcher(outArgs, refArgs)) {
48
+ matchedReferenceCalls.add(outIdx);
49
+ foundMatch = true;
50
+ break;
51
+ }
52
+ }
53
+ // If we didn't find a match for this reference call, we're not a superset
54
+ if (!foundMatch) {
39
55
  return false;
40
56
  }
41
57
  }
42
58
  return true;
43
59
  }
60
+ // Deep equality check function
61
+ function _deepEqual(a, b) {
62
+ if (a == null && b == null)
63
+ return true;
64
+ if (a === b)
65
+ return true;
66
+ if (typeof a !== "object" || typeof b !== "object" || !a || !b)
67
+ return false;
68
+ if (Array.isArray(a) && Array.isArray(b)) {
69
+ if (a.length !== b.length)
70
+ return false;
71
+ return a.every((val, index) => _deepEqual(val, b[index]));
72
+ }
73
+ const keysA = Object.keys(a);
74
+ const keysB = Object.keys(b);
75
+ if (keysA.length !== keysB.length)
76
+ return false;
77
+ return (keysA.every((key) => keysB.includes(key)) &&
78
+ keysB.every((key) => keysA.includes(key)) &&
79
+ keysA.every((key) => _deepEqual(a[key], b[key])));
80
+ }
81
+ function _exactMatch(toolCall, referenceToolCall) {
82
+ return _deepEqual(toolCall, referenceToolCall);
83
+ }
84
+ function _ignoreMatch(_toolCall, _referenceToolCall) {
85
+ return true;
86
+ }
87
+ function _getMatcherForComparisonMode(mode) {
88
+ if (mode === "exact") {
89
+ return _exactMatch;
90
+ }
91
+ else {
92
+ return _ignoreMatch;
93
+ }
94
+ }
95
+ function _getPartialMatcherOnKeys(keys) {
96
+ const getNestedValue = (d, keyPath) => {
97
+ let current = d;
98
+ for (const part of keyPath.split(".")) {
99
+ if (current && typeof current === "object" && part in current) {
100
+ current = current[part];
101
+ }
102
+ else {
103
+ return undefined;
104
+ }
105
+ }
106
+ return current;
107
+ };
108
+ return (outputCall, referenceCall) => {
109
+ return keys.every((key) => {
110
+ const nestedOutputValue = getNestedValue(outputCall, key);
111
+ const nestedReferenceValue = getNestedValue(referenceCall, key);
112
+ return _deepEqual(nestedOutputValue, nestedReferenceValue);
113
+ });
114
+ };
115
+ }
116
+ export function _getMatcherForToolName(toolCallName, toolArgsMatchMode, toolArgsMatchOverrides) {
117
+ let matcher = _getMatcherForComparisonMode(toolArgsMatchMode);
118
+ if (toolArgsMatchOverrides && toolCallName in toolArgsMatchOverrides) {
119
+ const override = toolArgsMatchOverrides[toolCallName];
120
+ if (typeof override === "string") {
121
+ matcher = _getMatcherForComparisonMode(override);
122
+ }
123
+ else if (typeof override === "function") {
124
+ matcher = override;
125
+ }
126
+ else if (Array.isArray(override)) {
127
+ matcher = _getPartialMatcherOnKeys(override);
128
+ }
129
+ }
130
+ return matcher;
131
+ }
44
132
  export function _chatCompletionMessagesToString(messages) {
45
133
  function formatMessage(message) {
46
134
  let content = message.content ?? "";
package/dist/types.d.ts CHANGED
@@ -12,3 +12,6 @@ export type ExtractedLangGraphThreadTrajectory = {
12
12
  export type TrajectoryLLMAsJudgeParams = Omit<Parameters<typeof createLLMAsJudge>[0], "prompt"> & {
13
13
  prompt?: string;
14
14
  };
15
+ export type ToolArgsMatchMode = "exact" | "ignore";
16
+ export type ToolArgsMatcher = (toolCall: Record<string, unknown>, referenceToolCall: Record<string, unknown>) => boolean | Promise<boolean>;
17
+ export type ToolArgsMatchOverrides = Record<string, ToolArgsMatchMode | string[] | ToolArgsMatcher>;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "agentevals",
3
- "version": "0.0.2",
3
+ "version": "0.0.4",
4
4
  "packageManager": "yarn@3.5.1",
5
5
  "type": "module",
6
6
  "scripts": {