agentevals 0.0.2 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,61 @@
1
+ import { BaseMessage } from "@langchain/core/messages";
2
+ import { ChatCompletionMessage, ToolArgsMatchMode, ToolArgsMatchOverrides } from "../types.js";
3
+ export type TrajectoryMatchMode = "strict" | "unordered" | "subset" | "superset";
4
+ /**
5
+ * Creates an evaluator that compares trajectories between model outputs and reference outputs.
6
+ *
7
+ * @param options - The configuration options
8
+ * @param options.trajectoryMatchMode - The mode for matching trajectories:
9
+ * - `"strict"`: Requires exact match in order and content
10
+ * - `"unordered"`: Allows matching in any order
11
+ * - `"subset"`: Accepts if output trajectory is a subset of reference
12
+ * - `"superset"`: Accepts if output trajectory is a superset of reference
13
+ * @param options.toolArgsMatchMode - Mode for matching tool arguments ("exact" by default, can be "ignore")
14
+ * @param options.toolArgsMatchOverrides - Object containing custom overrides for tool argument matching.
15
+ * Each key should be a tool name, and each value should be either a match mode or a matcher function.
16
+ * Matchers should be a function that takes two sets of tool call args and returns whether they are equal.
17
+ *
18
+ * @returns An async function that evaluates trajectory matches between outputs and references.
19
+ * The returned evaluator accepts:
20
+ * - outputs: List of messages or dict representing the model output trajectory
21
+ * - referenceOutputs: List of messages or dict representing the reference trajectory
22
+ * - Additional arguments passed to the underlying evaluator
23
+ *
24
+ * @example
25
+ * ```typescript
26
+ * const matcher = (
27
+ * outputToolCallArgs: Record<string, any>,
28
+ * referenceToolCallArgs: Record<string, any>
29
+ * ): boolean => {
30
+ * const outputArgs = (outputToolCallArgs.query ?? "").toLowerCase();
31
+ * const referenceArgs = (referenceToolCallArgs.query ?? "").toLowerCase();
32
+ * return outputArgs === referenceArgs;
33
+ * };
34
+ *
35
+ * const evaluator = createTrajectoryMatchEvaluator({
36
+ * trajectoryMatchMode: "strict",
37
+ * toolArgsMatchMode: "exact",
38
+ * toolArgsMatchOverrides: {
39
+ * myToolName: matcher,
40
+ * },
41
+ * });
42
+ *
43
+ * const result = await evaluator({
44
+ * outputs: [...],
45
+ * referenceOutputs: [...],
46
+ * });
47
+ * ```
48
+ */
49
+ export declare function createTrajectoryMatchEvaluator({ trajectoryMatchMode, toolArgsMatchMode, toolArgsMatchOverrides, }: {
50
+ trajectoryMatchMode?: TrajectoryMatchMode;
51
+ toolArgsMatchMode?: ToolArgsMatchMode;
52
+ toolArgsMatchOverrides?: ToolArgsMatchOverrides;
53
+ }): ({ outputs, referenceOutputs, ...extra }: {
54
+ [key: string]: unknown;
55
+ outputs: ChatCompletionMessage[] | BaseMessage[] | {
56
+ messages: (BaseMessage | ChatCompletionMessage)[];
57
+ };
58
+ referenceOutputs: ChatCompletionMessage[] | BaseMessage[] | {
59
+ messages: (BaseMessage | ChatCompletionMessage)[];
60
+ };
61
+ }) => Promise<import("langsmith/vitest").SimpleEvaluationResult>;
@@ -0,0 +1,80 @@
1
+ import { _normalizeToOpenAIMessagesList, _runEvaluator } from "../utils.js";
2
+ import { _scorer as trajectoryStrictScorer } from "./strict.js";
3
+ import { _scorer as trajectoryUnorderedScorer } from "./unordered.js";
4
+ import { _scorer as trajectorySubsetScorer } from "./subset.js";
5
+ import { _scorer as trajectorySuperstScorer } from "./superset.js";
6
+ /**
7
+ * Creates an evaluator that compares trajectories between model outputs and reference outputs.
8
+ *
9
+ * @param options - The configuration options
10
+ * @param options.trajectoryMatchMode - The mode for matching trajectories:
11
+ * - `"strict"`: Requires exact match in order and content
12
+ * - `"unordered"`: Allows matching in any order
13
+ * - `"subset"`: Accepts if output trajectory is a subset of reference
14
+ * - `"superset"`: Accepts if output trajectory is a superset of reference
15
+ * @param options.toolArgsMatchMode - Mode for matching tool arguments ("exact" by default, can be "ignore")
16
+ * @param options.toolArgsMatchOverrides - Object containing custom overrides for tool argument matching.
17
+ * Each key should be a tool name, and each value should be either a match mode or a matcher function.
18
+ * Matchers should be a function that takes two sets of tool call args and returns whether they are equal.
19
+ *
20
+ * @returns An async function that evaluates trajectory matches between outputs and references.
21
+ * The returned evaluator accepts:
22
+ * - outputs: List of messages or dict representing the model output trajectory
23
+ * - referenceOutputs: List of messages or dict representing the reference trajectory
24
+ * - Additional arguments passed to the underlying evaluator
25
+ *
26
+ * @example
27
+ * ```typescript
28
+ * const matcher = (
29
+ * outputToolCallArgs: Record<string, any>,
30
+ * referenceToolCallArgs: Record<string, any>
31
+ * ): boolean => {
32
+ * const outputArgs = (outputToolCallArgs.query ?? "").toLowerCase();
33
+ * const referenceArgs = (referenceToolCallArgs.query ?? "").toLowerCase();
34
+ * return outputArgs === referenceArgs;
35
+ * };
36
+ *
37
+ * const evaluator = createTrajectoryMatchEvaluator({
38
+ * trajectoryMatchMode: "strict",
39
+ * toolArgsMatchMode: "exact",
40
+ * toolArgsMatchOverrides: {
41
+ * myToolName: matcher,
42
+ * },
43
+ * });
44
+ *
45
+ * const result = await evaluator({
46
+ * outputs: [...],
47
+ * referenceOutputs: [...],
48
+ * });
49
+ * ```
50
+ */
51
+ export function createTrajectoryMatchEvaluator({ trajectoryMatchMode = "strict", toolArgsMatchMode = "exact", toolArgsMatchOverrides, }) {
52
+ let scorer;
53
+ switch (trajectoryMatchMode) {
54
+ case "strict":
55
+ scorer = trajectoryStrictScorer;
56
+ break;
57
+ case "unordered":
58
+ scorer = trajectoryUnorderedScorer;
59
+ break;
60
+ case "subset":
61
+ scorer = trajectorySubsetScorer;
62
+ break;
63
+ case "superset":
64
+ scorer = trajectorySuperstScorer;
65
+ break;
66
+ default:
67
+ throw new Error(`Invalid trajectory match type: ${trajectoryMatchMode}`);
68
+ }
69
+ return async function _wrappedEvaluator({ outputs, referenceOutputs, ...extra }) {
70
+ const normalizedOutputs = _normalizeToOpenAIMessagesList(outputs);
71
+ const normalizedReferenceOutputs = _normalizeToOpenAIMessagesList(referenceOutputs);
72
+ return _runEvaluator(`trajectory_${trajectoryMatchMode}_match`, scorer, `trajectory_${trajectoryMatchMode}_match`, {
73
+ outputs: normalizedOutputs,
74
+ referenceOutputs: normalizedReferenceOutputs,
75
+ toolArgsMatchMode,
76
+ toolArgsMatchOverrides,
77
+ ...extra,
78
+ });
79
+ };
80
+ }
@@ -1,9 +1,10 @@
1
1
  "use strict";
2
2
  Object.defineProperty(exports, "__esModule", { value: true });
3
- exports.trajectoryStrictMatch = void 0;
3
+ exports.trajectoryStrictMatch = exports._scorer = void 0;
4
4
  const utils_js_1 = require("../utils.cjs");
5
- function _scorer(params) {
6
- const { outputs, referenceOutputs, toolCallArgsExactMatch = true, messageContentExactMatch = false, } = params;
5
+ const utils_js_2 = require("./utils.cjs");
6
+ async function _scorer(params) {
7
+ const { outputs, referenceOutputs, toolArgsMatchMode, toolArgsMatchOverrides, } = params;
7
8
  const normalizedOutputs = (0, utils_js_1._normalizeToOpenAIMessagesList)(outputs);
8
9
  const normalizedReferenceOutputs = (0, utils_js_1._normalizeToOpenAIMessagesList)(referenceOutputs);
9
10
  if (!normalizedOutputs || !normalizedReferenceOutputs) {
@@ -12,63 +13,62 @@ function _scorer(params) {
12
13
  if (normalizedOutputs.length !== normalizedReferenceOutputs.length) {
13
14
  return false;
14
15
  }
15
- let exactMatch = true;
16
16
  for (let i = 0; i < normalizedOutputs.length; i++) {
17
17
  const output = normalizedOutputs[i];
18
18
  const referenceOutput = normalizedReferenceOutputs[i];
19
19
  if (output.role !== referenceOutput.role) {
20
- exactMatch = false;
21
- break;
20
+ return false;
22
21
  }
23
22
  const outputHasToolCalls = output.tool_calls != null;
24
23
  const referenceHasToolCalls = referenceOutput.tool_calls != null;
25
24
  if (outputHasToolCalls !== referenceHasToolCalls) {
26
- exactMatch = false;
27
- break;
25
+ return false;
28
26
  }
29
27
  if (outputHasToolCalls) {
30
28
  if (output.tool_calls.length !== referenceOutput.tool_calls.length) {
31
- exactMatch = false;
32
- break;
29
+ return false;
33
30
  }
34
- for (let j = 0; j < output.tool_calls.length; j++) {
35
- if (output.tool_calls[j].function.name !==
36
- referenceOutput.tool_calls[j].function.name) {
37
- exactMatch = false;
38
- break;
31
+ const referenceCalls = referenceOutput.tool_calls ?? [];
32
+ const seen = new Array(referenceCalls.length).fill(false);
33
+ for (const outputCall of output.tool_calls ?? []) {
34
+ let foundMatch = false;
35
+ for (let i = 0; i < referenceCalls.length; i++) {
36
+ const referenceCall = referenceCalls[i];
37
+ if (!seen[i] &&
38
+ outputCall.function?.name === referenceCall.function?.name) {
39
+ const matcher = (0, utils_js_2._getMatcherForToolName)(outputCall.function?.name ?? "", toolArgsMatchMode, toolArgsMatchOverrides);
40
+ if (await matcher(JSON.parse(outputCall.function?.arguments ?? "{}"), JSON.parse(referenceCall.function?.arguments ?? "{}"))) {
41
+ foundMatch = true;
42
+ seen[i] = true;
43
+ break;
44
+ }
45
+ }
39
46
  }
40
- if (toolCallArgsExactMatch &&
41
- output.tool_calls[j].function.arguments !==
42
- referenceOutput.tool_calls[j].function.arguments) {
43
- exactMatch = false;
44
- break;
47
+ if (!foundMatch) {
48
+ return false;
45
49
  }
46
50
  }
47
51
  }
48
- if (messageContentExactMatch &&
49
- output.content !== referenceOutput.content) {
50
- exactMatch = false;
51
- break;
52
- }
53
52
  }
54
- return exactMatch;
53
+ return true;
55
54
  }
55
+ exports._scorer = _scorer;
56
+ /**
57
+ * @deprecated Use `createTrajectoryMatchEvaluator` with `trajectoryMatchMode: "strict"` instead.
58
+ * Evaluate whether an input agent trajectory and called tools strictly matches a reference trajectory.
59
+ * This means that at each step, the agent called the same tools in the same order as specified in the reference trajectory.
60
+ *
61
+ * @param outputs - Actual trajectory the agent followed. May be a list of OpenAI messages,
62
+ * a list of LangChain messages, or a dictionary containing a "messages" key with one of the above.
63
+ * @param referenceOutputs - Ideal reference trajectory the agent should have followed. May be a list of OpenAI messages,
64
+ * a list of LangChain messages, or a dictionary containing a "messages" key with one of the above.
65
+ * @param toolCallArgsExactMatch - Whether to require exact matches for tool call arguments
66
+ * @returns EvaluatorResult containing a score of true if trajectory (including called tools) matches, false otherwise
67
+ */
56
68
  async function trajectoryStrictMatch(params) {
57
- /**
58
- * Evaluate whether an input agent trajectory and called tools strictly matches a reference trajectory.
59
- * This means that at each step, the agent called the same tools in the same order as specified in the reference trajectory.
60
- *
61
- * @param outputs - Actual trajectory the agent followed. May be a list of OpenAI messages,
62
- * a list of LangChain messages, or a dictionary containing a "messages" key with one of the above.
63
- * @param referenceOutputs - Ideal reference trajectory the agent should have followed. May be a list of OpenAI messages,
64
- * a list of LangChain messages, or a dictionary containing a "messages" key with one of the above.
65
- * @param toolCallArgsExactMatch - Whether to require exact matches for tool call arguments
66
- * @param messageContentExactMatch - Whether to require exact matches for message content
67
- * @returns EvaluatorResult containing a score of true if trajectory (including called tools) matches, false otherwise
68
- */
69
- function _wrapper() {
70
- return _scorer(params);
71
- }
72
- return (0, utils_js_1._runEvaluator)("trajectory_strict_match", _wrapper, "trajectory_strict_match", params);
69
+ return (0, utils_js_1._runEvaluator)("trajectory_strict_match", _scorer, "trajectory_strict_match", {
70
+ ...params,
71
+ toolArgsMatchMode: params.toolCallArgsExactMatch ? "exact" : "ignore",
72
+ });
73
73
  }
74
74
  exports.trajectoryStrictMatch = trajectoryStrictMatch;
@@ -1,5 +1,27 @@
1
1
  import { BaseMessage } from "@langchain/core/messages";
2
- import { ChatCompletionMessage, EvaluatorResult } from "../types.js";
2
+ import { ChatCompletionMessage, EvaluatorResult, ToolArgsMatchMode, ToolArgsMatchOverrides } from "../types.js";
3
+ export declare function _scorer(params: {
4
+ outputs: ChatCompletionMessage[] | BaseMessage[] | {
5
+ messages: (BaseMessage | ChatCompletionMessage)[];
6
+ };
7
+ referenceOutputs: ChatCompletionMessage[] | BaseMessage[] | {
8
+ messages: (BaseMessage | ChatCompletionMessage)[];
9
+ };
10
+ toolArgsMatchMode: ToolArgsMatchMode;
11
+ toolArgsMatchOverrides?: ToolArgsMatchOverrides;
12
+ }): Promise<boolean>;
13
+ /**
14
+ * @deprecated Use `createTrajectoryMatchEvaluator` with `trajectoryMatchMode: "strict"` instead.
15
+ * Evaluate whether an input agent trajectory and called tools strictly matches a reference trajectory.
16
+ * This means that at each step, the agent called the same tools in the same order as specified in the reference trajectory.
17
+ *
18
+ * @param outputs - Actual trajectory the agent followed. May be a list of OpenAI messages,
19
+ * a list of LangChain messages, or a dictionary containing a "messages" key with one of the above.
20
+ * @param referenceOutputs - Ideal reference trajectory the agent should have followed. May be a list of OpenAI messages,
21
+ * a list of LangChain messages, or a dictionary containing a "messages" key with one of the above.
22
+ * @param toolCallArgsExactMatch - Whether to require exact matches for tool call arguments
23
+ * @returns EvaluatorResult containing a score of true if trajectory (including called tools) matches, false otherwise
24
+ */
3
25
  export declare function trajectoryStrictMatch(params: {
4
26
  outputs: ChatCompletionMessage[] | BaseMessage[] | {
5
27
  messages: (BaseMessage | ChatCompletionMessage)[];
@@ -8,5 +30,4 @@ export declare function trajectoryStrictMatch(params: {
8
30
  messages: (BaseMessage | ChatCompletionMessage)[];
9
31
  };
10
32
  toolCallArgsExactMatch: boolean;
11
- messageContentExactMatch: boolean;
12
33
  }): Promise<EvaluatorResult>;
@@ -1,6 +1,7 @@
1
1
  import { _normalizeToOpenAIMessagesList, _runEvaluator } from "../utils.js";
2
- function _scorer(params) {
3
- const { outputs, referenceOutputs, toolCallArgsExactMatch = true, messageContentExactMatch = false, } = params;
2
+ import { _getMatcherForToolName } from "./utils.js";
3
+ export async function _scorer(params) {
4
+ const { outputs, referenceOutputs, toolArgsMatchMode, toolArgsMatchOverrides, } = params;
4
5
  const normalizedOutputs = _normalizeToOpenAIMessagesList(outputs);
5
6
  const normalizedReferenceOutputs = _normalizeToOpenAIMessagesList(referenceOutputs);
6
7
  if (!normalizedOutputs || !normalizedReferenceOutputs) {
@@ -9,62 +10,60 @@ function _scorer(params) {
9
10
  if (normalizedOutputs.length !== normalizedReferenceOutputs.length) {
10
11
  return false;
11
12
  }
12
- let exactMatch = true;
13
13
  for (let i = 0; i < normalizedOutputs.length; i++) {
14
14
  const output = normalizedOutputs[i];
15
15
  const referenceOutput = normalizedReferenceOutputs[i];
16
16
  if (output.role !== referenceOutput.role) {
17
- exactMatch = false;
18
- break;
17
+ return false;
19
18
  }
20
19
  const outputHasToolCalls = output.tool_calls != null;
21
20
  const referenceHasToolCalls = referenceOutput.tool_calls != null;
22
21
  if (outputHasToolCalls !== referenceHasToolCalls) {
23
- exactMatch = false;
24
- break;
22
+ return false;
25
23
  }
26
24
  if (outputHasToolCalls) {
27
25
  if (output.tool_calls.length !== referenceOutput.tool_calls.length) {
28
- exactMatch = false;
29
- break;
26
+ return false;
30
27
  }
31
- for (let j = 0; j < output.tool_calls.length; j++) {
32
- if (output.tool_calls[j].function.name !==
33
- referenceOutput.tool_calls[j].function.name) {
34
- exactMatch = false;
35
- break;
28
+ const referenceCalls = referenceOutput.tool_calls ?? [];
29
+ const seen = new Array(referenceCalls.length).fill(false);
30
+ for (const outputCall of output.tool_calls ?? []) {
31
+ let foundMatch = false;
32
+ for (let i = 0; i < referenceCalls.length; i++) {
33
+ const referenceCall = referenceCalls[i];
34
+ if (!seen[i] &&
35
+ outputCall.function?.name === referenceCall.function?.name) {
36
+ const matcher = _getMatcherForToolName(outputCall.function?.name ?? "", toolArgsMatchMode, toolArgsMatchOverrides);
37
+ if (await matcher(JSON.parse(outputCall.function?.arguments ?? "{}"), JSON.parse(referenceCall.function?.arguments ?? "{}"))) {
38
+ foundMatch = true;
39
+ seen[i] = true;
40
+ break;
41
+ }
42
+ }
36
43
  }
37
- if (toolCallArgsExactMatch &&
38
- output.tool_calls[j].function.arguments !==
39
- referenceOutput.tool_calls[j].function.arguments) {
40
- exactMatch = false;
41
- break;
44
+ if (!foundMatch) {
45
+ return false;
42
46
  }
43
47
  }
44
48
  }
45
- if (messageContentExactMatch &&
46
- output.content !== referenceOutput.content) {
47
- exactMatch = false;
48
- break;
49
- }
50
49
  }
51
- return exactMatch;
50
+ return true;
52
51
  }
52
+ /**
53
+ * @deprecated Use `createTrajectoryMatchEvaluator` with `trajectoryMatchMode: "strict"` instead.
54
+ * Evaluate whether an input agent trajectory and called tools strictly matches a reference trajectory.
55
+ * This means that at each step, the agent called the same tools in the same order as specified in the reference trajectory.
56
+ *
57
+ * @param outputs - Actual trajectory the agent followed. May be a list of OpenAI messages,
58
+ * a list of LangChain messages, or a dictionary containing a "messages" key with one of the above.
59
+ * @param referenceOutputs - Ideal reference trajectory the agent should have followed. May be a list of OpenAI messages,
60
+ * a list of LangChain messages, or a dictionary containing a "messages" key with one of the above.
61
+ * @param toolCallArgsExactMatch - Whether to require exact matches for tool call arguments
62
+ * @returns EvaluatorResult containing a score of true if trajectory (including called tools) matches, false otherwise
63
+ */
53
64
  export async function trajectoryStrictMatch(params) {
54
- /**
55
- * Evaluate whether an input agent trajectory and called tools strictly matches a reference trajectory.
56
- * This means that at each step, the agent called the same tools in the same order as specified in the reference trajectory.
57
- *
58
- * @param outputs - Actual trajectory the agent followed. May be a list of OpenAI messages,
59
- * a list of LangChain messages, or a dictionary containing a "messages" key with one of the above.
60
- * @param referenceOutputs - Ideal reference trajectory the agent should have followed. May be a list of OpenAI messages,
61
- * a list of LangChain messages, or a dictionary containing a "messages" key with one of the above.
62
- * @param toolCallArgsExactMatch - Whether to require exact matches for tool call arguments
63
- * @param messageContentExactMatch - Whether to require exact matches for message content
64
- * @returns EvaluatorResult containing a score of true if trajectory (including called tools) matches, false otherwise
65
- */
66
- function _wrapper() {
67
- return _scorer(params);
68
- }
69
- return _runEvaluator("trajectory_strict_match", _wrapper, "trajectory_strict_match", params);
65
+ return _runEvaluator("trajectory_strict_match", _scorer, "trajectory_strict_match", {
66
+ ...params,
67
+ toolArgsMatchMode: params.toolCallArgsExactMatch ? "exact" : "ignore",
68
+ });
70
69
  }
@@ -1,9 +1,15 @@
1
1
  "use strict";
2
2
  Object.defineProperty(exports, "__esModule", { value: true });
3
- exports.trajectorySubset = void 0;
3
+ exports.trajectorySubset = exports._scorer = void 0;
4
4
  const utils_js_1 = require("../utils.cjs");
5
5
  const utils_js_2 = require("./utils.cjs");
6
+ const _scorer = async (params) => {
7
+ const isSubset = await (0, utils_js_2._isTrajectorySuperset)(params.referenceOutputs, params.outputs, params.toolArgsMatchMode, params.toolArgsMatchOverrides);
8
+ return isSubset;
9
+ };
10
+ exports._scorer = _scorer;
6
11
  /**
12
+ * @deprecated Use `createTrajectoryMatchEvaluator` with `trajectoryMatchMode: "subset"` instead.
7
13
  * Evaluate whether an agent trajectory and called tools is a subset of a reference trajectory and called tools.
8
14
  * This means the agent called a subset of the tools specified in the reference trajectory.
9
15
  *
@@ -20,13 +26,11 @@ async function trajectorySubset(params) {
20
26
  const { outputs, referenceOutputs } = params;
21
27
  const outputsList = (0, utils_js_1._normalizeToOpenAIMessagesList)(outputs);
22
28
  const referenceOutputsList = (0, utils_js_1._normalizeToOpenAIMessagesList)(referenceOutputs);
23
- const getScore = async () => {
24
- if (outputsList == null || referenceOutputsList == null) {
25
- throw new Error("Trajectory subset match requires both outputs and reference_outputs");
26
- }
27
- const isSubset = (0, utils_js_2._isTrajectorySuperset)(referenceOutputsList, outputsList);
28
- return isSubset;
29
- };
30
- return (0, utils_js_1._runEvaluator)("trajectory_subset", getScore, "trajectory_subset", params);
29
+ return (0, utils_js_1._runEvaluator)("trajectory_subset", exports._scorer, "trajectory_subset", {
30
+ ...params,
31
+ outputs: outputsList,
32
+ referenceOutputs: referenceOutputsList,
33
+ toolArgsMatchMode: "ignore",
34
+ });
31
35
  }
32
36
  exports.trajectorySubset = trajectorySubset;
@@ -1,6 +1,13 @@
1
1
  import { BaseMessage } from "@langchain/core/messages";
2
- import { ChatCompletionMessage, EvaluatorResult } from "../types.js";
2
+ import { ChatCompletionMessage, EvaluatorResult, ToolArgsMatchMode, ToolArgsMatchOverrides } from "../types.js";
3
+ export declare const _scorer: (params: {
4
+ outputs: ChatCompletionMessage[];
5
+ referenceOutputs: ChatCompletionMessage[];
6
+ toolArgsMatchMode: ToolArgsMatchMode;
7
+ toolArgsMatchOverrides?: ToolArgsMatchOverrides;
8
+ }) => Promise<boolean>;
3
9
  /**
10
+ * @deprecated Use `createTrajectoryMatchEvaluator` with `trajectoryMatchMode: "subset"` instead.
4
11
  * Evaluate whether an agent trajectory and called tools is a subset of a reference trajectory and called tools.
5
12
  * This means the agent called a subset of the tools specified in the reference trajectory.
6
13
  *
@@ -1,6 +1,11 @@
1
1
  import { _normalizeToOpenAIMessagesList, _runEvaluator } from "../utils.js";
2
2
  import { _isTrajectorySuperset } from "./utils.js";
3
+ export const _scorer = async (params) => {
4
+ const isSubset = await _isTrajectorySuperset(params.referenceOutputs, params.outputs, params.toolArgsMatchMode, params.toolArgsMatchOverrides);
5
+ return isSubset;
6
+ };
3
7
  /**
8
+ * @deprecated Use `createTrajectoryMatchEvaluator` with `trajectoryMatchMode: "subset"` instead.
4
9
  * Evaluate whether an agent trajectory and called tools is a subset of a reference trajectory and called tools.
5
10
  * This means the agent called a subset of the tools specified in the reference trajectory.
6
11
  *
@@ -17,12 +22,10 @@ export async function trajectorySubset(params) {
17
22
  const { outputs, referenceOutputs } = params;
18
23
  const outputsList = _normalizeToOpenAIMessagesList(outputs);
19
24
  const referenceOutputsList = _normalizeToOpenAIMessagesList(referenceOutputs);
20
- const getScore = async () => {
21
- if (outputsList == null || referenceOutputsList == null) {
22
- throw new Error("Trajectory subset match requires both outputs and reference_outputs");
23
- }
24
- const isSubset = _isTrajectorySuperset(referenceOutputsList, outputsList);
25
- return isSubset;
26
- };
27
- return _runEvaluator("trajectory_subset", getScore, "trajectory_subset", params);
25
+ return _runEvaluator("trajectory_subset", _scorer, "trajectory_subset", {
26
+ ...params,
27
+ outputs: outputsList,
28
+ referenceOutputs: referenceOutputsList,
29
+ toolArgsMatchMode: "ignore",
30
+ });
28
31
  }
@@ -1,9 +1,15 @@
1
1
  "use strict";
2
2
  Object.defineProperty(exports, "__esModule", { value: true });
3
- exports.trajectorySuperset = void 0;
3
+ exports.trajectorySuperset = exports._scorer = void 0;
4
4
  const utils_js_1 = require("../utils.cjs");
5
5
  const utils_js_2 = require("./utils.cjs");
6
+ const _scorer = async (params) => {
7
+ const isSuperset = await (0, utils_js_2._isTrajectorySuperset)(params.outputs, params.referenceOutputs, params.toolArgsMatchMode, params.toolArgsMatchOverrides);
8
+ return isSuperset;
9
+ };
10
+ exports._scorer = _scorer;
6
11
  /**
12
+ * @deprecated Use `createTrajectoryMatchEvaluator` with `trajectoryMatchMode: "superset"` instead.
7
13
  * Evaluate whether an agent trajectory and called tools is a superset of a reference trajectory and called tools.
8
14
  * This means the agent called a superset of the tools specified in the reference trajectory.
9
15
  *
@@ -20,13 +26,11 @@ async function trajectorySuperset(params) {
20
26
  const { outputs, referenceOutputs } = params;
21
27
  const outputsList = (0, utils_js_1._normalizeToOpenAIMessagesList)(outputs);
22
28
  const referenceOutputsList = (0, utils_js_1._normalizeToOpenAIMessagesList)(referenceOutputs);
23
- const getScore = async () => {
24
- if (outputsList == null || referenceOutputsList == null) {
25
- throw new Error("Trajectory superset match requires both outputs and reference_outputs");
26
- }
27
- const isSuperset = (0, utils_js_2._isTrajectorySuperset)(outputsList, referenceOutputsList);
28
- return isSuperset;
29
- };
30
- return (0, utils_js_1._runEvaluator)("trajectory_superset", getScore, "trajectory_superset", params);
29
+ return (0, utils_js_1._runEvaluator)("trajectory_superset", exports._scorer, "trajectory_superset", {
30
+ ...params,
31
+ outputs: outputsList,
32
+ referenceOutputs: referenceOutputsList,
33
+ toolArgsMatchMode: "ignore",
34
+ });
31
35
  }
32
36
  exports.trajectorySuperset = trajectorySuperset;
@@ -1,6 +1,13 @@
1
1
  import { BaseMessage } from "@langchain/core/messages";
2
- import { ChatCompletionMessage, EvaluatorResult } from "../types.js";
2
+ import { ChatCompletionMessage, EvaluatorResult, ToolArgsMatchMode, ToolArgsMatchOverrides } from "../types.js";
3
+ export declare const _scorer: (params: {
4
+ outputs: ChatCompletionMessage[];
5
+ referenceOutputs: ChatCompletionMessage[];
6
+ toolArgsMatchMode: ToolArgsMatchMode;
7
+ toolArgsMatchOverrides?: ToolArgsMatchOverrides;
8
+ }) => Promise<boolean>;
3
9
  /**
10
+ * @deprecated Use `createTrajectoryMatchEvaluator` with `trajectoryMatchMode: "superset"` instead.
4
11
  * Evaluate whether an agent trajectory and called tools is a superset of a reference trajectory and called tools.
5
12
  * This means the agent called a superset of the tools specified in the reference trajectory.
6
13
  *
@@ -1,6 +1,11 @@
1
1
  import { _normalizeToOpenAIMessagesList, _runEvaluator } from "../utils.js";
2
2
  import { _isTrajectorySuperset } from "./utils.js";
3
+ export const _scorer = async (params) => {
4
+ const isSuperset = await _isTrajectorySuperset(params.outputs, params.referenceOutputs, params.toolArgsMatchMode, params.toolArgsMatchOverrides);
5
+ return isSuperset;
6
+ };
3
7
  /**
8
+ * @deprecated Use `createTrajectoryMatchEvaluator` with `trajectoryMatchMode: "superset"` instead.
4
9
  * Evaluate whether an agent trajectory and called tools is a superset of a reference trajectory and called tools.
5
10
  * This means the agent called a superset of the tools specified in the reference trajectory.
6
11
  *
@@ -17,12 +22,10 @@ export async function trajectorySuperset(params) {
17
22
  const { outputs, referenceOutputs } = params;
18
23
  const outputsList = _normalizeToOpenAIMessagesList(outputs);
19
24
  const referenceOutputsList = _normalizeToOpenAIMessagesList(referenceOutputs);
20
- const getScore = async () => {
21
- if (outputsList == null || referenceOutputsList == null) {
22
- throw new Error("Trajectory superset match requires both outputs and reference_outputs");
23
- }
24
- const isSuperset = _isTrajectorySuperset(outputsList, referenceOutputsList);
25
- return isSuperset;
26
- };
27
- return _runEvaluator("trajectory_superset", getScore, "trajectory_superset", params);
25
+ return _runEvaluator("trajectory_superset", _scorer, "trajectory_superset", {
26
+ ...params,
27
+ outputs: outputsList,
28
+ referenceOutputs: referenceOutputsList,
29
+ toolArgsMatchMode: "ignore",
30
+ });
28
31
  }
@@ -1,9 +1,16 @@
1
1
  "use strict";
2
2
  Object.defineProperty(exports, "__esModule", { value: true });
3
- exports.trajectoryUnorderedMatch = void 0;
3
+ exports.trajectoryUnorderedMatch = exports._scorer = void 0;
4
4
  const utils_js_1 = require("../utils.cjs");
5
5
  const utils_js_2 = require("./utils.cjs");
6
+ const _scorer = async (params) => {
7
+ const isUnorderedMatch = (await (0, utils_js_2._isTrajectorySuperset)(params.outputs, params.referenceOutputs, params.toolArgsMatchMode, params.toolArgsMatchOverrides)) &&
8
+ (await (0, utils_js_2._isTrajectorySuperset)(params.referenceOutputs, params.outputs, params.toolArgsMatchMode, params.toolArgsMatchOverrides));
9
+ return isUnorderedMatch;
10
+ };
11
+ exports._scorer = _scorer;
6
12
  /**
13
+ * @deprecated Use `createTrajectoryMatchEvaluator` with `trajectoryMatchMode: "unordered"` instead.
7
14
  * Evaluate whether an input agent trajectory and called tools contains all the tools used in a reference trajectory.
8
15
  * This accounts for some differences in an LLM's reasoning process on a case-by-case basis.
9
16
  *
@@ -20,14 +27,11 @@ async function trajectoryUnorderedMatch(params) {
20
27
  const { outputs, referenceOutputs } = params;
21
28
  const outputsList = (0, utils_js_1._normalizeToOpenAIMessagesList)(outputs);
22
29
  const referenceOutputsList = (0, utils_js_1._normalizeToOpenAIMessagesList)(referenceOutputs);
23
- const getScore = async () => {
24
- if (outputsList == null || referenceOutputsList == null) {
25
- throw new Error("Trajectory unordered match requires both outputs and reference_outputs");
26
- }
27
- const unorderedMatch = (0, utils_js_2._isTrajectorySuperset)(outputsList, referenceOutputsList) &&
28
- (0, utils_js_2._isTrajectorySuperset)(referenceOutputsList, outputsList);
29
- return unorderedMatch;
30
- };
31
- return (0, utils_js_1._runEvaluator)("trajectory_unordered_match", getScore, "trajectory_unordered_match", params);
30
+ return (0, utils_js_1._runEvaluator)("trajectory_unordered_match", exports._scorer, "trajectory_unordered_match", {
31
+ ...params,
32
+ outputs: outputsList,
33
+ referenceOutputs: referenceOutputsList,
34
+ toolArgsMatchMode: "ignore",
35
+ });
32
36
  }
33
37
  exports.trajectoryUnorderedMatch = trajectoryUnorderedMatch;
@@ -1,6 +1,13 @@
1
1
  import { BaseMessage } from "@langchain/core/messages";
2
- import { ChatCompletionMessage, EvaluatorResult } from "../types.js";
2
+ import { ChatCompletionMessage, EvaluatorResult, ToolArgsMatchMode, ToolArgsMatchOverrides } from "../types.js";
3
+ export declare const _scorer: (params: {
4
+ outputs: ChatCompletionMessage[];
5
+ referenceOutputs: ChatCompletionMessage[];
6
+ toolArgsMatchMode: ToolArgsMatchMode;
7
+ toolArgsMatchOverrides?: ToolArgsMatchOverrides;
8
+ }) => Promise<boolean>;
3
9
  /**
10
+ * @deprecated Use `createTrajectoryMatchEvaluator` with `trajectoryMatchMode: "unordered"` instead.
4
11
  * Evaluate whether an input agent trajectory and called tools contains all the tools used in a reference trajectory.
5
12
  * This accounts for some differences in an LLM's reasoning process on a case-by-case basis.
6
13
  *