agentevals 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -86,7 +86,6 @@ You can see that despite the small difference in the final response and tool cal
86
86
  - [Graph Trajectory](#graph-trajectory)
87
87
  - [Graph trajectory LLM-as-judge](#graph-trajectory-llm-as-judge)
88
88
  - [Graph trajectory strict match](#graph-trajectory-strict-match)
89
- - [Python Async Support](#python-async-support)
90
89
  - [LangSmith Integration](#langsmith-integration)
91
90
  - [Pytest or Vitest/Jest](#pytest-or-vitestjest)
92
91
  - [Evaluate](#evaluate)
@@ -106,7 +105,7 @@ npm install openai
106
105
  ```
107
106
 
108
107
  It is also helpful to be familiar with some [evaluation concepts](https://docs.smith.langchain.com/evaluation/concepts) and
109
- LangSmith's Vitest/Jest integration for running evals, which is documented [here](https://docs.smith.langchain.com/evaluation/how_to_guides/pytest).
108
+ LangSmith's pytest integration for running evals, which is documented [here](https://docs.smith.langchain.com/evaluation/how_to_guides/pytest).
110
109
 
111
110
  ## Evaluators
112
111
 
@@ -116,21 +115,105 @@ Agent trajectory evaluators are used to judge the trajectory of an agent's execu
116
115
  These evaluators expect you to format your agent's trajectory as a list of OpenAI format dicts or as a list of LangChain `BaseMessage` classes, and handle message formatting
117
116
  under the hood.
118
117
 
118
+ AgentEvals offers the `create_trajectory_match_evaluator`/`createTrajectoryMatchEvaluator` and `create_async_trajectory_match_evaluator` methods for this task.
119
+
120
+ #### Checking tool call equality
121
+
122
+ When checking equality between tool calls, these matchers will require that all tool call arguments are the same. You can configure this behavior to ignore tool call arguments by setting `tool_args_match_mode="ignore"` (Python) or `toolArgsMatchMode: "ignore"` (JS), or by only checking specific properties within the call using the `tool_args_match_overrides`/`toolArgsMatchOverrides` param.
123
+
124
+ `tool_args_match_overrides`/`toolArgsMatchOverrides` takes a dictionary whose keys are tool names and whose values are either `"exact"`, `"ignore"`, a list of fields within the tool call that must match exactly, or a comparator function that takes two arguments and returns whether they are equal:
125
+
126
+ ```python
127
+ ToolArgsMatchMode = Literal["exact", "ignore"]
128
+
129
+ ToolArgsMatchOverrides = dict[str, Union[ToolArgsMatchMode, list[str], Callable[[dict, dict], bool]]]
130
+ ```
131
+
132
+ Here's an example that allows case insensitivity for the arguments to a tool named `get_weather`:
133
+
134
+ ```ts
135
+ import { createTrajectoryMatchEvaluator } from "agentevals";
136
+
137
+ const outputs = [
138
+ { role: "user", content: "What is the weather in SF?" },
139
+ {
140
+ role: "assistant",
141
+ tool_calls: [{
142
+ function: {
143
+ name: "get_weather",
144
+ arguments: JSON.stringify({ city: "san francisco" })
145
+ },
146
+ }]
147
+ },
148
+ { role: "tool", content: "It's 80 degrees and sunny in SF." },
149
+ { role: "assistant", content: "The weather in SF is 80 degrees and sunny." },
150
+ ];
151
+
152
+ const referenceOutputs = [
153
+ { role: "user", content: "What is the weather in San Francisco?" },
154
+ {
155
+ role: "assistant",
156
+ tool_calls: [{
157
+ function: {
158
+ name: "get_weather",
159
+ arguments: JSON.stringify({ city: "San Francisco" })
160
+ }
161
+ }]
162
+ },
163
+ { role: "tool", content: "It's 80 degrees and sunny in San Francisco." },
164
+ ];
165
+
166
+ const evaluator = createTrajectoryMatchEvaluator({
167
+ trajectoryMatchMode: "strict",
168
+ toolArgsMatchMode: "exact", // Default value
169
+ toolArgsMatchOverrides: {
170
+ get_weather: (x, y) => {
171
+ return typeof x.city === "string" &&
172
+ typeof y.city === "string" &&
173
+ x.city.toLowerCase() === y.city.toLowerCase();
174
+ },
175
+ }
176
+ });
177
+
178
+ const result = await evaluator({
179
+ outputs,
180
+ referenceOutputs,
181
+ });
182
+
183
+ console.log(result);
184
+ ```
185
+
186
+ ```
187
+ {
188
+ 'key': 'trajectory_strict_match',
189
+ 'score': true,
190
+ }
191
+ ```
192
+
193
+ This flexibility allows you to handle cases where you want looser equality for LLM generated arguments (`"san francisco"` to equal `"San Francisco"`) for only specific tool calls.
194
+
119
195
  #### Strict match
120
196
 
121
- The `trajectory_strict_match` evaluator, compares two trajectories and ensures that they contain the same messages
122
- in the same order with the same tool calls. It allows for differences in message content and tool call arguments,
123
- but requires that the selected tools at each step are the same.
197
+ The `"strict"` `trajectory_match_mode` compares two trajectories and ensures that they contain the same messages
198
+ in the same order with the same tool calls. Note that it does allow for differences in message content:
124
199
 
125
200
  ```ts
126
- import { trajectoryStrictMatch } from "agentevals";
201
+ import { createTrajectoryMatchEvaluator } from "agentevals";
127
202
 
128
203
  const outputs = [
129
204
  { role: "user", content: "What is the weather in SF?" },
130
205
  {
131
206
  role: "assistant",
132
207
  tool_calls: [{
133
- function: { name: "get_weather", arguments: JSON.stringify({ city: "SF" }) }
208
+ function: {
209
+ name: "get_weather",
210
+ arguments: JSON.stringify({ city: "San Francisco" })
211
+ },
212
+ }, {
213
+ function: {
214
+ name: "accuweather_forecast",
215
+ arguments: JSON.stringify({"city": "San Francisco"}),
216
+ },
134
217
  }]
135
218
  },
136
219
  { role: "tool", content: "It's 80 degrees and sunny in SF." },
@@ -143,7 +226,11 @@ const referenceOutputs = [
143
226
  { role: "tool", content: "It's 80 degrees and sunny in San Francisco." },
144
227
  ];
145
228
 
146
- const result = await trajectoryStrictMatch({
229
+ const evaluator = createTrajectoryMatchEvaluator({
230
+ trajectoryMatchMode: "strict",
231
+ })
232
+
233
+ const result = await evaluator({
147
234
  outputs,
148
235
  referenceOutputs,
149
236
  });
@@ -153,17 +240,21 @@ console.log(result);
153
240
 
154
241
  ```
155
242
  {
156
- 'key': 'trajectory_accuracy',
157
- 'score': true,
243
+ 'key': 'trajectory_strict_match',
244
+ 'score': false,
158
245
  }
159
246
  ```
160
247
 
248
 + `"strict"` is useful if you want to ensure that tools are always called in the same order for a given query (e.g. a company policy lookup tool before a tool that requests vacation time for an employee).
249
+
250
+ **Note:** If you would like to configure the way this evaluator checks for tool call equality, see [this section](#checking-tool-call-equality).
251
+
161
252
  #### Unordered match
162
253
 
163
- The `trajectory_unordered_match` evaluator, compares two trajectories and ensures that they contain the same number of tool calls in any order. This is useful if you want to allow flexibility in how an agent obtains the proper information, but still do care that all information was retrieved.
254
+ The `"unordered"` `trajectory_match_mode` compares two trajectories and ensures that they contain the same tool calls in any order. This is useful if you want to allow flexibility in how an agent obtains the proper information, but still do care that all information was retrieved.
164
255
 
165
256
  ```ts
166
- import { trajectoryUnorderedMatch } from "agentevals";
257
+ import { createTrajectoryMatchEvaluator } from "agentevals";
167
258
 
168
259
  const outputs = [
169
260
  { role: "user", content: "What is the weather in SF and is there anything fun happening?" },
@@ -214,7 +305,11 @@ const referenceOutputs = [
214
305
  { role: "assistant", content: "In SF, it's 80˚ and sunny, but there is nothing fun happening." },
215
306
  ];
216
307
 
217
- const result = await trajectoryUnorderedMatch({
308
+ const evaluator = createTrajectoryMatchEvaluator({
309
+ trajectoryMatchMode: "unordered",
310
+ });
311
+
312
+ const result = await evaluator({
218
313
  outputs,
219
314
  referenceOutputs,
220
315
  });
@@ -229,13 +324,16 @@ console.log(result)
229
324
  }
230
325
  ```
231
326
 
327
 + `"unordered"` is useful if you want to ensure that specific tools are called at some point in the trajectory, but you don't necessarily need them to be in message order (e.g. the agent called a company policy retrieval tool at an arbitrary point in an interaction before authorizing spend for a pizza party).
328
+
329
+ **Note:** If you would like to configure the way this evaluator checks for tool call equality, see [this section](#checking-tool-call-equality).
330
+
232
331
  #### Subset and superset match
233
332
 
234
- There are other evaluators for checking partial trajectory matches (ensuring that a trajectory contains a subset and superset of tool calls compared to a reference trajectory).
333
+ The `"subset"` and `"superset"` modes match partial trajectories (ensuring that a trajectory contains a subset/superset of tool calls contained in a reference trajectory).
235
334
 
236
335
  ```ts
237
- import { trajectorySubset } from "agentevals";
238
- // import { trajectorySuperset } from "agentevals";
336
+ import { createTrajectoryMatchEvaluator } from "agentevals";
239
337
 
240
338
  const outputs = [
241
339
  { role: "user", content: "What is the weather in SF and London?" },
@@ -246,9 +344,15 @@ const outputs = [
246
344
  name: "get_weather",
247
345
  arguments: JSON.stringify({ city: "SF and London" }),
248
346
  }
347
+ }, {
348
+ "function": {
349
+ name: "accuweather_forecast",
350
+ arguments: JSON.stringify({"city": "SF and London"}),
351
+ }
249
352
  }],
250
353
  },
251
354
  { role: "tool", content: "It's 80 degrees and sunny in SF, and 90 degrees and rainy in London." },
355
+ { role: "tool", content: "Unknown." },
252
356
  { role: "assistant", content: "The weather in SF is 80 degrees and sunny. In London, it's 90 degrees and rainy."},
253
357
  ];
254
358
 
@@ -260,23 +364,20 @@ const referenceOutputs = [
260
364
  {
261
365
  function: {
262
366
  name: "get_weather",
263
- arguments: JSON.stringify({ city: "San Francisco" }),
264
- }
265
- },
266
- {
267
- function: {
268
- name: "get_weather",
269
- arguments: JSON.stringify({ city: "London" }),
367
+ arguments: JSON.stringify({ city: "SF and London" }),
270
368
  }
271
369
  },
272
370
  ],
273
371
  },
274
- { role: "tool", content: "It's 80 degrees and sunny in San Francisco." },
275
- { role: "tool", content: "It's 90 degrees and rainy in London." },
372
+ { role: "tool", content: "It's 80 degrees and sunny in San Francisco, and 90 degrees and rainy in London." },
276
373
  { role: "assistant", content: "The weather in SF is 80˚ and sunny. In London, it's 90˚ and rainy." },
277
374
  ];
278
375
 
279
- const result = await trajectorySubset({
376
+ const evaluator = createTrajectoryMatchEvaluator({
377
+ trajectoryMatchMode: "superset", // or "subset"
378
+ });
379
+
380
+ const result = await evaluator({
280
381
  outputs,
281
382
  referenceOutputs,
282
383
  });
@@ -286,11 +387,15 @@ console.log(result)
286
387
 
287
388
  ```
288
389
  {
289
- 'key': 'trajectory_subset',
390
+ 'key': 'trajectory_superset_match',
290
391
  'score': true,
291
392
  }
292
393
  ```
293
394
 
395
+ `"superset"` is useful if you want to ensure that some key tools were called at some point in the trajectory, but an agent calling extra tools is still acceptable. `"subset"` is the inverse and is useful if you want to ensure that the agent did not call any tools beyond the expected ones.
396
+
397
+ **Note:** If you would like to configure the way this evaluator checks for tool call equality, see [this section](#checking-tool-call-equality).
398
+
294
399
  #### Trajectory LLM-as-judge
295
400
 
296
401
  The LLM-as-judge trajectory evaluator that uses an LLM to evaluate the trajectory. Unlike the other trajectory evaluators, it doesn't require a reference trajectory,
@@ -514,7 +619,7 @@ console.log(res);
514
619
  }
515
620
  ```
516
621
 
517
- Note that though this evaluator takes the typical `inputs`, `outputs`, and `referenceOutputs` parameters, it internally combines `inputs` and `outputs` to form a `thread`. Therefore, if you want to customize the prompt, your prompt should also contain a `thread` input variable:
622
+ Note that though this evaluator takes the typical `inputs`, `outputs`, and `reference_outputs` parameters, it internally combines `inputs` and `outputs` to form a `thread`. Therefore, if you want to customize the prompt, your prompt should also contain a `thread` input variable:
518
623
 
519
624
  ```ts
520
625
  const CUSTOM_PROMPT = `You are an expert data labeler.
@@ -546,18 +651,18 @@ const graphTrajectoryEvaluator = createGraphTrajectoryLLMAsJudge({
546
651
  model: "openai:o3-mini",
547
652
  })
548
653
  res = await graphTrajectoryEvaluator(
549
- inputs=extractedTrajectory.inputs,
550
- outputs=extractedTrajectory.outputs,
654
+ inputs: extractedTrajectory.inputs,
655
+ outputs: extractedTrajectory.outputs,
551
656
  )
552
657
  ```
553
658
 
554
- In order to format them properly into the prompt, `referenceOutputs` should be passed in as a `GraphTrajectory` object like `outputs`.
659
+ In order to format them properly into the prompt, `reference_outputs` should be passed in as a `GraphTrajectory` object like `outputs`.
555
660
 
556
661
  Also note that like other LLM-as-judge evaluators, you can pass extra kwargs into the evaluator to format them into the prompt.
557
662
 
558
663
  #### Graph trajectory strict match
559
664
 
560
- The `graphTrajectoryStrictMatch` evaluator is a simple evaluator that checks if the steps in the provided graph trajectory match the reference trajectory exactly.
665
+ The `graph_trajectory_strict_match` evaluator is a simple evaluator that checks if the steps in the provided graph trajectory match the reference trajectory exactly.
561
666
 
562
667
  ```ts
563
668
  import { tool } from "@langchain/core/tools";
@@ -626,23 +731,24 @@ console.log(result);
626
731
  'score': True,
627
732
  }
628
733
  ```
734
+
629
735
  ## LangSmith Integration
630
736
 
631
737
  For tracking experiments over time, you can log evaluator results to [LangSmith](https://smith.langchain.com/), a platform for building production-grade LLM applications that includes tracing, evaluation, and experimentation tools.
632
738
 
633
- LangSmith currently offers two ways to run evals. We'll give a quick example of how to run evals using both.
739
+ LangSmith currently offers two ways to run evals: a [pytest](https://docs.smith.langchain.com/evaluation/how_to_guides/pytest) (Python) or [Vitest/Jest](https://docs.smith.langchain.com/evaluation/how_to_guides/vitest_jest) integration and the `evaluate` function. We'll give a quick example of how to run evals using both.
634
740
 
635
741
  ### Pytest or Vitest/Jest
636
742
 
637
- First, follow [these instructions](https://docs.smith.langchain.com/evaluation/how_to_guides/vitest_jest) to set up LangSmith's Vitest/Jest runner,
743
+ First, follow [these instructions](https://docs.smith.langchain.com/evaluation/how_to_guides/pytest) to set up LangSmith's pytest runner, or these to set up [Vitest or Jest](https://docs.smith.langchain.com/evaluation/how_to_guides/vitest_jest),
638
744
  setting appropriate environment variables:
639
745
 
746
+
640
747
  ```bash
641
748
  export LANGSMITH_API_KEY="your_langsmith_api_key"
642
749
  export LANGSMITH_TRACING="true"
643
750
  ```
644
751
 
645
-
646
752
  Then, set up a file named `test_trajectory.eval.ts` with the following contents:
647
753
 
648
754
  ```ts
@@ -717,7 +823,6 @@ Now, run the eval with your runner of choice:
717
823
  vitest run test_trajectory.eval.ts
718
824
  ```
719
825
 
720
-
721
826
  Feedback from the prebuilt evaluator will be automatically logged in LangSmith as a table of results like this in your terminal:
722
827
 
723
828
  ![Terminal results](/static/img/pytest_output.png)
package/dist/index.cjs CHANGED
@@ -14,7 +14,7 @@ var __exportStar = (this && this.__exportStar) || function(m, exports) {
14
14
  for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
15
15
  };
16
16
  Object.defineProperty(exports, "__esModule", { value: true });
17
- exports.GRAPH_TRAJECTORY_ACCURACY_PROMPT = exports.createGraphTrajectoryLLMAsJudge = exports.TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE = exports.TRAJECTORY_ACCURACY_PROMPT = exports.createTrajectoryLLMAsJudge = exports.trajectoryUnorderedMatch = exports.trajectorySuperset = exports.trajectorySubset = exports.trajectoryStrictMatch = void 0;
17
+ exports.GRAPH_TRAJECTORY_ACCURACY_PROMPT = exports.createGraphTrajectoryLLMAsJudge = exports.TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE = exports.TRAJECTORY_ACCURACY_PROMPT = exports.createTrajectoryLLMAsJudge = exports.createTrajectoryMatchEvaluator = exports.trajectoryUnorderedMatch = exports.trajectorySuperset = exports.trajectorySubset = exports.trajectoryStrictMatch = void 0;
18
18
  var strict_js_1 = require("./trajectory/strict.cjs");
19
19
  Object.defineProperty(exports, "trajectoryStrictMatch", { enumerable: true, get: function () { return strict_js_1.trajectoryStrictMatch; } });
20
20
  var subset_js_1 = require("./trajectory/subset.cjs");
@@ -23,6 +23,8 @@ var superset_js_1 = require("./trajectory/superset.cjs");
23
23
  Object.defineProperty(exports, "trajectorySuperset", { enumerable: true, get: function () { return superset_js_1.trajectorySuperset; } });
24
24
  var unordered_js_1 = require("./trajectory/unordered.cjs");
25
25
  Object.defineProperty(exports, "trajectoryUnorderedMatch", { enumerable: true, get: function () { return unordered_js_1.trajectoryUnorderedMatch; } });
26
+ var match_js_1 = require("./trajectory/match.cjs");
27
+ Object.defineProperty(exports, "createTrajectoryMatchEvaluator", { enumerable: true, get: function () { return match_js_1.createTrajectoryMatchEvaluator; } });
26
28
  var llm_js_1 = require("./trajectory/llm.cjs");
27
29
  Object.defineProperty(exports, "createTrajectoryLLMAsJudge", { enumerable: true, get: function () { return llm_js_1.createTrajectoryLLMAsJudge; } });
28
30
  Object.defineProperty(exports, "TRAJECTORY_ACCURACY_PROMPT", { enumerable: true, get: function () { return llm_js_1.TRAJECTORY_ACCURACY_PROMPT; } });
package/dist/index.d.ts CHANGED
@@ -2,6 +2,7 @@ export { trajectoryStrictMatch } from "./trajectory/strict.js";
2
2
  export { trajectorySubset } from "./trajectory/subset.js";
3
3
  export { trajectorySuperset } from "./trajectory/superset.js";
4
4
  export { trajectoryUnorderedMatch } from "./trajectory/unordered.js";
5
+ export { createTrajectoryMatchEvaluator, type TrajectoryMatchMode, } from "./trajectory/match.js";
5
6
  export { createTrajectoryLLMAsJudge, TRAJECTORY_ACCURACY_PROMPT, TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE, } from "./trajectory/llm.js";
6
7
  export { createGraphTrajectoryLLMAsJudge, GRAPH_TRAJECTORY_ACCURACY_PROMPT, } from "./graph_trajectory/llm.js";
7
8
  export * from "./types.js";
package/dist/index.js CHANGED
@@ -2,6 +2,7 @@ export { trajectoryStrictMatch } from "./trajectory/strict.js";
2
2
  export { trajectorySubset } from "./trajectory/subset.js";
3
3
  export { trajectorySuperset } from "./trajectory/superset.js";
4
4
  export { trajectoryUnorderedMatch } from "./trajectory/unordered.js";
5
+ export { createTrajectoryMatchEvaluator, } from "./trajectory/match.js";
5
6
  export { createTrajectoryLLMAsJudge, TRAJECTORY_ACCURACY_PROMPT, TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE, } from "./trajectory/llm.js";
6
7
  export { createGraphTrajectoryLLMAsJudge, GRAPH_TRAJECTORY_ACCURACY_PROMPT, } from "./graph_trajectory/llm.js";
7
8
  export * from "./types.js";
@@ -0,0 +1,84 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.createTrajectoryMatchEvaluator = void 0;
4
+ const utils_js_1 = require("../utils.cjs");
5
+ const strict_js_1 = require("./strict.cjs");
6
+ const unordered_js_1 = require("./unordered.cjs");
7
+ const subset_js_1 = require("./subset.cjs");
8
+ const superset_js_1 = require("./superset.cjs");
9
+ /**
10
+ * Creates an evaluator that compares trajectories between model outputs and reference outputs.
11
+ *
12
+ * @param options - The configuration options
13
+ * @param options.trajectoryMatchMode - The mode for matching trajectories:
14
+ * - `"strict"`: Requires exact match in order and content
15
+ * - `"unordered"`: Allows matching in any order
16
+ * - `"subset"`: Accepts if output trajectory is a subset of reference
17
+ * - `"superset"`: Accepts if output trajectory is a superset of reference
18
+ * @param options.toolArgsMatchMode - Mode for matching tool arguments ("exact" by default, can be "ignore")
19
+ * @param options.toolArgsMatchOverrides - Object containing custom overrides for tool argument matching.
20
+ * Each key should be a tool name, and each value should be either a match mode or a matcher function.
21
+ * Matchers should be a function that takes two sets of tool call args and returns whether they are equal.
22
+ *
23
+ * @returns An async function that evaluates trajectory matches between outputs and references.
24
+ * The returned evaluator accepts:
25
+ * - outputs: List of messages or dict representing the model output trajectory
26
+ * - referenceOutputs: List of messages or dict representing the reference trajectory
27
+ * - Additional arguments passed to the underlying evaluator
28
+ *
29
+ * @example
30
+ * ```typescript
31
+ * const matcher = (
32
+ * outputToolCallArgs: Record<string, any>,
33
+ * referenceToolCallArgs: Record<string, any>
34
+ * ): boolean => {
35
+ * const outputArgs = (outputToolCallArgs.query ?? "").toLowerCase();
36
+ * const referenceArgs = (referenceToolCallArgs.query ?? "").toLowerCase();
37
+ * return outputArgs === referenceArgs;
38
+ * };
39
+ *
40
+ * const evaluator = createAsyncTrajectoryMatchEvaluator({
41
+ * trajectoryMatchMode: "strict",
42
+ * toolArgsMatchMode: "exact",
43
+ * toolArgsMatchOverrides: {
44
+ * myToolName: matcher,
45
+ * },
46
+ * });
47
+ *
48
+ * const result = await evaluator({
49
+ * outputs: [...],
50
+ * referenceOutputs: [...],
51
+ * });
52
+ * ```
53
+ */
54
+ function createTrajectoryMatchEvaluator({ trajectoryMatchMode = "strict", toolArgsMatchMode = "exact", toolArgsMatchOverrides, }) {
55
+ let scorer;
56
+ switch (trajectoryMatchMode) {
57
+ case "strict":
58
+ scorer = strict_js_1._scorer;
59
+ break;
60
+ case "unordered":
61
+ scorer = unordered_js_1._scorer;
62
+ break;
63
+ case "subset":
64
+ scorer = subset_js_1._scorer;
65
+ break;
66
+ case "superset":
67
+ scorer = superset_js_1._scorer;
68
+ break;
69
+ default:
70
+ throw new Error(`Invalid trajectory match type: ${trajectoryMatchMode}`);
71
+ }
72
+ return async function _wrappedEvaluator({ outputs, referenceOutputs, ...extra }) {
73
+ const normalizedOutputs = (0, utils_js_1._normalizeToOpenAIMessagesList)(outputs);
74
+ const normalizedReferenceOutputs = (0, utils_js_1._normalizeToOpenAIMessagesList)(referenceOutputs);
75
+ return (0, utils_js_1._runEvaluator)(`trajectory_${trajectoryMatchMode}_match`, scorer, `trajectory_${trajectoryMatchMode}_match`, {
76
+ outputs: normalizedOutputs,
77
+ referenceOutputs: normalizedReferenceOutputs,
78
+ toolArgsMatchMode,
79
+ toolArgsMatchOverrides,
80
+ ...extra,
81
+ });
82
+ };
83
+ }
84
+ exports.createTrajectoryMatchEvaluator = createTrajectoryMatchEvaluator;
@@ -0,0 +1,61 @@
1
+ import { BaseMessage } from "@langchain/core/messages";
2
+ import { ChatCompletionMessage, ToolArgsMatchMode, ToolArgsMatchOverrides } from "../types.js";
3
+ export type TrajectoryMatchMode = "strict" | "unordered" | "subset" | "superset";
4
+ /**
5
+ * Creates an evaluator that compares trajectories between model outputs and reference outputs.
6
+ *
7
+ * @param options - The configuration options
8
+ * @param options.trajectoryMatchMode - The mode for matching trajectories:
9
+ * - `"strict"`: Requires exact match in order and content
10
+ * - `"unordered"`: Allows matching in any order
11
+ * - `"subset"`: Accepts if output trajectory is a subset of reference
12
+ * - `"superset"`: Accepts if output trajectory is a superset of reference
13
+ * @param options.toolArgsMatchMode - Mode for matching tool arguments ("exact" by default, can be "ignore")
14
+ * @param options.toolArgsMatchOverrides - Object containing custom overrides for tool argument matching.
15
+ * Each key should be a tool name, and each value should be either a match mode or a matcher function.
16
+ * Matchers should be a function that takes two sets of tool call args and returns whether they are equal.
17
+ *
18
+ * @returns An async function that evaluates trajectory matches between outputs and references.
19
+ * The returned evaluator accepts:
20
+ * - outputs: List of messages or dict representing the model output trajectory
21
+ * - referenceOutputs: List of messages or dict representing the reference trajectory
22
+ * - Additional arguments passed to the underlying evaluator
23
+ *
24
+ * @example
25
+ * ```typescript
26
+ * const matcher = (
27
+ * outputToolCallArgs: Record<string, any>,
28
+ * referenceToolCallArgs: Record<string, any>
29
+ * ): boolean => {
30
+ * const outputArgs = (outputToolCallArgs.query ?? "").toLowerCase();
31
+ * const referenceArgs = (referenceToolCallArgs.query ?? "").toLowerCase();
32
+ * return outputArgs === referenceArgs;
33
+ * };
34
+ *
35
+ * const evaluator = createAsyncTrajectoryMatchEvaluator({
36
+ * trajectoryMatchMode: "strict",
37
+ * toolArgsMatchMode: "exact",
38
+ * toolArgsMatchOverrides: {
39
+ * myToolName: matcher,
40
+ * },
41
+ * });
42
+ *
43
+ * const result = await evaluator({
44
+ * outputs: [...],
45
+ * referenceOutputs: [...],
46
+ * });
47
+ * ```
48
+ */
49
+ export declare function createTrajectoryMatchEvaluator({ trajectoryMatchMode, toolArgsMatchMode, toolArgsMatchOverrides, }: {
50
+ trajectoryMatchMode?: TrajectoryMatchMode;
51
+ toolArgsMatchMode?: ToolArgsMatchMode;
52
+ toolArgsMatchOverrides?: ToolArgsMatchOverrides;
53
+ }): ({ outputs, referenceOutputs, ...extra }: {
54
+ [key: string]: unknown;
55
+ outputs: ChatCompletionMessage[] | BaseMessage[] | {
56
+ messages: (BaseMessage | ChatCompletionMessage)[];
57
+ };
58
+ referenceOutputs: ChatCompletionMessage[] | BaseMessage[] | {
59
+ messages: (BaseMessage | ChatCompletionMessage)[];
60
+ };
61
+ }) => Promise<import("langsmith/vitest").SimpleEvaluationResult>;
@@ -0,0 +1,80 @@
1
+ import { _normalizeToOpenAIMessagesList, _runEvaluator } from "../utils.js";
2
+ import { _scorer as trajectoryStrictScorer } from "./strict.js";
3
+ import { _scorer as trajectoryUnorderedScorer } from "./unordered.js";
4
+ import { _scorer as trajectorySubsetScorer } from "./subset.js";
5
+ import { _scorer as trajectorySuperstScorer } from "./superset.js";
6
+ /**
7
+ * Creates an evaluator that compares trajectories between model outputs and reference outputs.
8
+ *
9
+ * @param options - The configuration options
10
+ * @param options.trajectoryMatchMode - The mode for matching trajectories:
11
+ * - `"strict"`: Requires exact match in order and content
12
+ * - `"unordered"`: Allows matching in any order
13
+ * - `"subset"`: Accepts if output trajectory is a subset of reference
14
+ * - `"superset"`: Accepts if output trajectory is a superset of reference
15
+ * @param options.toolArgsMatchMode - Mode for matching tool arguments ("exact" by default, can be "ignore")
16
+ * @param options.toolArgsMatchOverrides - Object containing custom overrides for tool argument matching.
17
+ * Each key should be a tool name, and each value should be either a match mode or a matcher function.
18
+ * Matchers should be a function that takes two sets of tool call args and returns whether they are equal.
19
+ *
20
+ * @returns An async function that evaluates trajectory matches between outputs and references.
21
+ * The returned evaluator accepts:
22
+ * - outputs: List of messages or dict representing the model output trajectory
23
+ * - referenceOutputs: List of messages or dict representing the reference trajectory
24
+ * - Additional arguments passed to the underlying evaluator
25
+ *
26
+ * @example
27
+ * ```typescript
28
+ * const matcher = (
29
+ * outputToolCallArgs: Record<string, any>,
30
+ * referenceToolCallArgs: Record<string, any>
31
+ * ): boolean => {
32
+ * const outputArgs = (outputToolCallArgs.query ?? "").toLowerCase();
33
+ * const referenceArgs = (referenceToolCallArgs.query ?? "").toLowerCase();
34
+ * return outputArgs === referenceArgs;
35
+ * };
36
+ *
37
+ * const evaluator = createAsyncTrajectoryMatchEvaluator({
38
+ * trajectoryMatchMode: "strict",
39
+ * toolArgsMatchMode: "exact",
40
+ * toolArgsMatchOverrides: {
41
+ * myToolName: matcher,
42
+ * },
43
+ * });
44
+ *
45
+ * const result = await evaluator({
46
+ * outputs: [...],
47
+ * referenceOutputs: [...],
48
+ * });
49
+ * ```
50
+ */
51
+ export function createTrajectoryMatchEvaluator({ trajectoryMatchMode = "strict", toolArgsMatchMode = "exact", toolArgsMatchOverrides, }) {
52
+ let scorer;
53
+ switch (trajectoryMatchMode) {
54
+ case "strict":
55
+ scorer = trajectoryStrictScorer;
56
+ break;
57
+ case "unordered":
58
+ scorer = trajectoryUnorderedScorer;
59
+ break;
60
+ case "subset":
61
+ scorer = trajectorySubsetScorer;
62
+ break;
63
+ case "superset":
64
+ scorer = trajectorySuperstScorer;
65
+ break;
66
+ default:
67
+ throw new Error(`Invalid trajectory match type: ${trajectoryMatchMode}`);
68
+ }
69
+ return async function _wrappedEvaluator({ outputs, referenceOutputs, ...extra }) {
70
+ const normalizedOutputs = _normalizeToOpenAIMessagesList(outputs);
71
+ const normalizedReferenceOutputs = _normalizeToOpenAIMessagesList(referenceOutputs);
72
+ return _runEvaluator(`trajectory_${trajectoryMatchMode}_match`, scorer, `trajectory_${trajectoryMatchMode}_match`, {
73
+ outputs: normalizedOutputs,
74
+ referenceOutputs: normalizedReferenceOutputs,
75
+ toolArgsMatchMode,
76
+ toolArgsMatchOverrides,
77
+ ...extra,
78
+ });
79
+ };
80
+ }