agentevals 0.0.4 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -9,7 +9,7 @@ It is intended to provide a good conceptual starting point for your agent's eval
 
  If you are looking for more general evaluation tools, please check out the companion package [`openevals`](https://github.com/langchain-ai/openevals).
 
- ## Quickstart
+ # Quickstart
 
  To get started, install `agentevals`:
 
@@ -72,25 +72,29 @@ console.log(evalResult);
  }
  ```
 
- You can see that despite the small difference in the final response and tool calls, the evaluator still returns a score of `true` since the overall trajectory is the same between the output and reference!
+ You can see that the evaluator returns a score of `true` since the overall trajectory is a reasonable path for the agent to take to answer the user's question.
 
- ## Table of Contents
+ For more details on this evaluator, including how to customize it, see the section on [trajectory LLM-as-judge](#trajectory-llm-as-judge).
+
+ # Table of Contents
 
  - [Installation](#installation)
  - [Evaluators](#evaluators)
-   - [Agent Trajectory](#agent-trajectory)
+   - [Agent Trajectory Match](#agent-trajectory-match)
      - [Strict match](#strict-match)
      - [Unordered match](#unordered-match)
      - [Subset/superset match](#subset-and-superset-match)
-     - [Trajectory LLM-as-judge](#trajectory-llm-as-judge)
+     - [Tool args match modes](#tool-args-match-modes)
+   - [Trajectory LLM-as-judge](#trajectory-llm-as-judge)
    - [Graph Trajectory](#graph-trajectory)
      - [Graph trajectory LLM-as-judge](#graph-trajectory-llm-as-judge)
      - [Graph trajectory strict match](#graph-trajectory-strict-match)
+ - [Python Async Support](#python-async-support)
  - [LangSmith Integration](#langsmith-integration)
    - [Pytest or Vitest/Jest](#pytest-or-vitestjest)
    - [Evaluate](#evaluate)
 
- ## Installation
+ # Installation
 
  You can install `agentevals` like this:
 
@@ -107,92 +111,20 @@ npm install openai
  It is also helpful to be familiar with some [evaluation concepts](https://docs.smith.langchain.com/evaluation/concepts) and
  LangSmith's pytest integration for running evals, which is documented [here](https://docs.smith.langchain.com/evaluation/how_to_guides/pytest).
 
- ## Evaluators
+ # Evaluators
 
- ### Agent trajectory
+ ## Agent trajectory match
 
- Agent trajectory evaluators are used to judge the trajectory of an agent's execution either against an expected trajectory or using an LLM.
+ Agent trajectory match evaluators are used to judge the trajectory of an agent's execution either against an expected trajectory or using an LLM.
  These evaluators expect you to format your agent's trajectory as a list of OpenAI format dicts or as a list of LangChain `BaseMessage` classes, and handle message formatting
  under the hood.
 
- AgentEvals offers the `create_trajectory_match_evaluator`/`createTrajectoryMatchEvaluator` and `create_async_trajectory_match_evaluator` methods for this task.
-
- #### Checking tool call equality
-
- When checking equality between tool calls, these matchers will require that all tool call arguments are the same. You can configure this behavior to ignore tool call arguments by setting `tool_args_match_mode="ignore"` (Python) or `toolArgsMatchMode: "ignore"` (JS), or by only checking specific properties within the call using the `tool_args_match_overrides`/`toolArgsMatchOverrides` param.
-
- `tool_args_match_overrides`/`toolArgsMatchOverrides` takes a dictionary whose keys are tool names and whose values are either `"exact"`, `"ignore"`, a list of fields within the tool call that must match exactly, or a comparator function that takes two arguments and returns whether they are equal:
-
- ```python
- ToolArgsMatchMode = Literal["exact", "ignore"]
-
- ToolArgsMatchOverrides = dict[str, Union[ToolArgsMatchMode, list[str], Callable[[dict, dict], bool]]]
- ```
-
- Here's an example that allows case insensitivity for the arguments to a tool named `get_weather`:
-
- ```ts
- import { createTrajectoryMatchEvaluator } from "agentevals";
-
- const outputs = [
-   { role: "user", content: "What is the weather in SF?" },
-   {
-     role: "assistant",
-     tool_calls: [{
-       function: {
-         name: "get_weather",
-         arguments: JSON.stringify({ city: "san francisco" })
-       },
-     }]
-   },
-   { role: "tool", content: "It's 80 degrees and sunny in SF." },
-   { role: "assistant", content: "The weather in SF is 80 degrees and sunny." },
- ];
-
- const referenceOutputs = [
-   { role: "user", content: "What is the weather in San Francisco?" },
-   {
-     role: "assistant",
-     tool_calls: [{
-       function: {
-         name: "get_weather",
-         arguments: JSON.stringify({ city: "San Francisco" })
-       }
-     }]
-   },
-   { role: "tool", content: "It's 80 degrees and sunny in San Francisco." },
- ];
+ AgentEvals offers the `create_trajectory_match_evaluator`/`createTrajectoryMatchEvaluator` and `create_async_trajectory_match_evaluator` methods for this task. You can customize their behavior in a few ways:
 
- const evaluator = createTrajectoryMatchEvaluator({
-   trajectoryMatchMode: "strict",
-   toolArgsMatchMode: "exact", // Default value
-   toolArgsMatchOverrides: {
-     get_weather: (x, y) => {
-       return typeof x.city === "string" &&
-         typeof y.city === "string" &&
-         x.city.toLowerCase() === y.city.toLowerCase();
-     },
-   }
- });
-
- const result = await evaluator({
-   outputs,
-   referenceOutputs,
- });
-
- console.log(result);
- ```
-
- ```
- {
-   'key': 'trajectory_strict_match',
-   'score': true,
- }
- ```
-
- This flexibility allows you to handle cases where you want looser equality for LLM generated arguments (`"san francisco"` to equal `"San Francisco"`) for only specific tool calls.
+ - Setting `trajectory_match_mode`/`trajectoryMatchMode` to [`strict`](#strict-match), [`unordered`](#unordered-match), [`subset`](#subset-and-superset-match), or [`superset`](#subset-and-superset-match) to set the general strategy the evaluator will use to compare trajectories (a combined configuration sketch appears below, after the unordered match intro)
+ - Setting [`tool_args_match_mode`](#tool-args-match-modes) and/or [`tool_args_match_overrides`](#tool-args-match-modes) to customize how the evaluator checks equality between tool calls in the actual trajectory vs. the reference. By default, only calls to the same tool with the same arguments are considered equal.
 
- #### Strict match
+ ### Strict match
 
  The `"strict"` `trajectory_match_mode` compares two trajectories and ensures that they contain the same messages
  in the same order with the same tool calls. Note that it does allow for differences in message content:
@@ -201,29 +133,39 @@ in the same order with the same tool calls. Note that it does allow for differen
  import { createTrajectoryMatchEvaluator } from "agentevals";
 
  const outputs = [
-   { role: "user", content: "What is the weather in SF?" },
-   {
-     role: "assistant",
-     tool_calls: [{
-       function: {
-         name: "get_weather",
-         arguments: JSON.stringify({ city: "San Francisco" })
-       },
-     }, {
-       function: {
-         name: "accuweather_forecast",
-         arguments: JSON.stringify({"city": "San Francisco"}),
-       },
-     }]
-   },
-   { role: "tool", content: "It's 80 degrees and sunny in SF." },
-   { role: "assistant", content: "The weather in SF is 80 degrees and sunny." },
+   { role: "user", content: "What is the weather in SF?" },
+   {
+     role: "assistant",
+     content: "",
+     tool_calls: [{
+       function: {
+         name: "get_weather",
+         arguments: JSON.stringify({ city: "San Francisco" })
+       },
+     }, {
+       function: {
+         name: "accuweather_forecast",
+         arguments: JSON.stringify({"city": "San Francisco"}),
+       },
+     }]
+   },
+   { role: "tool", content: "It's 80 degrees and sunny in SF." },
+   { role: "assistant", content: "The weather in SF is 80 degrees and sunny." },
  ];
 
  const referenceOutputs = [
-   { role: "user", content: "What is the weather in San Francisco?" },
-   { role: "assistant", tool_calls: [{ function: { name: "get_weather", arguments: JSON.stringify({ city: "San Francisco" }) } }] },
-   { role: "tool", content: "It's 80 degrees and sunny in San Francisco." },
+   { role: "user", content: "What is the weather in San Francisco?" },
+   {
+     role: "assistant",
+     content: "",
+     tool_calls: [{
+       function: {
+         name: "get_weather",
+         arguments: JSON.stringify({ city: "San Francisco" })
+       }
+     }]
+   },
+   { role: "tool", content: "It's 80 degrees and sunny in San Francisco." },
  ];
 
  const evaluator = createTrajectoryMatchEvaluator({
@@ -247,9 +189,9 @@ console.log(result);
 
  `"strict"` is useful if you want to ensure that tools are always called in the same order for a given query (e.g. a company policy lookup tool before a tool that requests vacation time for an employee).
 
- **Note:** If you would like to configure the way this evaluator checks for tool call equality, see [this section](#checking-tool-call-equality).
+ **Note:** If you would like to configure the way this evaluator checks for tool call equality, see [this section](#tool-args-match-modes).
 
- #### Unordered match
+ ### Unordered match
 
  The `"unordered"` `trajectory_match_mode` compares two trajectories and ensures that they contain the same tool calls in any order. This is useful if you want to allow flexibility in how an agent obtains the proper information, but still care that all information was retrieved.
 
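Tying together the two customization knobs described above (`trajectoryMatchMode` and the tool-args matching options), here is a minimal configuration sketch. The option names are taken from the README text in this diff; the `"unordered"`/`"ignore"` choices and the `get_weather` override are illustrative only, not part of this version's changes:

```ts
import { createTrajectoryMatchEvaluator } from "agentevals";

const evaluator = createTrajectoryMatchEvaluator({
  trajectoryMatchMode: "unordered", // trajectory-level strategy
  toolArgsMatchMode: "ignore",      // default tool-call equality check
  toolArgsMatchOverrides: {
    get_weather: ["city"],          // ...but get_weather calls must agree on "city"
  },
});

// Trajectories are OpenAI-format message lists, as in the README examples above.
const result = await evaluator({
  outputs: [{ role: "user", content: "What is the weather in SF?" }],
  referenceOutputs: [{ role: "user", content: "What is the weather in SF?" }],
});
console.log(result.score);
```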
@@ -260,6 +202,7 @@ const outputs = [
    { role: "user", content: "What is the weather in SF and is there anything fun happening?" },
    {
      role: "assistant",
+     content: "",
      tool_calls: [{
        function: {
          name: "get_weather",
@@ -270,6 +213,7 @@ const outputs = [
    { role: "tool", content: "It's 80 degrees and sunny in SF." },
    {
      role: "assistant",
+     content: "",
      tool_calls: [{
        function: {
          name: "get_fun_activities",
@@ -285,6 +229,7 @@ const referenceOutputs = [
    { role: "user", content: "What is the weather in SF and is there anything fun happening?" },
    {
      role: "assistant",
+     content: "",
      tool_calls: [
        {
          function: {
@@ -326,9 +271,9 @@ console.log(result)
 
  `"unordered"` is useful if you want to ensure that specific tools are called at some point in the trajectory, but you don't necessarily need them to be in message order (e.g. the agent called a company policy retrieval tool at an arbitrary point in an interaction before authorizing spend for a pizza party).
 
- **Note:** If you would like to configure the way this evaluator checks for tool call equality, see [this section](#checking-tool-call-equality).
+ **Note:** If you would like to configure the way this evaluator checks for tool call equality, see [this section](#tool-args-match-modes).
 
- #### Subset and superset match
+ ### Subset and superset match
 
  The `"subset"` and `"superset"` modes match partial trajectories (ensuring that a trajectory contains a subset/superset of tool calls contained in a reference trajectory).
 
@@ -339,6 +284,7 @@ const outputs = [
    { role: "user", content: "What is the weather in SF and London?" },
    {
      role: "assistant",
+     content: "",
      tool_calls: [{
        function: {
          name: "get_weather",
@@ -360,6 +306,7 @@ const referenceOutputs = [
    { role: "user", content: "What is the weather in SF and London?" },
    {
      role: "assistant",
+     content: "",
      tool_calls: [
        {
          function: {
@@ -394,13 +341,138 @@ console.log(result)
 
  `"superset"` is useful if you want to ensure that some key tools were called at some point in the trajectory, but an agent calling extra tools is still acceptable. `"subset"` is the inverse and is useful if you want to ensure that the agent did not call any tools beyond the expected ones.
 
- **Note:** If you would like to configure the way this evaluator checks for tool call equality, see [this section](#checking-tool-call-equality).
+ **Note:** If you would like to configure the way this evaluator checks for tool call equality, see [this section](#tool-args-match-modes).
+
+ ### Tool args match modes
+
+ When checking equality between tool calls, the above evaluators will require that all tool call arguments are exactly the same by default. You can configure this behavior in the following ways:
+
+ - Treating any two tool calls for the same tool as equivalent by setting `tool_args_match_mode="ignore"` (Python) or `toolArgsMatchMode: "ignore"` (TypeScript)
+ - Treating a tool call as equivalent if it contains a subset/superset of args compared to a reference tool call of the same name with `tool_args_match_mode="subset"`/`"superset"` (Python) or `toolArgsMatchMode: "subset"`/`"superset"` (TypeScript)
+ - Setting custom matchers for all calls of a given tool using the `tool_args_match_overrides` (Python) or `toolArgsMatchOverrides` (TypeScript) param
+
+ You can set both of these parameters at the same time. `tool_args_match_overrides` will take precedence over `tool_args_match_mode`.
+
+ `tool_args_match_overrides`/`toolArgsMatchOverrides` takes a dictionary whose keys are tool names and whose values are either `"exact"`, `"ignore"`, a list of fields within the tool call that must match exactly, or a comparator function that takes two arguments and returns whether they are equal:
+
+ ```python
+ ToolArgsMatchMode = Literal["exact", "ignore", "subset", "superset"]
+
+ ToolArgsMatchOverrides = dict[str, Union[ToolArgsMatchMode, list[str], Callable[[dict, dict], bool]]]
+ ```
+
+ Here's an example that allows case insensitivity for the arguments to a tool named `get_weather`:
+
+ ```ts
+ import { createTrajectoryMatchEvaluator } from "agentevals";
+
+ const outputs = [
+   { role: "user", content: "What is the weather in SF?" },
+   {
+     role: "assistant",
+     content: "",
+     tool_calls: [{
+       function: {
+         name: "get_weather",
+         arguments: JSON.stringify({ city: "san francisco" })
+       },
+     }]
+   },
+   { role: "tool", content: "It's 80 degrees and sunny in SF." },
+   { role: "assistant", content: "The weather in SF is 80 degrees and sunny." },
+ ];
+
+ const referenceOutputs = [
+   { role: "user", content: "What is the weather in San Francisco?" },
+   {
+     role: "assistant",
+     content: "",
+     tool_calls: [{
+       function: {
+         name: "get_weather",
+         arguments: JSON.stringify({ city: "San Francisco" })
+       }
+     }]
+   },
+   { role: "tool", content: "It's 80 degrees and sunny in San Francisco." },
+ ];
 
- #### Trajectory LLM-as-judge
+ const evaluator = createTrajectoryMatchEvaluator({
+   trajectoryMatchMode: "strict",
+   toolArgsMatchMode: "exact", // Default value
+   toolArgsMatchOverrides: {
+     get_weather: (x, y) => {
+       return typeof x.city === "string" &&
+         typeof y.city === "string" &&
+         x.city.toLowerCase() === y.city.toLowerCase();
+     },
+   }
+ });
 
- The LLM-as-judge trajectory evaluator that uses an LLM to evaluate the trajectory. Unlike the other trajectory evaluators, it doesn't require a reference trajectory,
- and supports
- This allows for more flexibility in the trajectory comparison:
+ const result = await evaluator({
+   outputs,
+   referenceOutputs,
+ });
+
+ console.log(result);
+ ```
+
+ ```
+ {
+   'key': 'trajectory_strict_match',
+   'score': true,
+ }
+ ```
+
+ This flexibility allows you to handle cases where you want looser equality for LLM generated arguments (`"san francisco"` to equal `"San Francisco"`) for only specific tool calls.
+
+ ## Trajectory LLM-as-judge
+
+ The LLM-as-judge trajectory evaluator uses an LLM to evaluate the trajectory. Unlike the trajectory match evaluators, it doesn't require a reference trajectory. Here's an example:
+
+ ```ts
+ import {
+   createTrajectoryLLMAsJudge,
+   TRAJECTORY_ACCURACY_PROMPT,
+ } from "agentevals";
+
+ const evaluator = createTrajectoryLLMAsJudge({
+   prompt: TRAJECTORY_ACCURACY_PROMPT,
+   model: "openai:o3-mini",
+ });
+
+ const outputs = [
+   {role: "user", content: "What is the weather in SF?"},
+   {
+     role: "assistant",
+     content: "",
+     tool_calls: [
+       {
+         function: {
+           name: "get_weather",
+           arguments: JSON.stringify({ city: "SF" }),
+         }
+       }
+     ],
+   },
+   {role: "tool", content: "It's 80 degrees and sunny in SF."},
+   {role: "assistant", content: "The weather in SF is 80 degrees and sunny."},
+ ];
+
+ const result = await evaluator({ outputs });
+
+ console.log(result)
+ ```
+
+ ```
+ {
+   'key': 'trajectory_accuracy',
+   'score': true,
+   'comment': 'The provided agent trajectory is reasonable...'
+ }
+ ```
+
+ If you have a reference trajectory, you can add an extra variable to your prompt and pass in the reference trajectory. Below, we use the prebuilt `TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE` prompt, which contains a `reference_outputs` variable:
 
  ```ts
  import {
@@ -417,6 +489,7 @@ const outputs = [
    {role: "user", content: "What is the weather in SF?"},
    {
      role: "assistant",
+     content: "",
      tool_calls: [
        {
          function: {
@@ -433,6 +506,7 @@ const referenceOutputs = [
    {role: "user", content: "What is the weather in SF?"},
    {
      role: "assistant",
+     content: "",
      tool_calls: [
        {
          function: {
@@ -484,7 +558,7 @@ const fewShotExamples = [
 
  See the [`openevals`](https://github.com/langchain-ai/openevals?tab=readme-ov-file#llm-as-judge) repo for a fully up-to-date list of parameters.
 
- ### Graph trajectory
+ ## Graph trajectory
 
  For frameworks like [LangGraph](https://github.com/langchain-ai/langgraph) that model agents as graphs, it can be more convenient to represent trajectories in terms of nodes visited rather than messages. `agentevals` includes a category of evaluators called **graph trajectory** evaluators that are designed to work with this format, as well as convenient utilities for extracting trajectories from a LangGraph thread, including different conversation turns and interrupts.
 
@@ -509,7 +583,7 @@ const evaluator: ({ inputs, outputs, referenceOutputs, ...extra }: {
 
  Where `inputs` is a list of inputs (or a dict with a key named `"inputs"`) to the graph whose items each represent the start of a new invocation in a thread, `results` is the final output from each turn in the thread, and `steps` is the internal steps taken for each turn.
 
- #### Graph trajectory LLM-as-judge
+ ### Graph trajectory LLM-as-judge
 
  This evaluator is similar to the `trajectory_llm_as_judge` evaluator, but it works with graph trajectories instead of message trajectories. Below, we set up a LangGraph agent, extract a trajectory from it using the built-in utils, and pass it to the evaluator. First, let's set up our graph, call it, and then extract the trajectory:
 
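To make the `inputs`/`results`/`steps` structure described above concrete, here is a hand-written sketch of a single-turn graph trajectory in roughly the expected shape. The node names and message contents are illustrative; real trajectories would normally come from the package's LangGraph extraction utilities:

```ts
// One conversation turn: the input that started it, the turn's final
// output, and the graph nodes visited while producing it.
const inputs = [
  { messages: [{ role: "user", content: "What is the weather in SF?" }] },
];

const outputs = {
  results: [
    { messages: [{ role: "assistant", content: "The weather in SF is 80 degrees and sunny." }] },
  ],
  // One list of visited steps per turn in the thread
  steps: [["__start__", "agent", "tools", "agent"]],
};
```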
@@ -658,9 +732,9 @@ res = await graphTrajectoryEvaluator(
 
  In order to format them properly into the prompt, `reference_outputs` should be passed in as a `GraphTrajectory` object like `outputs`.
 
- Also note that like other LLM-as-judge evaluators, you can pass extra kwargs into the evaluator to format them into the prompt.
+ Also note that like other LLM-as-judge evaluators, you can pass extra params into the evaluator to format them into the prompt.
 
- #### Graph trajectory strict match
+ ### Graph trajectory strict match
 
  The `graph_trajectory_strict_match` evaluator is a simple evaluator that checks if the steps in the provided graph trajectory match the reference trajectory exactly.
 
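A hedged usage sketch for the strict matcher follows. It assumes the JS export mirrors the Python `graph_trajectory_strict_match` name as `graphTrajectoryStrictMatch`, and uses the `GraphTrajectory` shape sketched earlier:

```ts
import { graphTrajectoryStrictMatch } from "agentevals";

// Scores true only if the visited steps match the reference exactly, turn by turn.
const result = await graphTrajectoryStrictMatch({
  outputs: {
    results: [{ messages: [{ role: "assistant", content: "80 degrees and sunny." }] }],
    steps: [["__start__", "agent", "tools", "agent"]],
  },
  referenceOutputs: {
    results: [],
    steps: [["__start__", "agent", "tools", "agent"]],
  },
});
console.log(result.score); // true when the step lists are identical
```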
@@ -732,18 +806,47 @@ console.log(result);
  }
  ```
 
- ## LangSmith Integration
+ # Python Async Support
+
+ All `agentevals` evaluators support Python [asyncio](https://docs.python.org/3/library/asyncio.html). As a convention, evaluators that use a factory function will have `async` put immediately after `create_` in the function name (for example, `create_async_trajectory_llm_as_judge`), and evaluators used directly will end in `async` (e.g. `trajectory_strict_match_async`).
+
+ Here's an example of how to use the `create_async_trajectory_llm_as_judge` evaluator asynchronously:
+
+ ```python
+ from agentevals.trajectory.llm import create_async_trajectory_llm_as_judge
+
+ evaluator = create_async_trajectory_llm_as_judge(
+     prompt="What is the weather in {inputs}?",
+ )
+
+ result = await evaluator(inputs="San Francisco")
+ ```
+
+ If you are using the OpenAI client directly, remember to pass in `AsyncOpenAI` as the `judge` parameter:
+
+ ```python
+ from openai import AsyncOpenAI
+
+ evaluator = create_async_trajectory_llm_as_judge(
+     prompt="What is the weather in {inputs}?",
+     judge=AsyncOpenAI(),
+     model="o3-mini",
+ )
+
+ result = await evaluator(inputs="San Francisco")
+ ```
+
+ # LangSmith Integration
 
  For tracking experiments over time, you can log evaluator results to [LangSmith](https://smith.langchain.com/), a platform for building production-grade LLM applications that includes tracing, evaluation, and experimentation tools.
 
  LangSmith currently offers two ways to run evals: a [pytest](https://docs.smith.langchain.com/evaluation/how_to_guides/pytest) (Python) or [Vitest/Jest](https://docs.smith.langchain.com/evaluation/how_to_guides/vitest_jest) integration and the `evaluate` function. We'll give a quick example of how to run evals using both.
 
- ### Pytest or Vitest/Jest
+ ## Pytest or Vitest/Jest
 
  First, follow [these instructions](https://docs.smith.langchain.com/evaluation/how_to_guides/pytest) to set up LangSmith's pytest runner, or these to set up [Vitest or Jest](https://docs.smith.langchain.com/evaluation/how_to_guides/vitest_jest),
  setting appropriate environment variables:
 
-
  ```bash
  export LANGSMITH_API_KEY="your_langsmith_api_key"
  export LANGSMITH_TRACING="true"
@@ -776,6 +879,7 @@ ls.describe("trajectory accuracy", () => {
    {"role": "user", "content": "What is the weather in SF?"},
    {
      "role": "assistant",
+     "content": "",
      "tool_calls": [
        {
          "function": {
@@ -794,6 +898,7 @@ ls.describe("trajectory accuracy", () => {
    {"role": "user", "content": "What is the weather in SF?"},
    {
      "role": "assistant",
+     "content": "",
      "tool_calls": [
        {
          "function": {
@@ -831,7 +936,7 @@ And you should also see the results in the experiment view in LangSmith:
 
  ![LangSmith results](/static/img/langsmith_results.png)
 
- ### Evaluate
+ ## Evaluate
 
  Alternatively, you can [create a dataset in LangSmith](https://docs.smith.langchain.com/evaluation/concepts#dataset-curation) and use your created evaluators with LangSmith's [`evaluate`](https://docs.smith.langchain.com/evaluation#8-run-and-view-results) function:
 
@@ -846,20 +951,21 @@ const trajectoryEvaluator = createTrajectoryLLMAsJudge({
 
  await evaluate(
    (inputs) => [
-     {role: "user", content: "What is the weather in SF?"},
-     {
-       role: "assistant",
-       tool_calls: [
-         {
-           function: {
-             name: "get_weather",
-             arguments: json.dumps({"city": "SF"}),
-           }
-         }
-       ],
-     },
-     {role: "tool", content: "It's 80 degrees and sunny in SF."},
-     {role: "assistant", content: "The weather in SF is 80 degrees and sunny."},
+     {role: "user", content: "What is the weather in SF?"},
+     {
+       role: "assistant",
+       content: "",
+       tool_calls: [
+         {
+           function: {
+             name: "get_weather",
+             arguments: JSON.stringify({"city": "SF"}),
+           }
+         }
+       ],
+     },
+     {role: "tool", content: "It's 80 degrees and sunny in SF."},
+     {role: "assistant", content: "The weather in SF is 80 degrees and sunny."},
    ],
    {
      data: datasetName,
@@ -868,7 +974,7 @@ await evaluate(
  );
  ```
 
- ## Thank you!
+ # Thank you!
 
  We hope that `agentevals` helps make evaluating your LLM agents easier!
 
@@ -88,10 +88,24 @@ function _exactMatch(toolCall, referenceToolCall) {
  function _ignoreMatch(_toolCall, _referenceToolCall) {
      return true;
  }
+ function _subsetMatch(toolCall, referenceToolCall) {
+     // Every key-value pair in toolCall must exist in referenceToolCall with the same value
+     return Object.entries(toolCall).every(([key, value]) => key in referenceToolCall && _deepEqual(referenceToolCall[key], value));
+ }
+ function _supersetMatch(toolCall, referenceToolCall) {
+     // Every key-value pair in referenceToolCall must exist in toolCall with the same value
+     return Object.entries(referenceToolCall).every(([key, value]) => key in toolCall && _deepEqual(toolCall[key], value));
+ }
  function _getMatcherForComparisonMode(mode) {
      if (mode === "exact") {
          return _exactMatch;
      }
+     else if (mode === "subset") {
+         return _subsetMatch;
+     }
+     else if (mode === "superset") {
+         return _supersetMatch;
+     }
      else {
          return _ignoreMatch;
      }
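Restating the semantics of the two matchers added above as a standalone snippet, with a simplified JSON-based stand-in for the package's internal `_deepEqual` helper (a real deep-equal would not be sensitive to key order):

```ts
// Simplified deep-equality stand-in for illustration only.
const deepEqual = (a: unknown, b: unknown) =>
  JSON.stringify(a) === JSON.stringify(b);

const subsetMatch = (call: Record<string, unknown>, ref: Record<string, unknown>) =>
  Object.entries(call).every(([k, v]) => k in ref && deepEqual(ref[k], v));

const supersetMatch = (call: Record<string, unknown>, ref: Record<string, unknown>) =>
  Object.entries(ref).every(([k, v]) => k in call && deepEqual(call[k], v));

console.log(subsetMatch({ city: "SF" }, { city: "SF", units: "f" }));   // true: call args are a subset of the reference
console.log(subsetMatch({ city: "SF", units: "f" }, { city: "SF" }));   // false: call has an extra arg
console.log(supersetMatch({ city: "SF", units: "f" }, { city: "SF" })); // true: call args are a superset of the reference
```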
@@ -84,10 +84,24 @@ function _exactMatch(toolCall, referenceToolCall) {
  function _ignoreMatch(_toolCall, _referenceToolCall) {
      return true;
  }
+ function _subsetMatch(toolCall, referenceToolCall) {
+     // Every key-value pair in toolCall must exist in referenceToolCall with the same value
+     return Object.entries(toolCall).every(([key, value]) => key in referenceToolCall && _deepEqual(referenceToolCall[key], value));
+ }
+ function _supersetMatch(toolCall, referenceToolCall) {
+     // Every key-value pair in referenceToolCall must exist in toolCall with the same value
+     return Object.entries(referenceToolCall).every(([key, value]) => key in toolCall && _deepEqual(toolCall[key], value));
+ }
  function _getMatcherForComparisonMode(mode) {
      if (mode === "exact") {
          return _exactMatch;
      }
+     else if (mode === "subset") {
+         return _subsetMatch;
+     }
+     else if (mode === "superset") {
+         return _supersetMatch;
+     }
      else {
          return _ignoreMatch;
      }
package/dist/types.d.ts CHANGED
@@ -12,6 +12,6 @@ export type ExtractedLangGraphThreadTrajectory = {
  export type TrajectoryLLMAsJudgeParams = Omit<Parameters<typeof createLLMAsJudge>[0], "prompt"> & {
      prompt?: string;
  };
- export type ToolArgsMatchMode = "exact" | "ignore";
+ export type ToolArgsMatchMode = "exact" | "ignore" | "subset" | "superset";
  export type ToolArgsMatcher = (toolCall: Record<string, unknown>, referenceToolCall: Record<string, unknown>) => boolean | Promise<boolean>;
  export type ToolArgsMatchOverrides = Record<string, ToolArgsMatchMode | string[] | ToolArgsMatcher>;
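With the widened union above, per-tool overrides can mix the new modes with the existing field-list and comparator forms. An illustrative sketch (the tool names are hypothetical, and this assumes the types are re-exported from the package root):

```ts
import type { ToolArgsMatchOverrides } from "agentevals";

const overrides: ToolArgsMatchOverrides = {
  get_weather: "superset",                       // new mode: call args may be a superset of the reference
  search_docs: ["query"],                        // only these fields must match exactly
  send_email: (call, ref) => call.to === ref.to, // custom comparator
};
```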
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "agentevals",
-   "version": "0.0.4",
+   "version": "0.0.5",
    "packageManager": "yarn@3.5.1",
    "type": "module",
    "scripts": {