agentevals 0.0.1-rc.3 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +158 -631
- package/dist/utils.cjs +2 -35
- package/dist/utils.d.ts +2 -3
- package/dist/utils.js +2 -35
- package/package.json +3 -3
package/README.md
CHANGED
|
@@ -13,21 +13,9 @@ If you are looking for more general evaluation tools, please check out the compa
|
|
|
13
13
|
|
|
14
14
|
To get started, install `agentevals`:
|
|
15
15
|
|
|
16
|
-
<details>
|
|
17
|
-
<summary>Python</summary>
|
|
18
|
-
|
|
19
|
-
```bash
|
|
20
|
-
pip install agentevals
|
|
21
|
-
```
|
|
22
|
-
</details>
|
|
23
|
-
|
|
24
|
-
<details open>
|
|
25
|
-
<summary>TypeScript</summary>
|
|
26
|
-
|
|
27
16
|
```bash
|
|
28
17
|
npm install agentevals @langchain/core
|
|
29
18
|
```
|
|
30
|
-
</details>
|
|
31
19
|
|
|
32
20
|
This quickstart will use an evaluator powered by OpenAI's `o3-mini` model to judge your results, so you'll need to set your OpenAI API key as an environment variable:
|
|
33
21
|
|
|
@@ -37,54 +25,6 @@ export OPENAI_API_KEY="your_openai_api_key"
|
|
|
37
25
|
|
|
38
26
|
Once you've done this, you can run your first trajectory evaluator. We represent the agent's trajectory as a list of OpenAI-style messages:
|
|
39
27
|
|
|
40
|
-
<details>
|
|
41
|
-
<summary>Python</summary>
|
|
42
|
-
|
|
43
|
-
```python
|
|
44
|
-
from agentevals.trajectory.llm import create_trajectory_llm_as_judge, TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE
|
|
45
|
-
|
|
46
|
-
trajectory_evaluator = create_trajectory_llm_as_judge(
|
|
47
|
-
prompt=TRAJECTORY_ACCURACY_PROMPT,
|
|
48
|
-
model="openai:o3-mini",
|
|
49
|
-
)
|
|
50
|
-
|
|
51
|
-
# This is a fake trajectory, in reality you would run your agent to get a real trajectory
|
|
52
|
-
outputs = [
|
|
53
|
-
{"role": "user", "content": "What is the weather in SF?"},
|
|
54
|
-
{
|
|
55
|
-
"role": "assistant",
|
|
56
|
-
"tool_calls": [
|
|
57
|
-
{
|
|
58
|
-
"function": {
|
|
59
|
-
"name": "get_weather",
|
|
60
|
-
"arguments": json.dumps({"city": "SF"}),
|
|
61
|
-
}
|
|
62
|
-
}
|
|
63
|
-
],
|
|
64
|
-
},
|
|
65
|
-
{"role": "tool", "content": "It's 80 degrees and sunny in SF."},
|
|
66
|
-
{"role": "assistant", "content": "The weather in SF is 80 degrees and sunny."},
|
|
67
|
-
]
|
|
68
|
-
|
|
69
|
-
eval_result = trajectory_evaluator(
|
|
70
|
-
outputs=outputs,
|
|
71
|
-
)
|
|
72
|
-
|
|
73
|
-
print(eval_result)
|
|
74
|
-
```
|
|
75
|
-
|
|
76
|
-
```
|
|
77
|
-
{
|
|
78
|
-
'key': 'trajectory_accuracy',
|
|
79
|
-
'reasoning': 'The trajectory accurately follows the user's request for weather information in SF. Initially, the assistant recognizes the goal (providing weather details), then it efficiently makes a tool call to get the weather, and finally it communicates the result clearly. All steps demonstrate logical progression and efficiency. Thus, the score should be: true.',
|
|
80
|
-
'score': true
|
|
81
|
-
}
|
|
82
|
-
```
|
|
83
|
-
</details>
|
|
84
|
-
|
|
85
|
-
<details open>
|
|
86
|
-
<summary>TypeScript</summary>
|
|
87
|
-
|
|
88
28
|
```ts
|
|
89
29
|
import {
|
|
90
30
|
createTrajectoryLLMAsJudge,
|
|
@@ -131,7 +71,6 @@ console.log(evalResult);
|
|
|
131
71
|
comment: '...'
|
|
132
72
|
}
|
|
133
73
|
```
|
|
134
|
-
</details>
|
|
135
74
|
|
|
136
75
|
You can see that despite the small difference in the final response and tool calls, the evaluator still returns a score of `true` since the overall trajectory is the same between the output and reference!
|
|
137
76
|
|
|
@@ -156,42 +95,18 @@ You can see that despite the small difference in the final response and tool cal
|
|
|
156
95
|
|
|
157
96
|
You can install `agentevals` like this:
|
|
158
97
|
|
|
159
|
-
<details>
|
|
160
|
-
<summary>Python</summary>
|
|
161
|
-
|
|
162
|
-
```bash
|
|
163
|
-
pip install agentevals
|
|
164
|
-
```
|
|
165
|
-
</details>
|
|
166
|
-
|
|
167
|
-
<details open>
|
|
168
|
-
<summary>TypeScript</summary>
|
|
169
|
-
|
|
170
98
|
```bash
|
|
171
99
|
npm install agentevals @langchain/core
|
|
172
100
|
```
|
|
173
|
-
</details>
|
|
174
101
|
|
|
175
102
|
For LLM-as-judge evaluators, you will also need an LLM client. By default, `agentevals` will use [LangChain chat model integrations](https://python.langchain.com/docs/integrations/chat/) and comes with `langchain_openai` installed by default. However, if you prefer, you may use the OpenAI client directly:
|
|
176
103
|
|
|
177
|
-
<details>
|
|
178
|
-
<summary>Python</summary>
|
|
179
|
-
|
|
180
|
-
```bash
|
|
181
|
-
pip install openai
|
|
182
|
-
```
|
|
183
|
-
</details>
|
|
184
|
-
|
|
185
|
-
<details open>
|
|
186
|
-
<summary>TypeScript</summary>
|
|
187
|
-
|
|
188
104
|
```bash
|
|
189
105
|
npm install openai
|
|
190
106
|
```
|
|
191
|
-
</details>
|
|
192
107
|
|
|
193
108
|
It is also helpful to be familiar with some [evaluation concepts](https://docs.smith.langchain.com/evaluation/concepts) and
|
|
194
|
-
LangSmith's
|
|
109
|
+
LangSmith's Vitest/Jest integration for running evals, which is documented [here](https://docs.smith.langchain.com/evaluation/how_to_guides/vitest_jest).
|
|
195
110
|
|
|
196
111
|
## Evaluators
|
|
197
112
|
|
|
@@ -207,64 +122,6 @@ The `trajectory_strict_match` evaluator, compares two trajectories and ensures t
|
|
|
207
122
|
in the same order with the same tool calls. It allows for differences in message content and tool call arguments,
|
|
208
123
|
but requires that the selected tools at each step are the same.
|
|
209
124
|
|
|
210
|
-
<details>
|
|
211
|
-
<summary>Python</summary>
|
|
212
|
-
|
|
213
|
-
```python
|
|
214
|
-
import json
|
|
215
|
-
from agentevals.trajectory.strict import trajectory_strict_match
|
|
216
|
-
|
|
217
|
-
outputs = [
|
|
218
|
-
{"role": "user", "content": "What is the weather in SF?"},
|
|
219
|
-
{
|
|
220
|
-
"role": "assistant",
|
|
221
|
-
"tool_calls": [
|
|
222
|
-
{
|
|
223
|
-
"function": {
|
|
224
|
-
"name": "get_weather",
|
|
225
|
-
"arguments": json.dumps({"city": "SF"}),
|
|
226
|
-
}
|
|
227
|
-
}
|
|
228
|
-
],
|
|
229
|
-
},
|
|
230
|
-
{"role": "tool", "content": "It's 80 degrees and sunny in SF."},
|
|
231
|
-
{"role": "assistant", "content": "The weather in SF is 80 degrees and sunny."},
|
|
232
|
-
]
|
|
233
|
-
reference_outputs = [
|
|
234
|
-
{"role": "user", "content": "What is the weather in San Francisco?"},
|
|
235
|
-
{
|
|
236
|
-
"role": "assistant",
|
|
237
|
-
"tool_calls": [
|
|
238
|
-
{
|
|
239
|
-
"function": {
|
|
240
|
-
"name": "get_weather",
|
|
241
|
-
"arguments": json.dumps({"city": "San Francisco"}),
|
|
242
|
-
}
|
|
243
|
-
}
|
|
244
|
-
],
|
|
245
|
-
},
|
|
246
|
-
{"role": "tool", "content": "It's 80 degrees and sunny in San Francisco."},
|
|
247
|
-
{"role": "assistant", "content": "The weather in SF is 80˚ and sunny."},
|
|
248
|
-
]
|
|
249
|
-
result = trajectory_strict_match(
|
|
250
|
-
outputs=outputs, reference_outputs=reference_outputs
|
|
251
|
-
)
|
|
252
|
-
|
|
253
|
-
print(result)
|
|
254
|
-
```
|
|
255
|
-
|
|
256
|
-
```
|
|
257
|
-
{
|
|
258
|
-
'key': 'trajectory_accuracy',
|
|
259
|
-
'score': True,
|
|
260
|
-
'comment': None,
|
|
261
|
-
}
|
|
262
|
-
```
|
|
263
|
-
</details>
|
|
264
|
-
|
|
265
|
-
<details open>
|
|
266
|
-
<summary>TypeScript</summary>
|
|
267
|
-
|
|
268
125
|
```ts
|
|
269
126
|
import { trajectoryStrictMatch } from "agentevals";
|
|
270
127
|
|
|
@@ -300,86 +157,11 @@ console.log(result);
|
|
|
300
157
|
'score': true,
|
|
301
158
|
}
|
|
302
159
|
```
|
|
303
|
-
</details>
|
|
304
160
|
|
|
305
161
|
#### Unordered match
|
|
306
162
|
|
|
307
163
|
The `trajectory_unordered_match` evaluator compares two trajectories and ensures that they contain the same number of tool calls in any order. This is useful if you want to allow flexibility in how an agent obtains the proper information, but still care that all information was retrieved.
|
|
308
164
|
|
|
309
|
-
<details>
|
|
310
|
-
<summary>Python</summary>
|
|
311
|
-
|
|
312
|
-
```python
|
|
313
|
-
import json
|
|
314
|
-
from agentevals.trajectory.unordered import trajectory_unordered_match
|
|
315
|
-
|
|
316
|
-
inputs = {}
|
|
317
|
-
outputs = [
|
|
318
|
-
{"role": "user", "content": "What is the weather in SF and is there anything fun happening?"},
|
|
319
|
-
{
|
|
320
|
-
"role": "assistant",
|
|
321
|
-
"tool_calls": [{
|
|
322
|
-
"function": {
|
|
323
|
-
"name": "get_weather",
|
|
324
|
-
"arguments": json.dumps({"city": "SF"}),
|
|
325
|
-
}
|
|
326
|
-
}],
|
|
327
|
-
},
|
|
328
|
-
{"role": "tool", "content": "It's 80 degrees and sunny in SF."},
|
|
329
|
-
{
|
|
330
|
-
"role": "assistant",
|
|
331
|
-
"tool_calls": [{
|
|
332
|
-
"function": {
|
|
333
|
-
"name": "get_fun_activities",
|
|
334
|
-
"arguments": json.dumps({"city": "SF"}),
|
|
335
|
-
}
|
|
336
|
-
}],
|
|
337
|
-
},
|
|
338
|
-
{"role": "tool", "content": "Nothing fun is happening, you should stay indoors and read!"},
|
|
339
|
-
{"role": "assistant", "content": "The weather in SF is 80 degrees and sunny, but there is nothing fun happening."},
|
|
340
|
-
]
|
|
341
|
-
reference_outputs = [
|
|
342
|
-
{"role": "user", "content": "What is the weather in SF and is there anything fun happening?"},
|
|
343
|
-
{
|
|
344
|
-
"role": "assistant",
|
|
345
|
-
"tool_calls": [
|
|
346
|
-
{
|
|
347
|
-
"function": {
|
|
348
|
-
"name": "get_fun_activities",
|
|
349
|
-
"arguments": json.dumps({"city": "San Francisco"}),
|
|
350
|
-
}
|
|
351
|
-
},
|
|
352
|
-
{
|
|
353
|
-
"function": {
|
|
354
|
-
"name": "get_weather",
|
|
355
|
-
"arguments": json.dumps({"city": "San Francisco"}),
|
|
356
|
-
}
|
|
357
|
-
},
|
|
358
|
-
],
|
|
359
|
-
},
|
|
360
|
-
{"role": "tool", "content": "Nothing fun is happening, you should stay indoors and read!"},
|
|
361
|
-
{"role": "tool", "content": "It's 80 degrees and sunny in SF."},
|
|
362
|
-
{"role": "assistant", "content": "In SF, it's 80˚ and sunny, but there is nothing fun happening."},
|
|
363
|
-
]
|
|
364
|
-
result = trajectory_unordered_match(
|
|
365
|
-
outputs=outputs, reference_outputs=reference_outputs
|
|
366
|
-
)
|
|
367
|
-
|
|
368
|
-
print(result)
|
|
369
|
-
```
|
|
370
|
-
|
|
371
|
-
```
|
|
372
|
-
{
|
|
373
|
-
'key': 'trajectory_unordered_match',
|
|
374
|
-
'score': True,
|
|
375
|
-
'comment': None,
|
|
376
|
-
}
|
|
377
|
-
```
|
|
378
|
-
</details>
|
|
379
|
-
|
|
380
|
-
<details open>
|
|
381
|
-
<summary>TypeScript</summary>
|
|
382
|
-
|
|
383
165
|
```ts
|
|
384
166
|
import { trajectoryUnorderedMatch } from "agentevals";
|
|
385
167
|
|
|
@@ -446,77 +228,11 @@ console.log(result)
|
|
|
446
228
|
'score': true,
|
|
447
229
|
}
|
|
448
230
|
```
|
|
449
|
-
</details>
|
|
450
231
|
|
|
451
232
|
#### Subset and superset match
|
|
452
233
|
|
|
453
234
|
There are other evaluators for checking partial trajectory matches (ensuring that a trajectory contains a subset or superset of tool calls compared to a reference trajectory).
|
|
454
235
|
|
|
455
|
-
<details>
|
|
456
|
-
<summary>Python</summary>
|
|
457
|
-
|
|
458
|
-
```python
|
|
459
|
-
import json
|
|
460
|
-
from openevals.trajectory.subset import trajectory_subset
|
|
461
|
-
# from openevals.trajectory.superset import trajectory_superset
|
|
462
|
-
|
|
463
|
-
outputs = [
|
|
464
|
-
{"role": "user", "content": "What is the weather in SF and London?"},
|
|
465
|
-
{
|
|
466
|
-
"role": "assistant",
|
|
467
|
-
"tool_calls": [{
|
|
468
|
-
"function": {
|
|
469
|
-
"name": "get_weather",
|
|
470
|
-
"arguments": json.dumps({"city": "SF and London"}),
|
|
471
|
-
}
|
|
472
|
-
}],
|
|
473
|
-
},
|
|
474
|
-
{"role": "tool", "content": "It's 80 degrees and sunny in SF, and 90 degrees and rainy in London."},
|
|
475
|
-
{"role": "assistant", "content": "The weather in SF is 80 degrees and sunny. In London, it's 90 degrees and rainy."},
|
|
476
|
-
]
|
|
477
|
-
reference_outputs = [
|
|
478
|
-
{"role": "user", "content": "What is the weather in SF and London?"},
|
|
479
|
-
{
|
|
480
|
-
"role": "assistant",
|
|
481
|
-
"tool_calls": [
|
|
482
|
-
{
|
|
483
|
-
"function": {
|
|
484
|
-
"name": "get_weather",
|
|
485
|
-
"arguments": json.dumps({"city": "San Francisco"}),
|
|
486
|
-
}
|
|
487
|
-
},
|
|
488
|
-
{
|
|
489
|
-
"function": {
|
|
490
|
-
"name": "get_weather",
|
|
491
|
-
"arguments": json.dumps({"city": "London"}),
|
|
492
|
-
}
|
|
493
|
-
},
|
|
494
|
-
],
|
|
495
|
-
},
|
|
496
|
-
{"role": "tool", "content": "It's 80 degrees and sunny in San Francisco."},
|
|
497
|
-
{"role": "tool", "content": "It's 90 degrees and rainy in London."},
|
|
498
|
-
{"role": "assistant", "content": "The weather in SF is 80˚ and sunny. In London, it's 90˚ and rainy."},
|
|
499
|
-
]
|
|
500
|
-
|
|
501
|
-
result = trajectory_subset(
|
|
502
|
-
outputs=outputs, reference_outputs=reference_outputs
|
|
503
|
-
)
|
|
504
|
-
|
|
505
|
-
print(result)
|
|
506
|
-
```
|
|
507
|
-
|
|
508
|
-
```
|
|
509
|
-
{
|
|
510
|
-
'key': 'trajectory_subset',
|
|
511
|
-
'score': True,
|
|
512
|
-
'comment': None,
|
|
513
|
-
}
|
|
514
|
-
```
|
|
515
|
-
</details>
|
|
516
|
-
|
|
517
|
-
<details open>
|
|
518
|
-
<summary>TypeScript</summary>
|
|
519
|
-
|
|
520
236
|
```ts
|
|
521
237
|
import { trajectorySubset } from "agentevals";
|
|
522
238
|
// import { trajectorySuperset } from "agentevals";
|
|
@@ -574,7 +290,6 @@ console.log(result)
|
|
|
574
290
|
'score': true,
|
|
575
291
|
}
|
|
576
292
|
```
|
|
577
|
-
</details>
|
|
578
293
|
|
|
579
294
|
#### Trajectory LLM-as-judge
|
|
580
295
|
|
|
@@ -582,69 +297,6 @@ The LLM-as-judge trajectory evaluator that uses an LLM to evaluate the trajector
|
|
|
582
297
|
and supports
|
|
583
298
|
This allows for more flexibility in the trajectory comparison:
|
|
584
299
|
|
|
585
|
-
<details>
|
|
586
|
-
<summary>Python</summary>
|
|
587
|
-
|
|
588
|
-
```python
|
|
589
|
-
import json
|
|
590
|
-
from openevals.trajectory.llm import create_trajectory_llm_as_judge, TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE
|
|
591
|
-
|
|
592
|
-
evaluator = create_trajectory_llm_as_judge(
|
|
593
|
-
prompt=TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE,
|
|
594
|
-
model="openai:o3-mini"
|
|
595
|
-
)
|
|
596
|
-
outputs = [
|
|
597
|
-
{"role": "user", "content": "What is the weather in SF?"},
|
|
598
|
-
{
|
|
599
|
-
"role": "assistant",
|
|
600
|
-
"tool_calls": [
|
|
601
|
-
{
|
|
602
|
-
"function": {
|
|
603
|
-
"name": "get_weather",
|
|
604
|
-
"arguments": json.dumps({"city": "SF"}),
|
|
605
|
-
}
|
|
606
|
-
}
|
|
607
|
-
],
|
|
608
|
-
},
|
|
609
|
-
{"role": "tool", "content": "It's 80 degrees and sunny in SF."},
|
|
610
|
-
{"role": "assistant", "content": "The weather in SF is 80 degrees and sunny."},
|
|
611
|
-
]
|
|
612
|
-
reference_outputs = [
|
|
613
|
-
{"role": "user", "content": "What is the weather in SF?"},
|
|
614
|
-
{
|
|
615
|
-
"role": "assistant",
|
|
616
|
-
"tool_calls": [
|
|
617
|
-
{
|
|
618
|
-
"function": {
|
|
619
|
-
"name": "get_weather",
|
|
620
|
-
"arguments": json.dumps({"city": "San Francisco"}),
|
|
621
|
-
}
|
|
622
|
-
}
|
|
623
|
-
],
|
|
624
|
-
},
|
|
625
|
-
{"role": "tool", "content": "It's 80 degrees and sunny in San Francisco."},
|
|
626
|
-
{"role": "assistant", "content": "The weather in SF is 80˚ and sunny."},
|
|
627
|
-
]
|
|
628
|
-
eval_result = evaluator(
|
|
629
|
-
outputs=outputs,
|
|
630
|
-
reference_outputs=reference_outputs,
|
|
631
|
-
)
|
|
632
|
-
|
|
633
|
-
print(eval_result)
|
|
634
|
-
```
|
|
635
|
-
|
|
636
|
-
```
|
|
637
|
-
{
|
|
638
|
-
'key': 'trajectory_accuracy',
|
|
639
|
-
'score': True,
|
|
640
|
-
'comment': 'The provided agent trajectory is consistent with the reference. Both trajectories start with the same user query and then correctly invoke a weather lookup through a tool call. Although the reference uses "San Francisco" while the provided trajectory uses "SF" and there is a minor formatting difference (degrees vs. ˚), these differences do not affect the correctness or essential steps of the process. Thus, the score should be: true.'
|
|
641
|
-
}
|
|
642
|
-
```
|
|
643
|
-
</details>
|
|
644
|
-
|
|
645
|
-
<details open>
|
|
646
|
-
<summary>TypeScript</summary>
|
|
647
|
-
|
|
648
300
|
```ts
|
|
649
301
|
import {
|
|
650
302
|
createTrajectoryLLMAsJudge,
|
|
@@ -704,7 +356,6 @@ console.log(result)
|
|
|
704
356
|
'comment': 'The provided agent trajectory is consistent with the reference. Both trajectories start with the same user query and then correctly invoke a weather lookup through a tool call. Although the reference uses "San Francisco" while the provided trajectory uses "SF" and there is a minor formatting difference (degrees vs. ˚), these differences do not affect the correctness or essential steps of the process. Thus, the score should be: true.'
|
|
705
357
|
}
|
|
706
358
|
```
|
|
707
|
-
</details>
|
|
708
359
|
|
|
709
360
|
`create_trajectory_llm_as_judge` takes the same parameters as [`create_llm_as_judge`](https://github.com/langchain-ai/openevals?tab=readme-ov-file#llm-as-judge) in `openevals`, so you can customize the prompt and scoring output as needed.
|
|
710
361
|
|
|
@@ -715,25 +366,6 @@ In addition to `prompt` and `model`, the following parameters are also available
|
|
|
715
366
|
- `system`: a string that sets a system prompt for the judge model by adding a system message before other parts of the prompt.
|
|
716
367
|
- `few_shot_examples`: a list of example dicts that are appended to the end of the prompt. This is useful for providing the judge model with examples of good and bad outputs. The required structure looks like this:
|
|
717
368
|
|
|
718
|
-
<details>
|
|
719
|
-
<summary>Python</summary>
|
|
720
|
-
|
|
721
|
-
```python
|
|
722
|
-
few_shot_examples = [
|
|
723
|
-
{
|
|
724
|
-
"inputs": "What color is the sky?",
|
|
725
|
-
"outputs": "The sky is red.",
|
|
726
|
-
"reasoning": "The sky is red because it is early evening.",
|
|
727
|
-
"score": 1,
|
|
728
|
-
}
|
|
729
|
-
]
|
|
730
|
-
```
|
|
731
|
-
|
|
732
|
-
</details>
|
|
733
|
-
|
|
734
|
-
<details open>
|
|
735
|
-
<summary>TypeScript</summary>
|
|
736
|
-
|
|
737
369
|
```ts
|
|
738
370
|
const fewShotExamples = [
|
|
739
371
|
{
|
|
@@ -744,7 +376,6 @@ const fewShotExamples = [
|
|
|
744
376
|
}
|
|
745
377
|
];
|
|
746
378
|
```
|
|
747
|
-
</details>
|
|
748
379
|
|
|
749
380
|
See the [`openevals`](https://github.com/langchain-ai/openevals?tab=readme-ov-file#llm-as-judge) repo for a fully up to date list of parameters.
|
|
750
381
|
|
|
@@ -754,70 +385,78 @@ For frameworks like [LangGraph](https://github.com/langchain-ai/langgraph) that
|
|
|
754
385
|
|
|
755
386
|
The below examples will use LangGraph with the built-in formatting utility, but graph evaluators accept input in the following general format:
|
|
756
387
|
|
|
757
|
-
```
|
|
758
|
-
|
|
759
|
-
|
|
760
|
-
|
|
761
|
-
|
|
762
|
-
|
|
763
|
-
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
|
|
768
|
-
|
|
769
|
-
|
|
388
|
+
```ts
|
|
389
|
+
export type GraphTrajectory = {
|
|
390
|
+
inputs?: (Record<string, unknown> | null)[];
|
|
391
|
+
results: Record<string, unknown>[];
|
|
392
|
+
steps: string[][];
|
|
393
|
+
};
|
|
394
|
+
|
|
395
|
+
const evaluator: ({ inputs, outputs, referenceOutputs, ...extra }: {
|
|
396
|
+
inputs: (string | Record<string, unknown> | null)[] | {
|
|
397
|
+
inputs: (string | Record<string, unknown> | null)[];
|
|
398
|
+
};
|
|
399
|
+
outputs: GraphTrajectory;
|
|
400
|
+
referenceOutputs?: GraphTrajectory;
|
|
401
|
+
[key: string]: unknown;
|
|
402
|
+
}) => ...
|
|
770
403
|
```
|
|
771
404
|
|
|
772
405
|
Where `inputs` is a list of inputs (or a dict with a key named `"inputs"`) to the graph whose items each represent the start of a new invocation in a thread, `results` represents the final output from each turn in the thread, and `steps` represents the internal steps taken for each turn.
|
|
773
406
|
|
|
774
407
|
#### Graph trajectory LLM-as-judge
|
|
775
408
|
|
|
776
|
-
This evaluator is similar to the `trajectory_llm_as_judge` evaluator, but it works with graph trajectories instead of message trajectories. Below, we set up a LangGraph agent, extract a trajectory from it using the built-in utils, and pass it to the evaluator:
|
|
777
|
-
|
|
778
|
-
```python
|
|
779
|
-
from agentevals.graph_trajectory.utils import (
|
|
780
|
-
extract_langgraph_trajectory_from_thread,
|
|
781
|
-
)
|
|
782
|
-
from agentevals.graph_trajectory.llm import create_graph_trajectory_llm_as_judge
|
|
783
|
-
|
|
784
|
-
from langgraph.prebuilt import create_react_agent
|
|
785
|
-
from langgraph.checkpoint.memory import MemorySaver
|
|
786
|
-
from langgraph.types import Command, interrupt
|
|
787
|
-
|
|
788
|
-
from langchain_core.tools import tool
|
|
789
|
-
|
|
790
|
-
@tool
|
|
791
|
-
def search(query: str):
|
|
792
|
-
"""Call to surf the web."""
|
|
793
|
-
user_answer = interrupt("Tell me the answer to the question.")
|
|
794
|
-
return user_answer
|
|
409
|
+
This evaluator is similar to the `trajectory_llm_as_judge` evaluator, but it works with graph trajectories instead of message trajectories. Below, we set up a LangGraph agent, extract a trajectory from it using the built-in utils, and pass it to the evaluator. First, let's set up our graph, call it, and then extract the trajectory:
|
|
795
410
|
|
|
796
|
-
|
|
411
|
+
```ts
|
|
412
|
+
import { tool } from "@langchain/core/tools";
|
|
413
|
+
import { ChatOpenAI } from "@langchain/openai";
|
|
414
|
+
import { createReactAgent } from "@langchain/langgraph/prebuilt";
|
|
415
|
+
import { MemorySaver, interrupt } from "@langchain/langgraph";
|
|
416
|
+
import { z } from "zod";
|
|
417
|
+
import { extractLangGraphTrajectoryFromThread } from "agentevals";
|
|
418
|
+
|
|
419
|
+
const search = tool((_): string => {
|
|
420
|
+
const userAnswer = interrupt("Tell me the answer to the question.")
|
|
421
|
+
return userAnswer;
|
|
422
|
+
}, {
|
|
423
|
+
name: "search",
|
|
424
|
+
description: "Call to surf the web.",
|
|
425
|
+
schema: z.object({
|
|
426
|
+
query: z.string()
|
|
427
|
+
})
|
|
428
|
+
})
|
|
429
|
+
|
|
430
|
+
const tools = [search];
|
|
431
|
+
|
|
432
|
+
// Create a checkpointer
|
|
433
|
+
const checkpointer = new MemorySaver();
|
|
434
|
+
|
|
435
|
+
// Create the React agent
|
|
436
|
+
const graph = createReactAgent({
|
|
437
|
+
llm: new ChatOpenAI({ model: "gpt-4o-mini" }),
|
|
438
|
+
tools,
|
|
439
|
+
checkpointer,
|
|
440
|
+
});
|
|
797
441
|
|
|
798
|
-
|
|
799
|
-
graph
|
|
800
|
-
|
|
801
|
-
|
|
802
|
-
|
|
803
|
-
)
|
|
442
|
+
// Invoke the graph with initial message
|
|
443
|
+
await graph.invoke(
|
|
444
|
+
{ messages: [{ role: "user", content: "what's the weather in sf?" }] },
|
|
445
|
+
{ configurable: { thread_id: "1" } }
|
|
446
|
+
);
|
|
804
447
|
|
|
805
|
-
|
|
806
|
-
|
|
807
|
-
|
|
808
|
-
|
|
809
|
-
|
|
810
|
-
graph.invoke(
|
|
811
|
-
Command(resume="It is rainy and 70 degrees!"),
|
|
812
|
-
config={"configurable": {"thread_id": "1"}},
|
|
813
|
-
)
|
|
448
|
+
// Resume the agent with a new command (simulating human-in-the-loop)
|
|
449
|
+
await graph.invoke(
|
|
450
|
+
{ messages: [{ role: "user", content: "It is rainy and 70 degrees!" }] },
|
|
451
|
+
{ configurable: { thread_id: "1" } }
|
|
452
|
+
);
|
|
814
453
|
|
|
815
|
-
|
|
816
|
-
|
|
817
|
-
|
|
818
|
-
)
|
|
454
|
+
const extractedTrajectory = await extractLangGraphTrajectoryFromThread(
|
|
455
|
+
graph,
|
|
456
|
+
{ configurable: { thread_id: "1" } },
|
|
457
|
+
);
|
|
819
458
|
|
|
820
|
-
|
|
459
|
+
console.log(extractedTrajectory);
|
|
821
460
|
```
|
|
822
461
|
|
|
823
462
|
```
|
|
@@ -850,17 +489,21 @@ print(extracted_trajectory)
|
|
|
850
489
|
}
|
|
851
490
|
```
|
|
852
491
|
|
|
853
|
-
|
|
854
|
-
|
|
855
|
-
|
|
856
|
-
|
|
492
|
+
Now, we can pass the extracted trajectory to the evaluator:
|
|
493
|
+
|
|
494
|
+
```ts
|
|
495
|
+
import { createGraphTrajectoryLLMAsJudge } from "agentevals";
|
|
496
|
+
|
|
497
|
+
const graphTrajectoryEvaluator = createGraphTrajectoryLLMAsJudge({
|
|
498
|
+
model: "openai:o3-mini",
|
|
499
|
+
})
|
|
857
500
|
|
|
858
|
-
res =
|
|
859
|
-
inputs=
|
|
860
|
-
outputs=
|
|
501
|
+
const res = await graphTrajectoryEvaluator(
|
|
502
|
+
inputs=extractedTrajectory.inputs,
|
|
503
|
+
outputs=extractedTrajectory.outputs,
|
|
861
504
|
)
|
|
862
505
|
|
|
863
|
-
|
|
506
|
+
console.log(res);
|
|
864
507
|
```
|
|
865
508
|
|
|
866
509
|
```
|
|
@@ -871,10 +514,10 @@ print(res)
|
|
|
871
514
|
}
|
|
872
515
|
```
|
|
873
516
|
|
|
874
|
-
Note that though this evaluator takes the typical `inputs`, `outputs`, and `
|
|
517
|
+
Note that though this evaluator takes the typical `inputs`, `outputs`, and `referenceOutputs` parameters, it internally combines `inputs` and `outputs` to form a `thread`. Therefore, if you want to customize the prompt, your prompt should also contain a `thread` input variable:
|
|
875
518
|
|
|
876
|
-
```
|
|
877
|
-
CUSTOM_PROMPT =
|
|
519
|
+
```ts
|
|
520
|
+
const CUSTOM_PROMPT = `You are an expert data labeler.
|
|
878
521
|
Your task is to grade the accuracy of an AI agent's internal steps in resolving a user queries.
|
|
879
522
|
|
|
880
523
|
<Rubric>
|
|
@@ -896,82 +539,85 @@ Your task is to grade the accuracy of an AI agent's internal steps in resolving
|
|
|
896
539
|
</thread>
|
|
897
540
|
|
|
898
541
|
{reference_outputs}
|
|
899
|
-
|
|
542
|
+
`
|
|
900
543
|
|
|
901
|
-
|
|
902
|
-
|
|
903
|
-
|
|
904
|
-
)
|
|
905
|
-
res = await
|
|
906
|
-
|
|
907
|
-
|
|
908
|
-
|
|
544
|
+
const graphTrajectoryEvaluator = createGraphTrajectoryLLMAsJudge({
|
|
545
|
+
prompt: CUSTOM_PROMPT,
|
|
546
|
+
model: "openai:o3-mini",
|
|
547
|
+
})
|
|
548
|
+
res = await graphTrajectoryEvaluator(
|
|
549
|
+
inputs=extractedTrajectory.inputs,
|
|
550
|
+
outputs=extractedTrajectory.outputs,
|
|
909
551
|
)
|
|
910
552
|
```
|
|
911
553
|
|
|
912
|
-
In order to format them properly into the prompt, `
|
|
554
|
+
In order to be formatted properly into the prompt, `referenceOutputs` should be passed in as a `GraphTrajectory` object like `outputs`.
|
|
913
555
|
|
|
914
556
|
Also note that like other LLM-as-judge evaluators, you can pass extra kwargs into the evaluator to format them into the prompt.
|
|
915
557
|
|
|
916
558
|
#### Graph trajectory strict match
|
|
917
559
|
|
|
918
|
-
The `
|
|
560
|
+
The `graphTrajectoryStrictMatch` evaluator is a simple evaluator that checks if the steps in the provided graph trajectory match the reference trajectory exactly.
|
|
919
561
|
|
|
920
|
-
```
|
|
921
|
-
|
|
922
|
-
|
|
923
|
-
|
|
924
|
-
|
|
925
|
-
|
|
926
|
-
|
|
927
|
-
|
|
928
|
-
|
|
929
|
-
|
|
930
|
-
|
|
931
|
-
|
|
932
|
-
|
|
933
|
-
|
|
934
|
-
|
|
935
|
-
|
|
936
|
-
|
|
937
|
-
|
|
938
|
-
|
|
939
|
-
tools = [search]
|
|
562
|
+
```ts
|
|
563
|
+
import { tool } from "@langchain/core/tools";
|
|
564
|
+
import { ChatOpenAI } from "@langchain/openai";
|
|
565
|
+
import { createReactAgent } from "@langchain/langgraph/prebuilt";
|
|
566
|
+
import { MemorySaver, interrupt } from "@langchain/langgraph";
|
|
567
|
+
import { z } from "zod";
|
|
568
|
+
import { extractLangGraphTrajectoryFromThread, graphTrajectoryStrictMatch } from "agentevals";
|
|
569
|
+
|
|
570
|
+
const search = tool((_): string => {
|
|
571
|
+
const userAnswer = interrupt("Tell me the answer to the question.")
|
|
572
|
+
return userAnswer;
|
|
573
|
+
}, {
|
|
574
|
+
name: "search",
|
|
575
|
+
description: "Call to surf the web.",
|
|
576
|
+
schema: z.object({
|
|
577
|
+
query: z.string()
|
|
578
|
+
})
|
|
579
|
+
})
|
|
580
|
+
|
|
581
|
+
const tools = [search];
|
|
582
|
+
|
|
583
|
+
// Create a checkpointer
|
|
584
|
+
const checkpointer = new MemorySaver();
|
|
585
|
+
|
|
586
|
+
// Create the React agent
|
|
587
|
+
const graph = createReactAgent({
|
|
588
|
+
llm: new ChatOpenAI({ model: "gpt-4o-mini" }),
|
|
589
|
+
tools,
|
|
590
|
+
checkpointer,
|
|
591
|
+
});
|
|
940
592
|
|
|
941
|
-
|
|
942
|
-
graph
|
|
943
|
-
|
|
944
|
-
|
|
945
|
-
|
|
946
|
-
)
|
|
593
|
+
// Invoke the graph with initial message
|
|
594
|
+
await graph.invoke(
|
|
595
|
+
{ messages: [{ role: "user", content: "what's the weather in sf?" }] },
|
|
596
|
+
{ configurable: { thread_id: "1" } }
|
|
597
|
+
);
|
|
947
598
|
|
|
948
|
-
|
|
949
|
-
|
|
950
|
-
|
|
951
|
-
|
|
952
|
-
|
|
953
|
-
graph.invoke(
|
|
954
|
-
Command(resume="It is rainy and 70 degrees!"),
|
|
955
|
-
config={"configurable": {"thread_id": "1"}},
|
|
956
|
-
)
|
|
599
|
+
// Resume the agent with a new command (simulating human-in-the-loop)
|
|
600
|
+
await graph.invoke(
|
|
601
|
+
{ messages: [{ role: "user", content: "It is rainy and 70 degrees!" }] },
|
|
602
|
+
{ configurable: { thread_id: "1" } }
|
|
603
|
+
);
|
|
957
604
|
|
|
958
|
-
|
|
959
|
-
|
|
960
|
-
|
|
961
|
-
)
|
|
605
|
+
const extractedTrajectory = await extractLangGraphTrajectoryFromThread(
|
|
606
|
+
graph,
|
|
607
|
+
{ configurable: { thread_id: "1" } },
|
|
608
|
+
);
|
|
962
609
|
|
|
963
|
-
|
|
964
|
-
|
|
965
|
-
|
|
966
|
-
"steps": [["__start__", "agent", "tools", "__interrupt__"], ["agent"]],
|
|
610
|
+
const referenceTrajectory = {
|
|
611
|
+
results: [],
|
|
612
|
+
steps: [["__start__", "agent", "tools", "__interrupt__"], ["agent"]],
|
|
967
613
|
}
|
|
968
614
|
|
|
969
|
-
|
|
970
|
-
|
|
971
|
-
|
|
972
|
-
)
|
|
615
|
+
const result = await graphTrajectoryStrictMatch({
|
|
616
|
+
outputs: trajectory.outputs,
|
|
617
|
+
referenceOutputs: referenceOutputs!,
|
|
618
|
+
});
|
|
973
619
|
|
|
974
|
-
|
|
620
|
+
console.log(result);
|
|
975
621
|
```
|
|
976
622
|
|
|
977
623
|
```
|
|
@@ -980,37 +626,6 @@ print(res)
|
|
|
980
626
|
'score': True,
|
|
981
627
|
}
|
|
982
628
|
```
|
|
983
|
-
|
|
984
|
-
## Python Async Support
|
|
985
|
-
|
|
986
|
-
All `agentevals` evaluators support Python [asyncio](https://docs.python.org/3/library/asyncio.html). As a convention, evaluators that use a factory function will have `async` put immediately after `create_` in the function name (for example, `create_async_trajectory_llm_as_judge`), and evaluators used directly will end in `async` (e.g. `trajectory_strict_match_async`).
|
|
987
|
-
|
|
988
|
-
Here's an example of how to use the `create_async_llm_as_judge` evaluator asynchronously:
|
|
989
|
-
|
|
990
|
-
```python
|
|
991
|
-
from agentevals.trajectory.llm import create_async_trajectory_llm_as_judge
|
|
992
|
-
|
|
993
|
-
evaluator = create_async_llm_as_judge(
|
|
994
|
-
prompt="What is the weather in {inputs}?",
|
|
995
|
-
)
|
|
996
|
-
|
|
997
|
-
result = await evaluator(inputs="San Francisco")
|
|
998
|
-
```
|
|
999
|
-
|
|
1000
|
-
If you are using the OpenAI client directly, remember to pass in `AsyncOpenAI` as the `judge` parameter:
|
|
1001
|
-
|
|
1002
|
-
```python
|
|
1003
|
-
from openai import AsyncOpenAI
|
|
1004
|
-
|
|
1005
|
-
evaluator = create_async_llm_as_judge(
|
|
1006
|
-
prompt="What is the weather in {inputs}?",
|
|
1007
|
-
judge=AsyncOpenAI(),
|
|
1008
|
-
model="o3-mini",
|
|
1009
|
-
)
|
|
1010
|
-
|
|
1011
|
-
result = await evaluator(inputs="San Francisco")
|
|
1012
|
-
```
|
|
1013
|
-
|
|
1014
629
|
## LangSmith Integration
|
|
1015
630
|
|
|
1016
631
|
For tracking experiments over time, you can log evaluator results to [LangSmith](https://smith.langchain.com/), a platform for building production-grade LLM applications that includes tracing, evaluation, and experimentation tools.
|
|
@@ -1019,7 +634,7 @@ LangSmith currently offers two ways to run evals. We'll give a quick example of
|
|
|
1019
634
|
|
|
1020
635
|
### Pytest or Vitest/Jest
|
|
1021
636
|
|
|
1022
|
-
First, follow [these instructions](https://docs.smith.langchain.com/evaluation/how_to_guides/
|
|
637
|
+
First, follow [these instructions](https://docs.smith.langchain.com/evaluation/how_to_guides/vitest_jest) to set up LangSmith's Vitest/Jest runner,
|
|
1023
638
|
setting appropriate environment variables:
|
|
1024
639
|
|
|
1025
640
|
```bash
|
|
@@ -1027,80 +642,6 @@ export LANGSMITH_API_KEY="your_langsmith_api_key"
|
|
|
1027
642
|
export LANGSMITH_TRACING="true"
|
|
1028
643
|
```
|
|
1029
644
|
|
|
1030
|
-
<details>
|
|
1031
|
-
<summary>Python</summary>
|
|
1032
|
-
|
|
1033
|
-
Then, set up a file named `test_trajectory.py` with the following contents:
|
|
1034
|
-
|
|
1035
|
-
```python
|
|
1036
|
-
import pytest
|
|
1037
|
-
import json
|
|
1038
|
-
|
|
1039
|
-
from langsmith import testing as t
|
|
1040
|
-
|
|
1041
|
-
from agentevals.trajectory.llm import create_trajectory_llm_as_judge
|
|
1042
|
-
|
|
1043
|
-
trajectory_evaluator = create_trajectory_llm_as_judge(
|
|
1044
|
-
model="openai:o3-mini",
|
|
1045
|
-
)
|
|
1046
|
-
|
|
1047
|
-
@pytest.mark.langsmith
|
|
1048
|
-
def test_trajectory_accuracy():
|
|
1049
|
-
outputs = [
|
|
1050
|
-
{"role": "user", "content": "What is the weather in SF?"},
|
|
1051
|
-
{
|
|
1052
|
-
"role": "assistant",
|
|
1053
|
-
"tool_calls": [
|
|
1054
|
-
{
|
|
1055
|
-
"function": {
|
|
1056
|
-
"name": "get_weather",
|
|
1057
|
-
"arguments": json.dumps({"city": "SF"}),
|
|
1058
|
-
}
|
|
1059
|
-
}
|
|
1060
|
-
],
|
|
1061
|
-
},
|
|
1062
|
-
{"role": "tool", "content": "It's 80 degrees and sunny in SF."},
|
|
1063
|
-
{"role": "assistant", "content": "The weather in SF is 80 degrees and sunny."},
|
|
1064
|
-
]
|
|
1065
|
-
reference_outputs = [
|
|
1066
|
-
{"role": "user", "content": "What is the weather in SF?"},
|
|
1067
|
-
{
|
|
1068
|
-
"role": "assistant",
|
|
1069
|
-
"tool_calls": [
|
|
1070
|
-
{
|
|
1071
|
-
"function": {
|
|
1072
|
-
"name": "get_weather",
|
|
1073
|
-
"arguments": json.dumps({"city": "San Francisco"}),
|
|
1074
|
-
}
|
|
1075
|
-
}
|
|
1076
|
-
],
|
|
1077
|
-
},
|
|
1078
|
-
{"role": "tool", "content": "It's 80 degrees and sunny in San Francisco."},
|
|
1079
|
-
{"role": "assistant", "content": "The weather in SF is 80˚ and sunny."},
|
|
1080
|
-
]
|
|
1081
|
-
|
|
1082
|
-
t.log_inputs({})
|
|
1083
|
-
t.log_outputs({"messages": outputs})
|
|
1084
|
-
t.log_reference_outputs({"messages": reference_outputs})
|
|
1085
|
-
|
|
1086
|
-
trajectory_evaluator(
|
|
1087
|
-
outputs=outputs,
|
|
1088
|
-
reference_outputs=reference_outputs
|
|
1089
|
-
)
|
|
1090
|
-
```
|
|
1091
|
-
|
|
1092
|
-
Note that when creating the evaluator, you can pass a `feedback_key` parameter. This will be used to name the feedback in LangSmith.
|
|
1093
|
-
|
|
1094
|
-
Now, run the eval with pytest:
|
|
1095
|
-
|
|
1096
|
-
```bash
|
|
1097
|
-
pytest test_trajectory.py --langsmith-output
|
|
1098
|
-
```
|
|
1099
|
-
|
|
1100
|
-
</details>
|
|
1101
|
-
|
|
1102
|
-
<details open>
|
|
1103
|
-
<summary>TypeScript</summary>
|
|
1104
645
|
|
|
1105
646
|
Then, set up a file named `test_trajectory.eval.ts` with the following contents:
|
|
1106
647
|
|
|
@@ -1176,7 +717,6 @@ Now, run the eval with your runner of choice:
|
|
|
1176
717
|
vitest run test_trajectory.eval.ts
|
|
1177
718
|
```
|
|
1178
719
|
|
|
1179
|
-
</details>
|
|
1180
720
|
|
|
1181
721
|
Feedback from the prebuilt evaluator will be automatically logged in LangSmith as a table of results like this in your terminal:
|
|
1182
722
|
|
|
@@ -1190,51 +730,38 @@ And you should also see the results in the experiment view in LangSmith:
|
|
|
1190
730
|
|
|
1191
731
|
Alternatively, you can [create a dataset in LangSmith](https://docs.smith.langchain.com/evaluation/concepts#dataset-curation) and use your created evaluators with LangSmith's [`evaluate`](https://docs.smith.langchain.com/evaluation#8-run-and-view-results) function:
|
|
1192
732
|
|
|
1193
|
-
<details>
|
|
1194
|
-
<summary>Python</summary>
|
|
1195
|
-
|
|
1196
|
-
```python
|
|
1197
|
-
from langsmith import Client
|
|
1198
|
-
from agentevals.trajectory.llm import create_trajectory_llm_as_judge
|
|
1199
|
-
|
|
1200
|
-
client = Client()
|
|
1201
|
-
|
|
1202
|
-
trajectory_evaluator = create_trajectory_llm_as_judge(
|
|
1203
|
-
model="openai:o3-mini",
|
|
1204
|
-
)
|
|
1205
|
-
|
|
1206
|
-
experiment_results = client.evaluate(
|
|
1207
|
-
# This is a dummy target function, replace with your actual LLM-based system
|
|
1208
|
-
lambda inputs: "What color is the sky?",
|
|
1209
|
-
data="Sample dataset",
|
|
1210
|
-
evaluators=[
|
|
1211
|
-
trajectory_evaluator
|
|
1212
|
-
]
|
|
1213
|
-
)
|
|
1214
|
-
```
|
|
1215
|
-
|
|
1216
|
-
</details>
|
|
1217
|
-
|
|
1218
|
-
<details open>
|
|
1219
|
-
<summary>TypeScript</summary>
|
|
1220
|
-
|
|
1221
733
|
```ts
|
|
1222
734
|
import { evaluate } from "langsmith/evaluation";
|
|
1223
|
-
import { createTrajectoryLLMAsJudge,
|
|
735
|
+
import { createTrajectoryLLMAsJudge, TRAJECTORY_ACCURACY_PROMPT } from "agentevals";
|
|
1224
736
|
|
|
1225
737
|
const trajectoryEvaluator = createTrajectoryLLMAsJudge({
|
|
1226
738
|
model: "openai:o3-mini",
|
|
739
|
+
prompt: TRAJECTORY_ACCURACY_PROMPT
|
|
1227
740
|
});
|
|
1228
741
|
|
|
1229
742
|
await evaluate(
|
|
1230
|
-
(inputs) =>
|
|
743
|
+
(inputs) => [
|
|
744
|
+
{role: "user", content: "What is the weather in SF?"},
|
|
745
|
+
{
|
|
746
|
+
role: "assistant",
|
|
747
|
+
tool_calls: [
|
|
748
|
+
{
|
|
749
|
+
function: {
|
|
750
|
+
name: "get_weather",
|
|
751
|
+
arguments: JSON.stringify({ city: "SF" }),
|
|
752
|
+
}
|
|
753
|
+
}
|
|
754
|
+
],
|
|
755
|
+
},
|
|
756
|
+
{role: "tool", content: "It's 80 degrees and sunny in SF."},
|
|
757
|
+
{role: "assistant", content: "The weather in SF is 80 degrees and sunny."},
|
|
758
|
+
],
|
|
1231
759
|
{
|
|
1232
760
|
data: datasetName,
|
|
1233
761
|
evaluators: [trajectoryEvaluator],
|
|
1234
762
|
}
|
|
1235
763
|
);
|
|
1236
764
|
```
|
|
1237
|
-
</details>
|
|
1238
765
|
|
|
1239
766
|
## Thank you!
|
|
1240
767
|
|
package/dist/utils.cjs
CHANGED
|
@@ -3,7 +3,7 @@ Object.defineProperty(exports, "__esModule", { value: true });
|
|
|
3
3
|
exports._runEvaluator = exports.processScore = exports._normalizeToOpenAIMessagesList = exports._convertToOpenAIMessage = void 0;
|
|
4
4
|
const messages_1 = require("@langchain/core/messages");
|
|
5
5
|
const openai_1 = require("@langchain/openai");
|
|
6
|
-
const jestlike_1 = require("langsmith/utils/jestlike");
|
|
6
|
+
const utils_1 = require("openevals/utils");
|
|
7
7
|
const _convertToOpenAIMessage = (message) => {
|
|
8
8
|
if ((0, messages_1.isBaseMessage)(message)) {
|
|
9
9
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
@@ -51,39 +51,6 @@ const processScore = (_, value) => {
|
|
|
51
51
|
};
|
|
52
52
|
exports.processScore = processScore;
|
|
53
53
|
const _runEvaluator = async (runName, scorer, feedbackKey, extra) => {
|
|
54
|
-
const runScorer = async (params) => {
|
|
55
|
-
let score = await scorer(params);
|
|
56
|
-
let reasoning;
|
|
57
|
-
const results = [];
|
|
58
|
-
if (!Array.isArray(score) && typeof score === "object") {
|
|
59
|
-
for (const [key, value] of Object.entries(score)) {
|
|
60
|
-
const [keyScore, reasoning] = (0, exports.processScore)(key, value);
|
|
61
|
-
results.push({ key, score: keyScore, comment: reasoning });
|
|
62
|
-
}
|
|
63
|
-
}
|
|
64
|
-
else {
|
|
65
|
-
if (Array.isArray(score)) {
|
|
66
|
-
reasoning = score[1];
|
|
67
|
-
score = score[0];
|
|
68
|
-
}
|
|
69
|
-
results.push({ key: feedbackKey, score, comment: reasoning });
|
|
70
|
-
}
|
|
71
|
-
if (results.length === 1) {
|
|
72
|
-
return results[0];
|
|
73
|
-
}
|
|
74
|
-
else {
|
|
75
|
-
return results;
|
|
76
|
-
}
|
|
77
|
-
};
|
|
78
|
-
if ((0, jestlike_1.isInTestContext)()) {
|
|
79
|
-
const res = await (0, jestlike_1.wrapEvaluator)(runScorer)(extra ?? {}, {
|
|
80
|
-
name: runName,
|
|
81
|
-
});
|
|
82
|
-
return res;
|
|
83
|
-
}
|
|
84
|
-
else {
|
|
85
|
-
const res = await runScorer(extra ?? {});
|
|
86
|
-
return res;
|
|
87
|
-
}
|
|
54
|
+
return (0, utils_1._runEvaluator)(runName, scorer, feedbackKey, extra, "agentevals");
|
|
88
55
|
};
|
|
89
56
|
exports._runEvaluator = _runEvaluator;
|
package/dist/utils.d.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { BaseMessage } from "@langchain/core/messages";
|
|
2
|
-
import {
|
|
2
|
+
import { EvaluationResultType } from "openevals/utils";
|
|
3
3
|
import { ChatCompletionMessage, MultiResultScorerReturnType, SingleResultScorerReturnType } from "./types.js";
|
|
4
4
|
export declare const _convertToOpenAIMessage: (message: BaseMessage | ChatCompletionMessage) => ChatCompletionMessage;
|
|
5
5
|
export declare const _normalizeToOpenAIMessagesList: (messages?: (BaseMessage | ChatCompletionMessage)[] | {
|
|
@@ -9,5 +9,4 @@ export declare const processScore: (_: string, value: boolean | number | {
|
|
|
9
9
|
score: boolean | number;
|
|
10
10
|
reasoning?: string;
|
|
11
11
|
}) => readonly [number | boolean, string | undefined] | readonly [number | boolean];
|
|
12
|
-
export
|
|
13
|
-
export declare const _runEvaluator: <T extends Record<string, unknown>, O extends MultiResultScorerReturnType | SingleResultScorerReturnType | Promise<MultiResultScorerReturnType | SingleResultScorerReturnType>>(runName: string, scorer: (params: T) => O, feedbackKey: string, extra?: T | undefined) => Promise<EvaluationResultType<O>>;
|
|
12
|
+
export declare const _runEvaluator: <T extends Record<string, unknown>, O extends SingleResultScorerReturnType | MultiResultScorerReturnType | Promise<SingleResultScorerReturnType | MultiResultScorerReturnType>>(runName: string, scorer: (params: T) => O, feedbackKey: string, extra?: T | undefined) => Promise<EvaluationResultType<O>>;
|
package/dist/utils.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { isBaseMessage } from "@langchain/core/messages";
|
|
2
2
|
import { _convertMessagesToOpenAIParams } from "@langchain/openai";
|
|
3
|
-
import { wrapEvaluator, isInTestContext } from "langsmith/utils/jestlike";
|
|
3
|
+
import { _runEvaluator as baseRunEvaluator, } from "openevals/utils";
|
|
4
4
|
export const _convertToOpenAIMessage = (message) => {
|
|
5
5
|
if (isBaseMessage(message)) {
|
|
6
6
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
@@ -45,38 +45,5 @@ export const processScore = (_, value) => {
|
|
|
45
45
|
return [value];
|
|
46
46
|
};
|
|
47
47
|
export const _runEvaluator = async (runName, scorer, feedbackKey, extra) => {
|
|
48
|
-
const runScorer = async (params) => {
|
|
49
|
-
let score = await scorer(params);
|
|
50
|
-
let reasoning;
|
|
51
|
-
const results = [];
|
|
52
|
-
if (!Array.isArray(score) && typeof score === "object") {
|
|
53
|
-
for (const [key, value] of Object.entries(score)) {
|
|
54
|
-
const [keyScore, reasoning] = processScore(key, value);
|
|
55
|
-
results.push({ key, score: keyScore, comment: reasoning });
|
|
56
|
-
}
|
|
57
|
-
}
|
|
58
|
-
else {
|
|
59
|
-
if (Array.isArray(score)) {
|
|
60
|
-
reasoning = score[1];
|
|
61
|
-
score = score[0];
|
|
62
|
-
}
|
|
63
|
-
results.push({ key: feedbackKey, score, comment: reasoning });
|
|
64
|
-
}
|
|
65
|
-
if (results.length === 1) {
|
|
66
|
-
return results[0];
|
|
67
|
-
}
|
|
68
|
-
else {
|
|
69
|
-
return results;
|
|
70
|
-
}
|
|
71
|
-
};
|
|
72
|
-
if (isInTestContext()) {
|
|
73
|
-
const res = await wrapEvaluator(runScorer)(extra ?? {}, {
|
|
74
|
-
name: runName,
|
|
75
|
-
});
|
|
76
|
-
return res;
|
|
77
|
-
}
|
|
78
|
-
else {
|
|
79
|
-
const res = await runScorer(extra ?? {});
|
|
80
|
-
return res;
|
|
81
|
-
}
|
|
48
|
+
return baseRunEvaluator(runName, scorer, feedbackKey, extra, "agentevals");
|
|
82
49
|
};
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "agentevals",
|
|
3
|
-
"version": "0.0.1-rc.3",
|
|
3
|
+
"version": "0.0.2",
|
|
4
4
|
"packageManager": "yarn@3.5.1",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"scripts": {
|
|
@@ -16,8 +16,8 @@
|
|
|
16
16
|
"dependencies": {
|
|
17
17
|
"@langchain/openai": "^0.4.4",
|
|
18
18
|
"langchain": "^0.3.18",
|
|
19
|
-
"langsmith": "^0.3.
|
|
20
|
-
"openevals": "^0.0.
|
|
19
|
+
"langsmith": "^0.3.11",
|
|
20
|
+
"openevals": "^0.0.3"
|
|
21
21
|
},
|
|
22
22
|
"peerDependencies": {
|
|
23
23
|
"@langchain/core": "^0.3.40",
|