agentevals 0.0.1-rc.3 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -13,21 +13,9 @@ If you are looking for more general evaluation tools, please check out the compa
13
13
 
14
14
  To get started, install `agentevals`:
15
15
 
16
- <details>
17
- <summary>Python</summary>
18
-
19
- ```bash
20
- pip install agentevals
21
- ```
22
- </details>
23
-
24
- <details open>
25
- <summary>TypeScript</summary>
26
-
27
16
  ```bash
28
17
  npm install agentevals @langchain/core
29
18
  ```
30
- </details>
31
19
 
32
20
  This quickstart will use an evaluator powered by OpenAI's `o3-mini` model to judge your results, so you'll need to set your OpenAI API key as an environment variable:
33
21
 
@@ -37,54 +25,6 @@ export OPENAI_API_KEY="your_openai_api_key"
37
25
 
38
26
  Once you've done this, you can run your first trajectory evaluator. We represent the agent's trajectory as a list of OpenAI-style messages:
39
27
 
40
- <details>
41
- <summary>Python</summary>
42
-
43
- ```python
44
- from agentevals.trajectory.llm import create_trajectory_llm_as_judge, TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE
45
-
46
- trajectory_evaluator = create_trajectory_llm_as_judge(
47
- prompt=TRAJECTORY_ACCURACY_PROMPT,
48
- model="openai:o3-mini",
49
- )
50
-
51
- # This is a fake trajectory, in reality you would run your agent to get a real trajectory
52
- outputs = [
53
- {"role": "user", "content": "What is the weather in SF?"},
54
- {
55
- "role": "assistant",
56
- "tool_calls": [
57
- {
58
- "function": {
59
- "name": "get_weather",
60
- "arguments": json.dumps({"city": "SF"}),
61
- }
62
- }
63
- ],
64
- },
65
- {"role": "tool", "content": "It's 80 degrees and sunny in SF."},
66
- {"role": "assistant", "content": "The weather in SF is 80 degrees and sunny."},
67
- ]
68
-
69
- eval_result = trajectory_evaluator(
70
- outputs=outputs,
71
- )
72
-
73
- print(eval_result)
74
- ```
75
-
76
- ```
77
- {
78
- 'key': 'trajectory_accuracy',
79
- 'reasoning': 'The trajectory accurately follows the user's request for weather information in SF. Initially, the assistant recognizes the goal (providing weather details), then it efficiently makes a tool call to get the weather, and finally it communicates the result clearly. All steps demonstrate logical progression and efficiency. Thus, the score should be: true.',
80
- 'score': true
81
- }
82
- ```
83
- </details>
84
-
85
- <details open>
86
- <summary>TypeScript</summary>
87
-
88
28
  ```ts
89
29
  import {
90
30
  createTrajectoryLLMAsJudge,
@@ -131,7 +71,6 @@ console.log(evalResult);
131
71
  comment: '...'
132
72
  }
133
73
  ```
134
- </details>
135
74
 
136
75
  You can see that despite the small difference in the final response and tool calls, the evaluator still returns a score of `true` since the overall trajectory is the same between the output and reference!
137
76
 
@@ -156,42 +95,18 @@ You can see that despite the small difference in the final response and tool cal
156
95
 
157
96
  You can install `agentevals` like this:
158
97
 
159
- <details>
160
- <summary>Python</summary>
161
-
162
- ```bash
163
- pip install agentevals
164
- ```
165
- </details>
166
-
167
- <details open>
168
- <summary>TypeScript</summary>
169
-
170
98
  ```bash
171
99
  npm install agentevals @langchain/core
172
100
  ```
173
- </details>
174
101
 
175
102
  For LLM-as-judge evaluators, you will also need an LLM client. By default, `agentevals` will use [LangChain chat model integrations](https://python.langchain.com/docs/integrations/chat/) and comes with `langchain_openai` installed by default. However, if you prefer, you may use the OpenAI client directly:
176
103
 
177
- <details>
178
- <summary>Python</summary>
179
-
180
- ```bash
181
- pip install openai
182
- ```
183
- </details>
184
-
185
- <details open>
186
- <summary>TypeScript</summary>
187
-
188
104
  ```bash
189
105
  npm install openai
190
106
  ```
191
- </details>
192
107
 
193
108
  It is also helpful to be familiar with some [evaluation concepts](https://docs.smith.langchain.com/evaluation/concepts) and
194
- LangSmith's pytest integration for running evals, which is documented [here](https://docs.smith.langchain.com/evaluation/how_to_guides/pytest).
109
+ LangSmith's Vitest/Jest integration for running evals, which is documented [here](https://docs.smith.langchain.com/evaluation/how_to_guides/vitest_jest).
195
110
 
196
111
  ## Evaluators
197
112
 
@@ -207,64 +122,6 @@ The `trajectory_strict_match` evaluator, compares two trajectories and ensures t
207
122
  in the same order with the same tool calls. It allows for differences in message content and tool call arguments,
208
123
  but requires that the selected tools at each step are the same.
209
124
 
210
- <details>
211
- <summary>Python</summary>
212
-
213
- ```python
214
- import json
215
- from agentevals.trajectory.strict import trajectory_strict_match
216
-
217
- outputs = [
218
- {"role": "user", "content": "What is the weather in SF?"},
219
- {
220
- "role": "assistant",
221
- "tool_calls": [
222
- {
223
- "function": {
224
- "name": "get_weather",
225
- "arguments": json.dumps({"city": "SF"}),
226
- }
227
- }
228
- ],
229
- },
230
- {"role": "tool", "content": "It's 80 degrees and sunny in SF."},
231
- {"role": "assistant", "content": "The weather in SF is 80 degrees and sunny."},
232
- ]
233
- reference_outputs = [
234
- {"role": "user", "content": "What is the weather in San Francisco?"},
235
- {
236
- "role": "assistant",
237
- "tool_calls": [
238
- {
239
- "function": {
240
- "name": "get_weather",
241
- "arguments": json.dumps({"city": "San Francisco"}),
242
- }
243
- }
244
- ],
245
- },
246
- {"role": "tool", "content": "It's 80 degrees and sunny in San Francisco."},
247
- {"role": "assistant", "content": "The weather in SF is 80˚ and sunny."},
248
- ]
249
- result = trajectory_strict_match(
250
- outputs=outputs, reference_outputs=reference_outputs
251
- )
252
-
253
- print(result)
254
- ```
255
-
256
- ```
257
- {
258
- 'key': 'trajectory_accuracy',
259
- 'score': True,
260
- 'comment': None,
261
- }
262
- ```
263
- </details>
264
-
265
- <details open>
266
- <summary>TypeScript</summary>
267
-
268
125
  ```ts
269
126
  import { trajectoryStrictMatch } from "agentevals";
270
127
 
@@ -300,86 +157,11 @@ console.log(result);
300
157
  'score': true,
301
158
  }
302
159
  ```
303
- </details>
304
160
 
305
161
  #### Unordered match
306
162
 
307
163
  The `trajectory_unordered_match` evaluator, compares two trajectories and ensures that they contain the same number of tool calls in any order. This is useful if you want to allow flexibility in how an agent obtains the proper information, but still do care that all information was retrieved.
308
164
 
309
- <details>
310
- <summary>Python</summary>
311
-
312
- ```python
313
- import json
314
- from agentevals.trajectory.unordered import trajectory_unordered_match
315
-
316
- inputs = {}
317
- outputs = [
318
- {"role": "user", "content": "What is the weather in SF and is there anything fun happening?"},
319
- {
320
- "role": "assistant",
321
- "tool_calls": [{
322
- "function": {
323
- "name": "get_weather",
324
- "arguments": json.dumps({"city": "SF"}),
325
- }
326
- }],
327
- },
328
- {"role": "tool", "content": "It's 80 degrees and sunny in SF."},
329
- {
330
- "role": "assistant",
331
- "tool_calls": [{
332
- "function": {
333
- "name": "get_fun_activities",
334
- "arguments": json.dumps({"city": "SF"}),
335
- }
336
- }],
337
- },
338
- {"role": "tool", "content": "Nothing fun is happening, you should stay indoors and read!"},
339
- {"role": "assistant", "content": "The weather in SF is 80 degrees and sunny, but there is nothing fun happening."},
340
- ]
341
- reference_outputs = [
342
- {"role": "user", "content": "What is the weather in SF and is there anything fun happening?"},
343
- {
344
- "role": "assistant",
345
- "tool_calls": [
346
- {
347
- "function": {
348
- "name": "get_fun_activities",
349
- "arguments": json.dumps({"city": "San Francisco"}),
350
- }
351
- },
352
- {
353
- "function": {
354
- "name": "get_weather",
355
- "arguments": json.dumps({"city": "San Francisco"}),
356
- }
357
- },
358
- ],
359
- },
360
- {"role": "tool", "content": "Nothing fun is happening, you should stay indoors and read!"},
361
- {"role": "tool", "content": "It's 80 degrees and sunny in SF."},
362
- {"role": "assistant", "content": "In SF, it's 80˚ and sunny, but there is nothing fun happening."},
363
- ]
364
- result = trajectory_unordered_match(
365
- outputs=outputs, reference_outputs=reference_outputs
366
- )
367
-
368
- print(result)
369
- ```
370
-
371
- ```
372
- {
373
- 'key': 'trajectory_unordered_match',
374
- 'score': True,
375
- 'comment': None,
376
- }
377
- ```
378
- </details>
379
-
380
- <details open>
381
- <summary>TypeScript</summary>
382
-
383
165
  ```ts
384
166
  import { trajectoryUnorderedMatch } from "agentevals";
385
167
 
@@ -446,77 +228,11 @@ console.log(result)
446
228
  'score': true,
447
229
  }
448
230
  ```
449
- </details>
450
231
 
451
232
  #### Subset and superset match
452
233
 
453
234
  There are other evaluators for checking partial trajectory matches (ensuring that a trajectory contains a subset and superset of tool calls compared to a reference trajectory).
454
235
 
455
- <details>
456
- <summary>Python</summary>
457
-
458
- ```python
459
- import json
460
- from openevals.trajectory.subset import trajectory_subset
461
- # from openevals.trajectory.superset import trajectory_superset
462
-
463
- outputs = [
464
- {"role": "user", "content": "What is the weather in SF and London?"},
465
- {
466
- "role": "assistant",
467
- "tool_calls": [{
468
- "function": {
469
- "name": "get_weather",
470
- "arguments": json.dumps({"city": "SF and London"}),
471
- }
472
- }],
473
- },
474
- {"role": "tool", "content": "It's 80 degrees and sunny in SF, and 90 degrees and rainy in London."},
475
- {"role": "assistant", "content": "The weather in SF is 80 degrees and sunny. In London, it's 90 degrees and rainy."},
476
- ]
477
- reference_outputs = [
478
- {"role": "user", "content": "What is the weather in SF and London?"},
479
- {
480
- "role": "assistant",
481
- "tool_calls": [
482
- {
483
- "function": {
484
- "name": "get_weather",
485
- "arguments": json.dumps({"city": "San Francisco"}),
486
- }
487
- },
488
- {
489
- "function": {
490
- "name": "get_weather",
491
- "arguments": json.dumps({"city": "London"}),
492
- }
493
- },
494
- ],
495
- },
496
- {"role": "tool", "content": "It's 80 degrees and sunny in San Francisco."},
497
- {"role": "tool", "content": "It's 90 degrees and rainy in London."},
498
- {"role": "assistant", "content": "The weather in SF is 80˚ and sunny. In London, it's 90˚ and rainy."},
499
- ]
500
-
501
- result = trajectory_subset(
502
- outputs=outputs, reference_outputs=reference_outputs
503
- )
504
-
505
- print(result)
506
- ```
507
-
508
- ```
509
- {
510
- 'key': 'trajectory_subset',
511
- 'score': True,
512
- 'comment': None,
513
- }
514
- ```
515
- </details>
516
-
517
- <details open>
518
- <summary>TypeScript</summary>
519
-
520
236
  ```ts
521
237
  import { trajectorySubset } from "agentevals";
522
238
  // import { trajectorySuperset } from "agentevals";
@@ -574,7 +290,6 @@ console.log(result)
574
290
  'score': true,
575
291
  }
576
292
  ```
577
- </details>
578
293
 
579
294
  #### Trajectory LLM-as-judge
580
295
 
@@ -582,69 +297,6 @@ The LLM-as-judge trajectory evaluator that uses an LLM to evaluate the trajector
582
297
  and supports
583
298
  This allows for more flexibility in the trajectory comparison:
584
299
 
585
- <details>
586
- <summary>Python</summary>
587
-
588
- ```python
589
- import json
590
- from openevals.trajectory.llm import create_trajectory_llm_as_judge, TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE
591
-
592
- evaluator = create_trajectory_llm_as_judge(
593
- prompt=TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE,
594
- model="openai:o3-mini"
595
- )
596
- outputs = [
597
- {"role": "user", "content": "What is the weather in SF?"},
598
- {
599
- "role": "assistant",
600
- "tool_calls": [
601
- {
602
- "function": {
603
- "name": "get_weather",
604
- "arguments": json.dumps({"city": "SF"}),
605
- }
606
- }
607
- ],
608
- },
609
- {"role": "tool", "content": "It's 80 degrees and sunny in SF."},
610
- {"role": "assistant", "content": "The weather in SF is 80 degrees and sunny."},
611
- ]
612
- reference_outputs = [
613
- {"role": "user", "content": "What is the weather in SF?"},
614
- {
615
- "role": "assistant",
616
- "tool_calls": [
617
- {
618
- "function": {
619
- "name": "get_weather",
620
- "arguments": json.dumps({"city": "San Francisco"}),
621
- }
622
- }
623
- ],
624
- },
625
- {"role": "tool", "content": "It's 80 degrees and sunny in San Francisco."},
626
- {"role": "assistant", "content": "The weather in SF is 80˚ and sunny."},
627
- ]
628
- eval_result = evaluator(
629
- outputs=outputs,
630
- reference_outputs=reference_outputs,
631
- )
632
-
633
- print(eval_result)
634
- ```
635
-
636
- ```
637
- {
638
- 'key': 'trajectory_accuracy',
639
- 'score': True,
640
- 'comment': 'The provided agent trajectory is consistent with the reference. Both trajectories start with the same user query and then correctly invoke a weather lookup through a tool call. Although the reference uses "San Francisco" while the provided trajectory uses "SF" and there is a minor formatting difference (degrees vs. ˚), these differences do not affect the correctness or essential steps of the process. Thus, the score should be: true.'
641
- }
642
- ```
643
- </details>
644
-
645
- <details open>
646
- <summary>TypeScript</summary>
647
-
648
300
  ```ts
649
301
  import {
650
302
  createTrajectoryLLMAsJudge,
@@ -704,7 +356,6 @@ console.log(result)
704
356
  'comment': 'The provided agent trajectory is consistent with the reference. Both trajectories start with the same user query and then correctly invoke a weather lookup through a tool call. Although the reference uses "San Francisco" while the provided trajectory uses "SF" and there is a minor formatting difference (degrees vs. ˚), these differences do not affect the correctness or essential steps of the process. Thus, the score should be: true.'
705
357
  }
706
358
  ```
707
- </details>
708
359
 
709
360
  `create_trajectory_llm_as_judge` takes the same parameters as [`create_llm_as_judge`](https://github.com/langchain-ai/openevals?tab=readme-ov-file#llm-as-judge) in `openevals`, so you can customize the prompt and scoring output as needed.
710
361
 
@@ -715,25 +366,6 @@ In addition to `prompt` and `model`, the following parameters are also available
715
366
  - `system`: a string that sets a system prompt for the judge model by adding a system message before other parts of the prompt.
716
367
  - `few_shot_examples`: a list of example dicts that are appended to the end of the prompt. This is useful for providing the judge model with examples of good and bad outputs. The required structure looks like this:
717
368
 
718
- <details>
719
- <summary>Python</summary>
720
-
721
- ```python
722
- few_shot_examples = [
723
- {
724
- "inputs": "What color is the sky?",
725
- "outputs": "The sky is red.",
726
- "reasoning": "The sky is red because it is early evening.",
727
- "score": 1,
728
- }
729
- ]
730
- ```
731
-
732
- </details>
733
-
734
- <details open>
735
- <summary>TypeScript</summary>
736
-
737
369
  ```ts
738
370
  const fewShotExamples = [
739
371
  {
@@ -744,7 +376,6 @@ const fewShotExamples = [
744
376
  }
745
377
  ];
746
378
  ```
747
- </details>
748
379
 
749
380
  See the [`openevals`](https://github.com/langchain-ai/openevals?tab=readme-ov-file#llm-as-judge) repo for a fully up to date list of parameters.
750
381
 
@@ -754,70 +385,78 @@ For frameworks like [LangGraph](https://github.com/langchain-ai/langgraph) that
754
385
 
755
386
  The below examples will use LangGraph with the built-in formatting utility, but graph evaluators accept input in the following general format:
756
387
 
757
- ```python
758
- class GraphTrajectory(TypedDict):
759
- # Only set when specifying reference_outputs
760
- inputs: Optional[list[dict]]
761
- results: list[dict]
762
- steps: list[list[str]]
763
-
764
- def evaluator(
765
- *,
766
- inputs: Optional[Union[dict, list]] = None,
767
- outputs: GraphTrajectory,
768
- reference_outputs: Optional[GraphTrajectory] = None,
769
- ) -> ...
388
+ ```ts
389
+ export type GraphTrajectory = {
390
+ inputs?: (Record<string, unknown> | null)[];
391
+ results: Record<string, unknown>[];
392
+ steps: string[][];
393
+ };
394
+
395
+ const evaluator: ({ inputs, outputs, referenceOutputs, ...extra }: {
396
+ inputs: (string | Record<string, unknown> | null)[] | {
397
+ inputs: (string | Record<string, unknown> | null)[];
398
+ };
399
+ outputs: GraphTrajectory;
400
+ referenceOutputs?: GraphTrajectory;
401
+ [key: string]: unknown;
402
+ }) => ...
770
403
  ```
771
404
 
772
405
  Where `inputs` is a list of inputs (or a dict with a key named `"inputs"`) to the graph whose items each represent the start of a new invocation in a thread, `results` representing the final output from each turn in the thread, and `steps` representing the internal steps taken for each turn.
773
406
 
774
407
  #### Graph trajectory LLM-as-judge
775
408
 
776
- This evaluator is similar to the `trajectory_llm_as_judge` evaluator, but it works with graph trajectories instead of message trajectories. Below, we set up a LangGraph agent, extract a trajectory from it using the built-in utils, and pass it to the evaluator:
777
-
778
- ```python
779
- from agentevals.graph_trajectory.utils import (
780
- extract_langgraph_trajectory_from_thread,
781
- )
782
- from agentevals.graph_trajectory.llm import create_graph_trajectory_llm_as_judge
783
-
784
- from langgraph.prebuilt import create_react_agent
785
- from langgraph.checkpoint.memory import MemorySaver
786
- from langgraph.types import Command, interrupt
787
-
788
- from langchain_core.tools import tool
789
-
790
- @tool
791
- def search(query: str):
792
- """Call to surf the web."""
793
- user_answer = interrupt("Tell me the answer to the question.")
794
- return user_answer
409
+ This evaluator is similar to the `trajectory_llm_as_judge` evaluator, but it works with graph trajectories instead of message trajectories. Below, we set up a LangGraph agent, extract a trajectory from it using the built-in utils, and pass it to the evaluator. First, let's setup our graph, call it, and then extract the trajectory:
795
410
 
796
- tools = [search]
411
+ ```ts
412
+ import { tool } from "@langchain/core/tools";
413
+ import { ChatOpenAI } from "@langchain/openai";
414
+ import { createReactAgent } from "@langchain/langgraph/prebuilt";
415
+ import { MemorySaver, interrupt } from "@langchain/langgraph";
416
+ import { z } from "zod";
417
+ import { extractLangGraphTrajectoryFromThread } from "agentevals";
418
+
419
+ const search = tool((_): string => {
420
+ const userAnswer = interrupt("Tell me the answer to the question.")
421
+ return userAnswer;
422
+ }, {
423
+ name: "search",
424
+ description: "Call to surf the web.",
425
+ schema: z.object({
426
+ query: z.string()
427
+ })
428
+ })
429
+
430
+ const tools = [search];
431
+
432
+ // Create a checkpointer
433
+ const checkpointer = new MemorySaver();
434
+
435
+ // Create the React agent
436
+ const graph = createReactAgent({
437
+ llm: new ChatOpenAI({ model: "gpt-4o-mini" }),
438
+ tools,
439
+ checkpointer,
440
+ });
797
441
 
798
- checkpointer = MemorySaver()
799
- graph = create_react_agent(
800
- model="gpt-4o-mini",
801
- checkpointer=checkpointer,
802
- tools=[search],
803
- )
442
+ // Invoke the graph with initial message
443
+ await graph.invoke(
444
+ { messages: [{ role: "user", content: "what's the weather in sf?" }] },
445
+ { configurable: { thread_id: "1" } }
446
+ );
804
447
 
805
- graph.invoke(
806
- {"messages": [{"role": "user", "content": "what's the weather in sf?"}]},
807
- config={"configurable": {"thread_id": "1"}},
808
- )
809
- # Resume the agent with a new command, simulating a human-in-the-loop workflow
810
- graph.invoke(
811
- Command(resume="It is rainy and 70 degrees!"),
812
- config={"configurable": {"thread_id": "1"}},
813
- )
448
+ // Resume the agent with a new command (simulating human-in-the-loop)
449
+ await graph.invoke(
450
+ { messages: [{ role: "user", content: "It is rainy and 70 degrees!" }] },
451
+ { configurable: { thread_id: "1" } }
452
+ );
814
453
 
815
- # Extract the trajectory from the first two thread runs
816
- extracted_trajectory = extract_langgraph_trajectory_from_thread(
817
- graph, {"configurable": {"thread_id": "1"}}
818
- )
454
+ const extractedTrajectory = await extractLangGraphTrajectoryFromThread(
455
+ graph,
456
+ { configurable: { thread_id: "1" } },
457
+ );
819
458
 
820
- print(extracted_trajectory)
459
+ console.log(extractedTrajectory);
821
460
  ```
822
461
 
823
462
  ```
@@ -850,17 +489,21 @@ print(extracted_trajectory)
850
489
  }
851
490
  ```
852
491
 
853
- ```python
854
- graph_trajectory_evaluator = create_graph_trajectory_llm_as_judge(
855
- model="openai:o3-mini",
856
- )
492
+ Now, we can pass the extracted trajectory to the evaluator:
493
+
494
+ ```ts
495
+ import { createGraphTrajectoryLLMAsJudge } from "agentevals";
496
+
497
+ const graphTrajectoryEvaluator = createGraphTrajectoryLLMAsJudge({
498
+ model: "openai:o3-mini",
499
+ })
857
500
 
858
- res = graph_trajectory_evaluator(
859
- inputs=extracted_trajectory["inputs"],
860
- outputs=extracted_trajectory["outputs"],
501
+ const res = await graphTrajectoryEvaluator({
502
+ inputs: extractedTrajectory.inputs,
503
+ outputs: extractedTrajectory.outputs }
861
504
  )
862
505
 
863
- print(res)
506
+ console.log(res);
864
507
  ```
865
508
 
866
509
  ```
@@ -871,10 +514,10 @@ print(res)
871
514
  }
872
515
  ```
873
516
 
874
- Note that though this evaluator takes the typical `inputs`, `outputs`, and `reference_outputs` parameters, it internally combines `inputs` and `outputs` to form a `thread`. Therefore, if you want to customize the prompt, your prompt should also contain a `thread` input variable:
517
+ Note that though this evaluator takes the typical `inputs`, `outputs`, and `referenceOutputs` parameters, it internally combines `inputs` and `outputs` to form a `thread`. Therefore, if you want to customize the prompt, your prompt should also contain a `thread` input variable:
875
518
 
876
- ```python
877
- CUSTOM_PROMPT = """You are an expert data labeler.
519
+ ```ts
520
+ const CUSTOM_PROMPT = `You are an expert data labeler.
878
521
  Your task is to grade the accuracy of an AI agent's internal steps in resolving a user queries.
879
522
 
880
523
  <Rubric>
@@ -896,82 +539,85 @@ Your task is to grade the accuracy of an AI agent's internal steps in resolving
896
539
  </thread>
897
540
 
898
541
  {reference_outputs}
899
- """
542
+ `
900
543
 
901
- evaluator = create_graph_trajectory_llm_as_judge(
902
- prompt=CUSTOM_PROMPT,
903
- model="openai:o3-mini",
904
- )
905
- res = await evaluator(
906
- inputs=extracted_trajectory["inputs"],
907
- outputs=extracted_trajectory["outputs"],
908
-
544
+ const graphTrajectoryEvaluator = createGraphTrajectoryLLMAsJudge({
545
+ prompt: CUSTOM_PROMPT,
546
+ model: "openai:o3-mini",
547
+ })
548
+ const res = await graphTrajectoryEvaluator({
549
+ inputs: extractedTrajectory.inputs,
550
+ outputs: extractedTrajectory.outputs }
909
551
  )
910
552
  ```
911
553
 
912
- In order to format them properly into the prompt, `reference_outputs` should be passed in as a `GraphTrajectory` object like `outputs`.
554
+ In order to format them properly into the prompt, `referenceOutputs` should be passed in as a `GraphTrajectory` object like `outputs`.
913
555
 
914
556
  Also note that like other LLM-as-judge evaluators, you can pass extra kwargs into the evaluator to format them into the prompt.
915
557
 
916
558
  #### Graph trajectory strict match
917
559
 
918
- The `graph_trajectory_strict_match` evaluator is a simple evaluator that checks if the steps in the provided graph trajectory match the reference trajectory exactly.
560
+ The `graphTrajectoryStrictMatch` evaluator is a simple evaluator that checks if the steps in the provided graph trajectory match the reference trajectory exactly.
919
561
 
920
- ```python
921
- from agentevals.graph_trajectory.utils import (
922
- extract_langgraph_trajectory_from_thread,
923
- )
924
- from agentevals.graph_trajectory.strict import graph_trajectory_strict_match
925
-
926
-
927
- from langgraph.prebuilt import create_react_agent
928
- from langgraph.checkpoint.memory import MemorySaver
929
- from langgraph.types import Command, interrupt
930
-
931
- from langchain_core.tools import tool
932
-
933
- @tool
934
- def search(query: str):
935
- """Call to surf the web."""
936
- user_answer = interrupt("Tell me the answer to the question.")
937
- return user_answer
938
-
939
- tools = [search]
562
+ ```ts
563
+ import { tool } from "@langchain/core/tools";
564
+ import { ChatOpenAI } from "@langchain/openai";
565
+ import { createReactAgent } from "@langchain/langgraph/prebuilt";
566
+ import { MemorySaver, interrupt } from "@langchain/langgraph";
567
+ import { z } from "zod";
568
+ import { extractLangGraphTrajectoryFromThread, graphTrajectoryStrictMatch } from "agentevals";
569
+
570
+ const search = tool((_): string => {
571
+ const userAnswer = interrupt("Tell me the answer to the question.")
572
+ return userAnswer;
573
+ }, {
574
+ name: "search",
575
+ description: "Call to surf the web.",
576
+ schema: z.object({
577
+ query: z.string()
578
+ })
579
+ })
580
+
581
+ const tools = [search];
582
+
583
+ // Create a checkpointer
584
+ const checkpointer = new MemorySaver();
585
+
586
+ // Create the React agent
587
+ const graph = createReactAgent({
588
+ llm: new ChatOpenAI({ model: "gpt-4o-mini" }),
589
+ tools,
590
+ checkpointer,
591
+ });
940
592
 
941
- checkpointer = MemorySaver()
942
- graph = create_react_agent(
943
- model="gpt-4o-mini",
944
- checkpointer=checkpointer,
945
- tools=[search],
946
- )
593
+ // Invoke the graph with initial message
594
+ await graph.invoke(
595
+ { messages: [{ role: "user", content: "what's the weather in sf?" }] },
596
+ { configurable: { thread_id: "1" } }
597
+ );
947
598
 
948
- graph.invoke(
949
- {"messages": [{"role": "user", "content": "what's the weather in sf?"}]},
950
- config={"configurable": {"thread_id": "1"}},
951
- )
952
- # Resume the agent with a new command, simulating a human-in-the-loop workflow
953
- graph.invoke(
954
- Command(resume="It is rainy and 70 degrees!"),
955
- config={"configurable": {"thread_id": "1"}},
956
- )
599
+ // Resume the agent with a new command (simulating human-in-the-loop)
600
+ await graph.invoke(
601
+ { messages: [{ role: "user", content: "It is rainy and 70 degrees!" }] },
602
+ { configurable: { thread_id: "1" } }
603
+ );
957
604
 
958
- # Extract the trajectory from the first two thread runs
959
- extracted_trajectory = extract_langgraph_trajectory_from_thread(
960
- graph, {"configurable": {"thread_id": "1"}}
961
- )
605
+ const extractedTrajectory = await extractLangGraphTrajectoryFromThread(
606
+ graph,
607
+ { configurable: { thread_id: "1" } },
608
+ );
962
609
 
963
- reference_trajectory = {
964
- # not used for strict match
965
- "results": [],
966
- "steps": [["__start__", "agent", "tools", "__interrupt__"], ["agent"]],
610
+ const referenceTrajectory = {
611
+ results: [],
612
+ steps: [["__start__", "agent", "tools", "__interrupt__"], ["agent"]],
967
613
  }
968
614
 
969
- res = graph_trajectory_strict_match(
970
- outputs=extracted_trajectory["outputs"],
971
- reference_outputs=reference_trajectory,
972
- )
615
+ const result = await graphTrajectoryStrictMatch({
616
+ outputs: trajectory.outputs,
617
+ referenceOutputs: referenceOutputs!,
618
+ });
973
619
 
974
- print(res)
620
+ console.log(result);
975
621
  ```
976
622
 
977
623
  ```
@@ -980,37 +626,6 @@ print(res)
980
626
  'score': True,
981
627
  }
982
628
  ```
983
-
984
- ## Python Async Support
985
-
986
- All `agentevals` evaluators support Python [asyncio](https://docs.python.org/3/library/asyncio.html). As a convention, evaluators that use a factory function will have `async` put immediately after `create_` in the function name (for example, `create_async_trajectory_llm_as_judge`), and evaluators used directly will end in `async` (e.g. `trajectory_strict_match_async`).
987
-
988
- Here's an example of how to use the `create_async_llm_as_judge` evaluator asynchronously:
989
-
990
- ```python
991
- from agentevals.trajectory.llm import create_async_trajectory_llm_as_judge
992
-
993
- evaluator = create_async_llm_as_judge(
994
- prompt="What is the weather in {inputs}?",
995
- )
996
-
997
- result = await evaluator(inputs="San Francisco")
998
- ```
999
-
1000
- If you are using the OpenAI client directly, remember to pass in `AsyncOpenAI` as the `judge` parameter:
1001
-
1002
- ```python
1003
- from openai import AsyncOpenAI
1004
-
1005
- evaluator = create_async_llm_as_judge(
1006
- prompt="What is the weather in {inputs}?",
1007
- judge=AsyncOpenAI(),
1008
- model="o3-mini",
1009
- )
1010
-
1011
- result = await evaluator(inputs="San Francisco")
1012
- ```
1013
-
1014
629
  ## LangSmith Integration
1015
630
 
1016
631
  For tracking experiments over time, you can log evaluator results to [LangSmith](https://smith.langchain.com/), a platform for building production-grade LLM applications that includes tracing, evaluation, and experimentation tools.
@@ -1019,7 +634,7 @@ LangSmith currently offers two ways to run evals. We'll give a quick example of
1019
634
 
1020
635
  ### Pytest or Vitest/Jest
1021
636
 
1022
- First, follow [these instructions](https://docs.smith.langchain.com/evaluation/how_to_guides/pytest) to set up LangSmith's pytest runner,
637
+ First, follow [these instructions](https://docs.smith.langchain.com/evaluation/how_to_guides/vitest_jest) to set up LangSmith's Vitest/Jest runner,
1023
638
  setting appropriate environment variables:
1024
639
 
1025
640
  ```bash
@@ -1027,80 +642,6 @@ export LANGSMITH_API_KEY="your_langsmith_api_key"
1027
642
  export LANGSMITH_TRACING="true"
1028
643
  ```
1029
644
 
1030
- <details>
1031
- <summary>Python</summary>
1032
-
1033
- Then, set up a file named `test_trajectory.py` with the following contents:
1034
-
1035
- ```python
1036
- import pytest
1037
- import json
1038
-
1039
- from langsmith import testing as t
1040
-
1041
- from agentevals.trajectory.llm import create_trajectory_llm_as_judge
1042
-
1043
- trajectory_evaluator = create_trajectory_llm_as_judge(
1044
- model="openai:o3-mini",
1045
- )
1046
-
1047
- @pytest.mark.langsmith
1048
- def test_trajectory_accuracy():
1049
- outputs = [
1050
- {"role": "user", "content": "What is the weather in SF?"},
1051
- {
1052
- "role": "assistant",
1053
- "tool_calls": [
1054
- {
1055
- "function": {
1056
- "name": "get_weather",
1057
- "arguments": json.dumps({"city": "SF"}),
1058
- }
1059
- }
1060
- ],
1061
- },
1062
- {"role": "tool", "content": "It's 80 degrees and sunny in SF."},
1063
- {"role": "assistant", "content": "The weather in SF is 80 degrees and sunny."},
1064
- ]
1065
- reference_outputs = [
1066
- {"role": "user", "content": "What is the weather in SF?"},
1067
- {
1068
- "role": "assistant",
1069
- "tool_calls": [
1070
- {
1071
- "function": {
1072
- "name": "get_weather",
1073
- "arguments": json.dumps({"city": "San Francisco"}),
1074
- }
1075
- }
1076
- ],
1077
- },
1078
- {"role": "tool", "content": "It's 80 degrees and sunny in San Francisco."},
1079
- {"role": "assistant", "content": "The weather in SF is 80˚ and sunny."},
1080
- ]
1081
-
1082
- t.log_inputs({})
1083
- t.log_outputs({"messages": outputs})
1084
- t.log_reference_outputs({"messages": reference_outputs})
1085
-
1086
- trajectory_evaluator(
1087
- outputs=outputs,
1088
- reference_outputs=reference_outputs
1089
- )
1090
- ```
1091
-
1092
- Note that when creating the evaluator, we've added a `feedback_key` parameter. This will be used to name the feedback in LangSmith.
1093
-
1094
- Now, run the eval with pytest:
1095
-
1096
- ```bash
1097
- pytest test_trajectory.py --langsmith-output
1098
- ```
1099
-
1100
- </details>
1101
-
1102
- <details open>
1103
- <summary>TypeScript</summary>
1104
645
 
1105
646
  Then, set up a file named `test_trajectory.eval.ts` with the following contents:
1106
647
 
@@ -1176,7 +717,6 @@ Now, run the eval with your runner of choice:
1176
717
  vitest run test_trajectory.eval.ts
1177
718
  ```
1178
719
 
1179
- </details>
1180
720
 
1181
721
  Feedback from the prebuilt evaluator will be automatically logged in LangSmith as a table of results like this in your terminal:
1182
722
 
@@ -1190,51 +730,38 @@ And you should also see the results in the experiment view in LangSmith:
1190
730
 
1191
731
  Alternatively, you can [create a dataset in LangSmith](https://docs.smith.langchain.com/evaluation/concepts#dataset-curation) and use your created evaluators with LangSmith's [`evaluate`](https://docs.smith.langchain.com/evaluation#8-run-and-view-results) function:
1192
732
 
1193
- <details>
1194
- <summary>Python</summary>
1195
-
1196
- ```python
1197
- from langsmith import Client
1198
- from agentevals.trajectory.llm import create_trajectory_llm_as_judge
1199
-
1200
- client = Client()
1201
-
1202
- trajectory_evaluator = create_trajectory_llm_as_judge(
1203
- model="openai:o3-mini",
1204
- )
1205
-
1206
- experiment_results = client.evaluate(
1207
- # This is a dummy target function, replace with your actual LLM-based system
1208
- lambda inputs: "What color is the sky?",
1209
- data="Sample dataset",
1210
- evaluators=[
1211
- trajectory_evaluator
1212
- ]
1213
- )
1214
- ```
1215
-
1216
- </details>
1217
-
1218
- <details open>
1219
- <summary>TypeScript</summary>
1220
-
1221
733
  ```ts
1222
734
  import { evaluate } from "langsmith/evaluation";
1223
- import { createTrajectoryLLMAsJudge, TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE } from "agentevals";
735
+ import { createTrajectoryLLMAsJudge, TRAJECTORY_ACCURACY_PROMPT } from "agentevals";
1224
736
 
1225
737
  const trajectoryEvaluator = createTrajectoryLLMAsJudge({
1226
738
  model: "openai:o3-mini",
739
+ prompt: TRAJECTORY_ACCURACY_PROMPT
1227
740
  });
1228
741
 
1229
742
  await evaluate(
1230
- (inputs) => "What color is the sky?",
743
+ (inputs) => [
744
+ {role: "user", content: "What is the weather in SF?"},
745
+ {
746
+ role: "assistant",
747
+ tool_calls: [
748
+ {
749
+ function: {
750
+ name: "get_weather",
751
+ arguments: json.dumps({"city": "SF"}),
752
+ }
753
+ }
754
+ ],
755
+ },
756
+ {role: "tool", content: "It's 80 degrees and sunny in SF."},
757
+ {role: "assistant", content: "The weather in SF is 80 degrees and sunny."},
758
+ ],
1231
759
  {
1232
760
  data: datasetName,
1233
761
  evaluators: [trajectoryEvaluator],
1234
762
  }
1235
763
  );
1236
764
  ```
1237
- </details>
1238
765
 
1239
766
  ## Thank you!
1240
767
 
package/dist/utils.cjs CHANGED
@@ -3,7 +3,7 @@ Object.defineProperty(exports, "__esModule", { value: true });
3
3
  exports._runEvaluator = exports.processScore = exports._normalizeToOpenAIMessagesList = exports._convertToOpenAIMessage = void 0;
4
4
  const messages_1 = require("@langchain/core/messages");
5
5
  const openai_1 = require("@langchain/openai");
6
- const jestlike_1 = require("langsmith/utils/jestlike");
6
+ const utils_1 = require("openevals/utils");
7
7
  const _convertToOpenAIMessage = (message) => {
8
8
  if ((0, messages_1.isBaseMessage)(message)) {
9
9
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
@@ -51,39 +51,6 @@ const processScore = (_, value) => {
51
51
  };
52
52
  exports.processScore = processScore;
53
53
  const _runEvaluator = async (runName, scorer, feedbackKey, extra) => {
54
- const runScorer = async (params) => {
55
- let score = await scorer(params);
56
- let reasoning;
57
- const results = [];
58
- if (!Array.isArray(score) && typeof score === "object") {
59
- for (const [key, value] of Object.entries(score)) {
60
- const [keyScore, reasoning] = (0, exports.processScore)(key, value);
61
- results.push({ key, score: keyScore, comment: reasoning });
62
- }
63
- }
64
- else {
65
- if (Array.isArray(score)) {
66
- reasoning = score[1];
67
- score = score[0];
68
- }
69
- results.push({ key: feedbackKey, score, comment: reasoning });
70
- }
71
- if (results.length === 1) {
72
- return results[0];
73
- }
74
- else {
75
- return results;
76
- }
77
- };
78
- if ((0, jestlike_1.isInTestContext)()) {
79
- const res = await (0, jestlike_1.wrapEvaluator)(runScorer)(extra ?? {}, {
80
- name: runName,
81
- });
82
- return res;
83
- }
84
- else {
85
- const res = await runScorer(extra ?? {});
86
- return res;
87
- }
54
+ return (0, utils_1._runEvaluator)(runName, scorer, feedbackKey, extra, "agentevals");
88
55
  };
89
56
  exports._runEvaluator = _runEvaluator;
package/dist/utils.d.ts CHANGED
@@ -1,5 +1,5 @@
1
1
  import { BaseMessage } from "@langchain/core/messages";
2
- import { SimpleEvaluationResult } from "langsmith/utils/jestlike";
2
+ import { EvaluationResultType } from "openevals/utils";
3
3
  import { ChatCompletionMessage, MultiResultScorerReturnType, SingleResultScorerReturnType } from "./types.js";
4
4
  export declare const _convertToOpenAIMessage: (message: BaseMessage | ChatCompletionMessage) => ChatCompletionMessage;
5
5
  export declare const _normalizeToOpenAIMessagesList: (messages?: (BaseMessage | ChatCompletionMessage)[] | {
@@ -9,5 +9,4 @@ export declare const processScore: (_: string, value: boolean | number | {
9
9
  score: boolean | number;
10
10
  reasoning?: string;
11
11
  }) => readonly [number | boolean, string | undefined] | readonly [number | boolean];
12
- export type EvaluationResultType<O> = O extends MultiResultScorerReturnType ? SimpleEvaluationResult[] : SimpleEvaluationResult;
13
- export declare const _runEvaluator: <T extends Record<string, unknown>, O extends MultiResultScorerReturnType | SingleResultScorerReturnType | Promise<MultiResultScorerReturnType | SingleResultScorerReturnType>>(runName: string, scorer: (params: T) => O, feedbackKey: string, extra?: T | undefined) => Promise<EvaluationResultType<O>>;
12
+ export declare const _runEvaluator: <T extends Record<string, unknown>, O extends SingleResultScorerReturnType | MultiResultScorerReturnType | Promise<SingleResultScorerReturnType | MultiResultScorerReturnType>>(runName: string, scorer: (params: T) => O, feedbackKey: string, extra?: T | undefined) => Promise<EvaluationResultType<O>>;
package/dist/utils.js CHANGED
@@ -1,6 +1,6 @@
1
1
  import { isBaseMessage } from "@langchain/core/messages";
2
2
  import { _convertMessagesToOpenAIParams } from "@langchain/openai";
3
- import { wrapEvaluator, isInTestContext, } from "langsmith/utils/jestlike";
3
+ import { _runEvaluator as baseRunEvaluator, } from "openevals/utils";
4
4
  export const _convertToOpenAIMessage = (message) => {
5
5
  if (isBaseMessage(message)) {
6
6
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
@@ -45,38 +45,5 @@ export const processScore = (_, value) => {
45
45
  return [value];
46
46
  };
47
47
  export const _runEvaluator = async (runName, scorer, feedbackKey, extra) => {
48
- const runScorer = async (params) => {
49
- let score = await scorer(params);
50
- let reasoning;
51
- const results = [];
52
- if (!Array.isArray(score) && typeof score === "object") {
53
- for (const [key, value] of Object.entries(score)) {
54
- const [keyScore, reasoning] = processScore(key, value);
55
- results.push({ key, score: keyScore, comment: reasoning });
56
- }
57
- }
58
- else {
59
- if (Array.isArray(score)) {
60
- reasoning = score[1];
61
- score = score[0];
62
- }
63
- results.push({ key: feedbackKey, score, comment: reasoning });
64
- }
65
- if (results.length === 1) {
66
- return results[0];
67
- }
68
- else {
69
- return results;
70
- }
71
- };
72
- if (isInTestContext()) {
73
- const res = await wrapEvaluator(runScorer)(extra ?? {}, {
74
- name: runName,
75
- });
76
- return res;
77
- }
78
- else {
79
- const res = await runScorer(extra ?? {});
80
- return res;
81
- }
48
+ return baseRunEvaluator(runName, scorer, feedbackKey, extra, "agentevals");
82
49
  };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "agentevals",
3
- "version": "0.0.1-rc.3",
3
+ "version": "0.0.2",
4
4
  "packageManager": "yarn@3.5.1",
5
5
  "type": "module",
6
6
  "scripts": {
@@ -16,8 +16,8 @@
16
16
  "dependencies": {
17
17
  "@langchain/openai": "^0.4.4",
18
18
  "langchain": "^0.3.18",
19
- "langsmith": "^0.3.10",
20
- "openevals": "^0.0.1-rc.8"
19
+ "langsmith": "^0.3.11",
20
+ "openevals": "^0.0.3"
21
21
  },
22
22
  "peerDependencies": {
23
23
  "@langchain/core": "^0.3.40",