agentevals 0.0.1-rc.3 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -13,21 +13,9 @@ If you are looking for more general evaluation tools, please check out the compa
13
13
 
14
14
  To get started, install `agentevals`:
15
15
 
16
- <details>
17
- <summary>Python</summary>
18
-
19
- ```bash
20
- pip install agentevals
21
- ```
22
- </details>
23
-
24
- <details open>
25
- <summary>TypeScript</summary>
26
-
27
16
  ```bash
28
17
  npm install agentevals @langchain/core
29
18
  ```
30
- </details>
31
19
 
32
20
  This quickstart will use an evaluator powered by OpenAI's `o3-mini` model to judge your results, so you'll need to set your OpenAI API key as an environment variable:
33
21
 
@@ -37,54 +25,6 @@ export OPENAI_API_KEY="your_openai_api_key"
37
25
 
38
26
  Once you've done this, you can run your first trajectory evaluator. We represent the agent's trajectory as a list of OpenAI-style messages:
39
27
 
40
- <details>
41
- <summary>Python</summary>
42
-
43
- ```python
44
- from agentevals.trajectory.llm import create_trajectory_llm_as_judge, TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE
45
-
46
- trajectory_evaluator = create_trajectory_llm_as_judge(
47
- prompt=TRAJECTORY_ACCURACY_PROMPT,
48
- model="openai:o3-mini",
49
- )
50
-
51
- # This is a fake trajectory, in reality you would run your agent to get a real trajectory
52
- outputs = [
53
- {"role": "user", "content": "What is the weather in SF?"},
54
- {
55
- "role": "assistant",
56
- "tool_calls": [
57
- {
58
- "function": {
59
- "name": "get_weather",
60
- "arguments": json.dumps({"city": "SF"}),
61
- }
62
- }
63
- ],
64
- },
65
- {"role": "tool", "content": "It's 80 degrees and sunny in SF."},
66
- {"role": "assistant", "content": "The weather in SF is 80 degrees and sunny."},
67
- ]
68
-
69
- eval_result = trajectory_evaluator(
70
- outputs=outputs,
71
- )
72
-
73
- print(eval_result)
74
- ```
75
-
76
- ```
77
- {
78
- 'key': 'trajectory_accuracy',
79
- 'reasoning': 'The trajectory accurately follows the user's request for weather information in SF. Initially, the assistant recognizes the goal (providing weather details), then it efficiently makes a tool call to get the weather, and finally it communicates the result clearly. All steps demonstrate logical progression and efficiency. Thus, the score should be: true.',
80
- 'score': true
81
- }
82
- ```
83
- </details>
84
-
85
- <details open>
86
- <summary>TypeScript</summary>
87
-
88
28
  ```ts
89
29
  import {
90
30
  createTrajectoryLLMAsJudge,
@@ -131,7 +71,6 @@ console.log(evalResult);
131
71
  comment: '...'
132
72
  }
133
73
  ```
134
- </details>
135
74
 
136
75
  You can see that despite the small difference in the final response and tool calls, the evaluator still returns a score of `true` since the overall trajectory is the same between the output and reference!
137
76
 
@@ -156,42 +95,18 @@ You can see that despite the small difference in the final response and tool cal
156
95
 
157
96
  You can install `agentevals` like this:
158
97
 
159
- <details>
160
- <summary>Python</summary>
161
-
162
- ```bash
163
- pip install agentevals
164
- ```
165
- </details>
166
-
167
- <details open>
168
- <summary>TypeScript</summary>
169
-
170
98
  ```bash
171
99
  npm install agentevals @langchain/core
172
100
  ```
173
- </details>
174
101
 
175
102
  For LLM-as-judge evaluators, you will also need an LLM client. By default, `agentevals` will use [LangChain chat model integrations](https://python.langchain.com/docs/integrations/chat/) and comes with `langchain_openai` installed by default. However, if you prefer, you may use the OpenAI client directly:
176
103
 
177
- <details>
178
- <summary>Python</summary>
179
-
180
- ```bash
181
- pip install openai
182
- ```
183
- </details>
184
-
185
- <details open>
186
- <summary>TypeScript</summary>
187
-
188
104
  ```bash
189
105
  npm install openai
190
106
  ```
191
- </details>
192
107
 
193
108
  It is also helpful to be familiar with some [evaluation concepts](https://docs.smith.langchain.com/evaluation/concepts) and
194
- LangSmith's pytest integration for running evals, which is documented [here](https://docs.smith.langchain.com/evaluation/how_to_guides/pytest).
109
+ LangSmith's Vitest/Jest integration for running evals, which is documented [here](https://docs.smith.langchain.com/evaluation/how_to_guides/vitest_jest).
195
110
 
196
111
  ## Evaluators
197
112
 
@@ -207,64 +122,6 @@ The `trajectory_strict_match` evaluator, compares two trajectories and ensures t
207
122
  in the same order with the same tool calls. It allows for differences in message content and tool call arguments,
208
123
  but requires that the selected tools at each step are the same.
209
124
 
210
- <details>
211
- <summary>Python</summary>
212
-
213
- ```python
214
- import json
215
- from agentevals.trajectory.strict import trajectory_strict_match
216
-
217
- outputs = [
218
- {"role": "user", "content": "What is the weather in SF?"},
219
- {
220
- "role": "assistant",
221
- "tool_calls": [
222
- {
223
- "function": {
224
- "name": "get_weather",
225
- "arguments": json.dumps({"city": "SF"}),
226
- }
227
- }
228
- ],
229
- },
230
- {"role": "tool", "content": "It's 80 degrees and sunny in SF."},
231
- {"role": "assistant", "content": "The weather in SF is 80 degrees and sunny."},
232
- ]
233
- reference_outputs = [
234
- {"role": "user", "content": "What is the weather in San Francisco?"},
235
- {
236
- "role": "assistant",
237
- "tool_calls": [
238
- {
239
- "function": {
240
- "name": "get_weather",
241
- "arguments": json.dumps({"city": "San Francisco"}),
242
- }
243
- }
244
- ],
245
- },
246
- {"role": "tool", "content": "It's 80 degrees and sunny in San Francisco."},
247
- {"role": "assistant", "content": "The weather in SF is 80˚ and sunny."},
248
- ]
249
- result = trajectory_strict_match(
250
- outputs=outputs, reference_outputs=reference_outputs
251
- )
252
-
253
- print(result)
254
- ```
255
-
256
- ```
257
- {
258
- 'key': 'trajectory_accuracy',
259
- 'score': True,
260
- 'comment': None,
261
- }
262
- ```
263
- </details>
264
-
265
- <details open>
266
- <summary>TypeScript</summary>
267
-
268
125
  ```ts
269
126
  import { trajectoryStrictMatch } from "agentevals";
270
127
 
@@ -300,86 +157,11 @@ console.log(result);
300
157
  'score': true,
301
158
  }
302
159
  ```
303
- </details>
304
160
 
305
161
  #### Unordered match
306
162
 
307
163
  The `trajectory_unordered_match` evaluator, compares two trajectories and ensures that they contain the same number of tool calls in any order. This is useful if you want to allow flexibility in how an agent obtains the proper information, but still do care that all information was retrieved.
308
164
 
309
- <details>
310
- <summary>Python</summary>
311
-
312
- ```python
313
- import json
314
- from agentevals.trajectory.unordered import trajectory_unordered_match
315
-
316
- inputs = {}
317
- outputs = [
318
- {"role": "user", "content": "What is the weather in SF and is there anything fun happening?"},
319
- {
320
- "role": "assistant",
321
- "tool_calls": [{
322
- "function": {
323
- "name": "get_weather",
324
- "arguments": json.dumps({"city": "SF"}),
325
- }
326
- }],
327
- },
328
- {"role": "tool", "content": "It's 80 degrees and sunny in SF."},
329
- {
330
- "role": "assistant",
331
- "tool_calls": [{
332
- "function": {
333
- "name": "get_fun_activities",
334
- "arguments": json.dumps({"city": "SF"}),
335
- }
336
- }],
337
- },
338
- {"role": "tool", "content": "Nothing fun is happening, you should stay indoors and read!"},
339
- {"role": "assistant", "content": "The weather in SF is 80 degrees and sunny, but there is nothing fun happening."},
340
- ]
341
- reference_outputs = [
342
- {"role": "user", "content": "What is the weather in SF and is there anything fun happening?"},
343
- {
344
- "role": "assistant",
345
- "tool_calls": [
346
- {
347
- "function": {
348
- "name": "get_fun_activities",
349
- "arguments": json.dumps({"city": "San Francisco"}),
350
- }
351
- },
352
- {
353
- "function": {
354
- "name": "get_weather",
355
- "arguments": json.dumps({"city": "San Francisco"}),
356
- }
357
- },
358
- ],
359
- },
360
- {"role": "tool", "content": "Nothing fun is happening, you should stay indoors and read!"},
361
- {"role": "tool", "content": "It's 80 degrees and sunny in SF."},
362
- {"role": "assistant", "content": "In SF, it's 80˚ and sunny, but there is nothing fun happening."},
363
- ]
364
- result = trajectory_unordered_match(
365
- outputs=outputs, reference_outputs=reference_outputs
366
- )
367
-
368
- print(result)
369
- ```
370
-
371
- ```
372
- {
373
- 'key': 'trajectory_unordered_match',
374
- 'score': True,
375
- 'comment': None,
376
- }
377
- ```
378
- </details>
379
-
380
- <details open>
381
- <summary>TypeScript</summary>
382
-
383
165
  ```ts
384
166
  import { trajectoryUnorderedMatch } from "agentevals";
385
167
 
@@ -446,77 +228,11 @@ console.log(result)
446
228
  'score': true,
447
229
  }
448
230
  ```
449
- </details>
450
231
 
451
232
  #### Subset and superset match
452
233
 
453
234
  There are other evaluators for checking partial trajectory matches (ensuring that a trajectory contains a subset and superset of tool calls compared to a reference trajectory).
454
235
 
455
- <details>
456
- <summary>Python</summary>
457
-
458
- ```python
459
- import json
460
- from openevals.trajectory.subset import trajectory_subset
461
- # from openevals.trajectory.superset import trajectory_superset
462
-
463
- outputs = [
464
- {"role": "user", "content": "What is the weather in SF and London?"},
465
- {
466
- "role": "assistant",
467
- "tool_calls": [{
468
- "function": {
469
- "name": "get_weather",
470
- "arguments": json.dumps({"city": "SF and London"}),
471
- }
472
- }],
473
- },
474
- {"role": "tool", "content": "It's 80 degrees and sunny in SF, and 90 degrees and rainy in London."},
475
- {"role": "assistant", "content": "The weather in SF is 80 degrees and sunny. In London, it's 90 degrees and rainy."},
476
- ]
477
- reference_outputs = [
478
- {"role": "user", "content": "What is the weather in SF and London?"},
479
- {
480
- "role": "assistant",
481
- "tool_calls": [
482
- {
483
- "function": {
484
- "name": "get_weather",
485
- "arguments": json.dumps({"city": "San Francisco"}),
486
- }
487
- },
488
- {
489
- "function": {
490
- "name": "get_weather",
491
- "arguments": json.dumps({"city": "London"}),
492
- }
493
- },
494
- ],
495
- },
496
- {"role": "tool", "content": "It's 80 degrees and sunny in San Francisco."},
497
- {"role": "tool", "content": "It's 90 degrees and rainy in London."},
498
- {"role": "assistant", "content": "The weather in SF is 80˚ and sunny. In London, it's 90˚ and rainy."},
499
- ]
500
-
501
- result = trajectory_subset(
502
- outputs=outputs, reference_outputs=reference_outputs
503
- )
504
-
505
- print(result)
506
- ```
507
-
508
- ```
509
- {
510
- 'key': 'trajectory_subset',
511
- 'score': True,
512
- 'comment': None,
513
- }
514
- ```
515
- </details>
516
-
517
- <details open>
518
- <summary>TypeScript</summary>
519
-
520
236
  ```ts
521
237
  import { trajectorySubset } from "agentevals";
522
238
  // import { trajectorySuperset } from "agentevals";
@@ -574,7 +290,6 @@ console.log(result)
574
290
  'score': true,
575
291
  }
576
292
  ```
577
- </details>
578
293
 
579
294
  #### Trajectory LLM-as-judge
580
295
 
@@ -582,69 +297,6 @@ The LLM-as-judge trajectory evaluator that uses an LLM to evaluate the trajector
582
297
  and supports
583
298
  This allows for more flexibility in the trajectory comparison:
584
299
 
585
- <details>
586
- <summary>Python</summary>
587
-
588
- ```python
589
- import json
590
- from openevals.trajectory.llm import create_trajectory_llm_as_judge, TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE
591
-
592
- evaluator = create_trajectory_llm_as_judge(
593
- prompt=TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE,
594
- model="openai:o3-mini"
595
- )
596
- outputs = [
597
- {"role": "user", "content": "What is the weather in SF?"},
598
- {
599
- "role": "assistant",
600
- "tool_calls": [
601
- {
602
- "function": {
603
- "name": "get_weather",
604
- "arguments": json.dumps({"city": "SF"}),
605
- }
606
- }
607
- ],
608
- },
609
- {"role": "tool", "content": "It's 80 degrees and sunny in SF."},
610
- {"role": "assistant", "content": "The weather in SF is 80 degrees and sunny."},
611
- ]
612
- reference_outputs = [
613
- {"role": "user", "content": "What is the weather in SF?"},
614
- {
615
- "role": "assistant",
616
- "tool_calls": [
617
- {
618
- "function": {
619
- "name": "get_weather",
620
- "arguments": json.dumps({"city": "San Francisco"}),
621
- }
622
- }
623
- ],
624
- },
625
- {"role": "tool", "content": "It's 80 degrees and sunny in San Francisco."},
626
- {"role": "assistant", "content": "The weather in SF is 80˚ and sunny."},
627
- ]
628
- eval_result = evaluator(
629
- outputs=outputs,
630
- reference_outputs=reference_outputs,
631
- )
632
-
633
- print(eval_result)
634
- ```
635
-
636
- ```
637
- {
638
- 'key': 'trajectory_accuracy',
639
- 'score': True,
640
- 'comment': 'The provided agent trajectory is consistent with the reference. Both trajectories start with the same user query and then correctly invoke a weather lookup through a tool call. Although the reference uses "San Francisco" while the provided trajectory uses "SF" and there is a minor formatting difference (degrees vs. ˚), these differences do not affect the correctness or essential steps of the process. Thus, the score should be: true.'
641
- }
642
- ```
643
- </details>
644
-
645
- <details open>
646
- <summary>TypeScript</summary>
647
-
648
300
  ```ts
649
301
  import {
650
302
  createTrajectoryLLMAsJudge,
@@ -704,7 +356,6 @@ console.log(result)
704
356
  'comment': 'The provided agent trajectory is consistent with the reference. Both trajectories start with the same user query and then correctly invoke a weather lookup through a tool call. Although the reference uses "San Francisco" while the provided trajectory uses "SF" and there is a minor formatting difference (degrees vs. ˚), these differences do not affect the correctness or essential steps of the process. Thus, the score should be: true.'
705
357
  }
706
358
  ```
707
- </details>
708
359
 
709
360
  `create_trajectory_llm_as_judge` takes the same parameters as [`create_llm_as_judge`](https://github.com/langchain-ai/openevals?tab=readme-ov-file#llm-as-judge) in `openevals`, so you can customize the prompt and scoring output as needed.
710
361
 
@@ -715,25 +366,6 @@ In addition to `prompt` and `model`, the following parameters are also available
715
366
  - `system`: a string that sets a system prompt for the judge model by adding a system message before other parts of the prompt.
716
367
  - `few_shot_examples`: a list of example dicts that are appended to the end of the prompt. This is useful for providing the judge model with examples of good and bad outputs. The required structure looks like this:
717
368
 
718
- <details>
719
- <summary>Python</summary>
720
-
721
- ```python
722
- few_shot_examples = [
723
- {
724
- "inputs": "What color is the sky?",
725
- "outputs": "The sky is red.",
726
- "reasoning": "The sky is red because it is early evening.",
727
- "score": 1,
728
- }
729
- ]
730
- ```
731
-
732
- </details>
733
-
734
- <details open>
735
- <summary>TypeScript</summary>
736
-
737
369
  ```ts
738
370
  const fewShotExamples = [
739
371
  {
@@ -744,7 +376,6 @@ const fewShotExamples = [
744
376
  }
745
377
  ];
746
378
  ```
747
- </details>
748
379
 
749
380
  See the [`openevals`](https://github.com/langchain-ai/openevals?tab=readme-ov-file#llm-as-judge) repo for a fully up to date list of parameters.
750
381
 
@@ -754,70 +385,78 @@ For frameworks like [LangGraph](https://github.com/langchain-ai/langgraph) that
754
385
 
755
386
  The below examples will use LangGraph with the built-in formatting utility, but graph evaluators accept input in the following general format:
756
387
 
757
- ```python
758
- class GraphTrajectory(TypedDict):
759
- # Only set when specifying reference_outputs
760
- inputs: Optional[list[dict]]
761
- results: list[dict]
762
- steps: list[list[str]]
763
-
764
- def evaluator(
765
- *,
766
- inputs: Optional[Union[dict, list]] = None,
767
- outputs: GraphTrajectory,
768
- reference_outputs: Optional[GraphTrajectory] = None,
769
- ) -> ...
388
+ ```ts
389
+ export type GraphTrajectory = {
390
+ inputs?: (Record<string, unknown> | null)[];
391
+ results: Record<string, unknown>[];
392
+ steps: string[][];
393
+ };
394
+
395
+ const evaluator: ({ inputs, outputs, referenceOutputs, ...extra }: {
396
+ inputs: (string | Record<string, unknown> | null)[] | {
397
+ inputs: (string | Record<string, unknown> | null)[];
398
+ };
399
+ outputs: GraphTrajectory;
400
+ referenceOutputs?: GraphTrajectory;
401
+ [key: string]: unknown;
402
+ }) => ...
770
403
  ```
771
404
 
772
405
  Where `inputs` is a list of inputs (or a dict with a key named `"inputs"`) to the graph whose items each represent the start of a new invocation in a thread, `results` representing the final output from each turn in the thread, and `steps` representing the internal steps taken for each turn.
773
406
 
774
407
  #### Graph trajectory LLM-as-judge
775
408
 
776
- This evaluator is similar to the `trajectory_llm_as_judge` evaluator, but it works with graph trajectories instead of message trajectories. Below, we set up a LangGraph agent, extract a trajectory from it using the built-in utils, and pass it to the evaluator:
777
-
778
- ```python
779
- from agentevals.graph_trajectory.utils import (
780
- extract_langgraph_trajectory_from_thread,
781
- )
782
- from agentevals.graph_trajectory.llm import create_graph_trajectory_llm_as_judge
783
-
784
- from langgraph.prebuilt import create_react_agent
785
- from langgraph.checkpoint.memory import MemorySaver
786
- from langgraph.types import Command, interrupt
787
-
788
- from langchain_core.tools import tool
789
-
790
- @tool
791
- def search(query: str):
792
- """Call to surf the web."""
793
- user_answer = interrupt("Tell me the answer to the question.")
794
- return user_answer
409
+ This evaluator is similar to the `trajectory_llm_as_judge` evaluator, but it works with graph trajectories instead of message trajectories. Below, we set up a LangGraph agent, extract a trajectory from it using the built-in utils, and pass it to the evaluator. First, let's setup our graph, call it, and then extract the trajectory:
795
410
 
796
- tools = [search]
411
+ ```ts
412
+ import { tool } from "@langchain/core/tools";
413
+ import { ChatOpenAI } from "@langchain/openai";
414
+ import { createReactAgent } from "@langchain/langgraph/prebuilt";
415
+ import { MemorySaver, interrupt } from "@langchain/langgraph";
416
+ import { z } from "zod";
417
+ import { extractLangGraphTrajectoryFromThread } from "agentevals";
418
+
419
+ const search = tool((_): string => {
420
+ const userAnswer = interrupt("Tell me the answer to the question.")
421
+ return userAnswer;
422
+ }, {
423
+ name: "search",
424
+ description: "Call to surf the web.",
425
+ schema: z.object({
426
+ query: z.string()
427
+ })
428
+ })
429
+
430
+ const tools = [search];
431
+
432
+ // Create a checkpointer
433
+ const checkpointer = new MemorySaver();
434
+
435
+ // Create the React agent
436
+ const graph = createReactAgent({
437
+ llm: new ChatOpenAI({ model: "gpt-4o-mini" }),
438
+ tools,
439
+ checkpointer,
440
+ });
797
441
 
798
- checkpointer = MemorySaver()
799
- graph = create_react_agent(
800
- model="gpt-4o-mini",
801
- checkpointer=checkpointer,
802
- tools=[search],
803
- )
442
+ // Invoke the graph with initial message
443
+ await graph.invoke(
444
+ { messages: [{ role: "user", content: "what's the weather in sf?" }] },
445
+ { configurable: { thread_id: "1" } }
446
+ );
804
447
 
805
- graph.invoke(
806
- {"messages": [{"role": "user", "content": "what's the weather in sf?"}]},
807
- config={"configurable": {"thread_id": "1"}},
808
- )
809
- # Resume the agent with a new command, simulating a human-in-the-loop workflow
810
- graph.invoke(
811
- Command(resume="It is rainy and 70 degrees!"),
812
- config={"configurable": {"thread_id": "1"}},
813
- )
448
+ // Resume the agent with a new command (simulating human-in-the-loop)
449
+ await graph.invoke(
450
+ { messages: [{ role: "user", content: "It is rainy and 70 degrees!" }] },
451
+ { configurable: { thread_id: "1" } }
452
+ );
814
453
 
815
- # Extract the trajectory from the first two thread runs
816
- extracted_trajectory = extract_langgraph_trajectory_from_thread(
817
- graph, {"configurable": {"thread_id": "1"}}
818
- )
454
+ const extractedTrajectory = await extractLangGraphTrajectoryFromThread(
455
+ graph,
456
+ { configurable: { thread_id: "1" } },
457
+ );
819
458
 
820
- print(extracted_trajectory)
459
+ console.log(extractedTrajectory);
821
460
  ```
822
461
 
823
462
  ```
@@ -850,17 +489,21 @@ print(extracted_trajectory)
850
489
  }
851
490
  ```
852
491
 
853
- ```python
854
- graph_trajectory_evaluator = create_graph_trajectory_llm_as_judge(
855
- model="openai:o3-mini",
856
- )
492
+ Now, we can pass the extracted trajectory to the evaluator:
493
+
494
+ ```ts
495
+ import { createGraphTrajectoryLLMAsJudge } from "agentevals";
496
+
497
+ const graphTrajectoryEvaluator = createGraphTrajectoryLLMAsJudge({
498
+ model: "openai:o3-mini",
499
+ })
857
500
 
858
- res = graph_trajectory_evaluator(
859
- inputs=extracted_trajectory["inputs"],
860
- outputs=extracted_trajectory["outputs"],
501
+ const res = await graphTrajectoryEvaluator({
502
+ inputs: extractedTrajectory.inputs,
503
+ outputs: extractedTrajectory.outputs }
861
504
  )
862
505
 
863
- print(res)
506
+ console.log(res);
864
507
  ```
865
508
 
866
509
  ```
@@ -871,10 +514,10 @@ print(res)
871
514
  }
872
515
  ```
873
516
 
874
- Note that though this evaluator takes the typical `inputs`, `outputs`, and `reference_outputs` parameters, it internally combines `inputs` and `outputs` to form a `thread`. Therefore, if you want to customize the prompt, your prompt should also contain a `thread` input variable:
517
+ Note that though this evaluator takes the typical `inputs`, `outputs`, and `referenceOutputs` parameters, it internally combines `inputs` and `outputs` to form a `thread`. Therefore, if you want to customize the prompt, your prompt should also contain a `thread` input variable:
875
518
 
876
- ```python
877
- CUSTOM_PROMPT = """You are an expert data labeler.
519
+ ```ts
520
+ const CUSTOM_PROMPT = `You are an expert data labeler.
878
521
  Your task is to grade the accuracy of an AI agent's internal steps in resolving a user queries.
879
522
 
880
523
  <Rubric>
@@ -896,82 +539,85 @@ Your task is to grade the accuracy of an AI agent's internal steps in resolving
896
539
  </thread>
897
540
 
898
541
  {reference_outputs}
899
- """
542
+ `
900
543
 
901
- evaluator = create_graph_trajectory_llm_as_judge(
902
- prompt=CUSTOM_PROMPT,
903
- model="openai:o3-mini",
904
- )
905
- res = await evaluator(
906
- inputs=extracted_trajectory["inputs"],
907
- outputs=extracted_trajectory["outputs"],
908
-
544
+ const graphTrajectoryEvaluator = createGraphTrajectoryLLMAsJudge({
545
+ prompt: CUSTOM_PROMPT,
546
+ model: "openai:o3-mini",
547
+ })
548
+ const res = await graphTrajectoryEvaluator({
549
+ inputs: extractedTrajectory.inputs,
550
+ outputs: extractedTrajectory.outputs }
909
551
  )
910
552
  ```
911
553
 
912
- In order to format them properly into the prompt, `reference_outputs` should be passed in as a `GraphTrajectory` object like `outputs`.
554
+ In order to format them properly into the prompt, `referenceOutputs` should be passed in as a `GraphTrajectory` object like `outputs`.
913
555
 
914
556
  Also note that like other LLM-as-judge evaluators, you can pass extra kwargs into the evaluator to format them into the prompt.
915
557
 
916
558
  #### Graph trajectory strict match
917
559
 
918
- The `graph_trajectory_strict_match` evaluator is a simple evaluator that checks if the steps in the provided graph trajectory match the reference trajectory exactly.
560
+ The `graphTrajectoryStrictMatch` evaluator is a simple evaluator that checks if the steps in the provided graph trajectory match the reference trajectory exactly.
919
561
 
920
- ```python
921
- from agentevals.graph_trajectory.utils import (
922
- extract_langgraph_trajectory_from_thread,
923
- )
924
- from agentevals.graph_trajectory.strict import graph_trajectory_strict_match
925
-
926
-
927
- from langgraph.prebuilt import create_react_agent
928
- from langgraph.checkpoint.memory import MemorySaver
929
- from langgraph.types import Command, interrupt
930
-
931
- from langchain_core.tools import tool
932
-
933
- @tool
934
- def search(query: str):
935
- """Call to surf the web."""
936
- user_answer = interrupt("Tell me the answer to the question.")
937
- return user_answer
938
-
939
- tools = [search]
562
+ ```ts
563
+ import { tool } from "@langchain/core/tools";
564
+ import { ChatOpenAI } from "@langchain/openai";
565
+ import { createReactAgent } from "@langchain/langgraph/prebuilt";
566
+ import { MemorySaver, interrupt } from "@langchain/langgraph";
567
+ import { z } from "zod";
568
+ import { extractLangGraphTrajectoryFromThread, graphTrajectoryStrictMatch } from "agentevals";
569
+
570
+ const search = tool((_): string => {
571
+ const userAnswer = interrupt("Tell me the answer to the question.")
572
+ return userAnswer;
573
+ }, {
574
+ name: "search",
575
+ description: "Call to surf the web.",
576
+ schema: z.object({
577
+ query: z.string()
578
+ })
579
+ })
580
+
581
+ const tools = [search];
582
+
583
+ // Create a checkpointer
584
+ const checkpointer = new MemorySaver();
585
+
586
+ // Create the React agent
587
+ const graph = createReactAgent({
588
+ llm: new ChatOpenAI({ model: "gpt-4o-mini" }),
589
+ tools,
590
+ checkpointer,
591
+ });
940
592
 
941
- checkpointer = MemorySaver()
942
- graph = create_react_agent(
943
- model="gpt-4o-mini",
944
- checkpointer=checkpointer,
945
- tools=[search],
946
- )
593
+ // Invoke the graph with initial message
594
+ await graph.invoke(
595
+ { messages: [{ role: "user", content: "what's the weather in sf?" }] },
596
+ { configurable: { thread_id: "1" } }
597
+ );
947
598
 
948
- graph.invoke(
949
- {"messages": [{"role": "user", "content": "what's the weather in sf?"}]},
950
- config={"configurable": {"thread_id": "1"}},
951
- )
952
- # Resume the agent with a new command, simulating a human-in-the-loop workflow
953
- graph.invoke(
954
- Command(resume="It is rainy and 70 degrees!"),
955
- config={"configurable": {"thread_id": "1"}},
956
- )
599
+ // Resume the agent with a new command (simulating human-in-the-loop)
600
+ await graph.invoke(
601
+ { messages: [{ role: "user", content: "It is rainy and 70 degrees!" }] },
602
+ { configurable: { thread_id: "1" } }
603
+ );
957
604
 
958
- # Extract the trajectory from the first two thread runs
959
- extracted_trajectory = extract_langgraph_trajectory_from_thread(
960
- graph, {"configurable": {"thread_id": "1"}}
961
- )
605
+ const extractedTrajectory = await extractLangGraphTrajectoryFromThread(
606
+ graph,
607
+ { configurable: { thread_id: "1" } },
608
+ );
962
609
 
963
- reference_trajectory = {
964
- # not used for strict match
965
- "results": [],
966
- "steps": [["__start__", "agent", "tools", "__interrupt__"], ["agent"]],
610
+ const referenceTrajectory = {
611
+ results: [],
612
+ steps: [["__start__", "agent", "tools", "__interrupt__"], ["agent"]],
967
613
  }
968
614
 
969
- res = graph_trajectory_strict_match(
970
- outputs=extracted_trajectory["outputs"],
971
- reference_outputs=reference_trajectory,
972
- )
615
+ const result = await graphTrajectoryStrictMatch({
616
+ outputs: trajectory.outputs,
617
+ referenceOutputs: referenceOutputs!,
618
+ });
973
619
 
974
- print(res)
620
+ console.log(result);
975
621
  ```
976
622
 
977
623
  ```
@@ -980,37 +626,6 @@ print(res)
980
626
  'score': True,
981
627
  }
982
628
  ```
983
-
984
- ## Python Async Support
985
-
986
- All `agentevals` evaluators support Python [asyncio](https://docs.python.org/3/library/asyncio.html). As a convention, evaluators that use a factory function will have `async` put immediately after `create_` in the function name (for example, `create_async_trajectory_llm_as_judge`), and evaluators used directly will end in `async` (e.g. `trajectory_strict_match_async`).
987
-
988
- Here's an example of how to use the `create_async_llm_as_judge` evaluator asynchronously:
989
-
990
- ```python
991
- from agentevals.trajectory.llm import create_async_trajectory_llm_as_judge
992
-
993
- evaluator = create_async_llm_as_judge(
994
- prompt="What is the weather in {inputs}?",
995
- )
996
-
997
- result = await evaluator(inputs="San Francisco")
998
- ```
999
-
1000
- If you are using the OpenAI client directly, remember to pass in `AsyncOpenAI` as the `judge` parameter:
1001
-
1002
- ```python
1003
- from openai import AsyncOpenAI
1004
-
1005
- evaluator = create_async_llm_as_judge(
1006
- prompt="What is the weather in {inputs}?",
1007
- judge=AsyncOpenAI(),
1008
- model="o3-mini",
1009
- )
1010
-
1011
- result = await evaluator(inputs="San Francisco")
1012
- ```
1013
-
1014
629
  ## LangSmith Integration
1015
630
 
1016
631
  For tracking experiments over time, you can log evaluator results to [LangSmith](https://smith.langchain.com/), a platform for building production-grade LLM applications that includes tracing, evaluation, and experimentation tools.
@@ -1019,7 +634,7 @@ LangSmith currently offers two ways to run evals. We'll give a quick example of
1019
634
 
1020
635
  ### Pytest or Vitest/Jest
1021
636
 
1022
- First, follow [these instructions](https://docs.smith.langchain.com/evaluation/how_to_guides/pytest) to set up LangSmith's pytest runner,
637
+ First, follow [these instructions](https://docs.smith.langchain.com/evaluation/how_to_guides/vitest_jest) to set up LangSmith's Vitest/Jest runner,
1023
638
  setting appropriate environment variables:
1024
639
 
1025
640
  ```bash
@@ -1027,80 +642,6 @@ export LANGSMITH_API_KEY="your_langsmith_api_key"
1027
642
  export LANGSMITH_TRACING="true"
1028
643
  ```
1029
644
 
1030
- <details>
1031
- <summary>Python</summary>
1032
-
1033
- Then, set up a file named `test_trajectory.py` with the following contents:
1034
-
1035
- ```python
1036
- import pytest
1037
- import json
1038
-
1039
- from langsmith import testing as t
1040
-
1041
- from agentevals.trajectory.llm import create_trajectory_llm_as_judge
1042
-
1043
- trajectory_evaluator = create_trajectory_llm_as_judge(
1044
- model="openai:o3-mini",
1045
- )
1046
-
1047
- @pytest.mark.langsmith
1048
- def test_trajectory_accuracy():
1049
- outputs = [
1050
- {"role": "user", "content": "What is the weather in SF?"},
1051
- {
1052
- "role": "assistant",
1053
- "tool_calls": [
1054
- {
1055
- "function": {
1056
- "name": "get_weather",
1057
- "arguments": json.dumps({"city": "SF"}),
1058
- }
1059
- }
1060
- ],
1061
- },
1062
- {"role": "tool", "content": "It's 80 degrees and sunny in SF."},
1063
- {"role": "assistant", "content": "The weather in SF is 80 degrees and sunny."},
1064
- ]
1065
- reference_outputs = [
1066
- {"role": "user", "content": "What is the weather in SF?"},
1067
- {
1068
- "role": "assistant",
1069
- "tool_calls": [
1070
- {
1071
- "function": {
1072
- "name": "get_weather",
1073
- "arguments": json.dumps({"city": "San Francisco"}),
1074
- }
1075
- }
1076
- ],
1077
- },
1078
- {"role": "tool", "content": "It's 80 degrees and sunny in San Francisco."},
1079
- {"role": "assistant", "content": "The weather in SF is 80˚ and sunny."},
1080
- ]
1081
-
1082
- t.log_inputs({})
1083
- t.log_outputs({"messages": outputs})
1084
- t.log_reference_outputs({"messages": reference_outputs})
1085
-
1086
- trajectory_evaluator(
1087
- outputs=outputs,
1088
- reference_outputs=reference_outputs
1089
- )
1090
- ```
1091
-
1092
- Note that when creating the evaluator, we've added a `feedback_key` parameter. This will be used to name the feedback in LangSmith.
1093
-
1094
- Now, run the eval with pytest:
1095
-
1096
- ```bash
1097
- pytest test_trajectory.py --langsmith-output
1098
- ```
1099
-
1100
- </details>
1101
-
1102
- <details open>
1103
- <summary>TypeScript</summary>
1104
645
 
1105
646
  Then, set up a file named `test_trajectory.eval.ts` with the following contents:
1106
647
 
@@ -1176,7 +717,6 @@ Now, run the eval with your runner of choice:
1176
717
  vitest run test_trajectory.eval.ts
1177
718
  ```
1178
719
 
1179
- </details>
1180
720
 
1181
721
  Feedback from the prebuilt evaluator will be automatically logged in LangSmith as a table of results like this in your terminal:
1182
722
 
@@ -1190,51 +730,38 @@ And you should also see the results in the experiment view in LangSmith:
1190
730
 
1191
731
  Alternatively, you can [create a dataset in LangSmith](https://docs.smith.langchain.com/evaluation/concepts#dataset-curation) and use your created evaluators with LangSmith's [`evaluate`](https://docs.smith.langchain.com/evaluation#8-run-and-view-results) function:
1192
732
 
1193
- <details>
1194
- <summary>Python</summary>
1195
-
1196
- ```python
1197
- from langsmith import Client
1198
- from agentevals.trajectory.llm import create_trajectory_llm_as_judge
1199
-
1200
- client = Client()
1201
-
1202
- trajectory_evaluator = create_trajectory_llm_as_judge(
1203
- model="openai:o3-mini",
1204
- )
1205
-
1206
- experiment_results = client.evaluate(
1207
- # This is a dummy target function, replace with your actual LLM-based system
1208
- lambda inputs: "What color is the sky?",
1209
- data="Sample dataset",
1210
- evaluators=[
1211
- trajectory_evaluator
1212
- ]
1213
- )
1214
- ```
1215
-
1216
- </details>
1217
-
1218
- <details open>
1219
- <summary>TypeScript</summary>
1220
-
1221
733
  ```ts
1222
734
  import { evaluate } from "langsmith/evaluation";
1223
- import { createTrajectoryLLMAsJudge, TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE } from "agentevals";
735
+ import { createTrajectoryLLMAsJudge, TRAJECTORY_ACCURACY_PROMPT } from "agentevals";
1224
736
 
1225
737
  const trajectoryEvaluator = createTrajectoryLLMAsJudge({
1226
738
  model: "openai:o3-mini",
739
+ prompt: TRAJECTORY_ACCURACY_PROMPT
1227
740
  });
1228
741
 
1229
742
  await evaluate(
1230
- (inputs) => "What color is the sky?",
743
+ (inputs) => [
744
+ {role: "user", content: "What is the weather in SF?"},
745
+ {
746
+ role: "assistant",
747
+ tool_calls: [
748
+ {
749
+ function: {
750
+ name: "get_weather",
751
+ arguments: json.dumps({"city": "SF"}),
752
+ }
753
+ }
754
+ ],
755
+ },
756
+ {role: "tool", content: "It's 80 degrees and sunny in SF."},
757
+ {role: "assistant", content: "The weather in SF is 80 degrees and sunny."},
758
+ ],
1231
759
  {
1232
760
  data: datasetName,
1233
761
  evaluators: [trajectoryEvaluator],
1234
762
  }
1235
763
  );
1236
764
  ```
1237
- </details>
1238
765
 
1239
766
  ## Thank you!
1240
767
 
package/dist/utils.cjs CHANGED
@@ -3,7 +3,7 @@ Object.defineProperty(exports, "__esModule", { value: true });
3
3
  exports._runEvaluator = exports.processScore = exports._normalizeToOpenAIMessagesList = exports._convertToOpenAIMessage = void 0;
4
4
  const messages_1 = require("@langchain/core/messages");
5
5
  const openai_1 = require("@langchain/openai");
6
- const jestlike_1 = require("langsmith/utils/jestlike");
6
+ const utils_1 = require("openevals/utils");
7
7
  const _convertToOpenAIMessage = (message) => {
8
8
  if ((0, messages_1.isBaseMessage)(message)) {
9
9
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
@@ -51,39 +51,6 @@ const processScore = (_, value) => {
51
51
  };
52
52
  exports.processScore = processScore;
53
53
  const _runEvaluator = async (runName, scorer, feedbackKey, extra) => {
54
- const runScorer = async (params) => {
55
- let score = await scorer(params);
56
- let reasoning;
57
- const results = [];
58
- if (!Array.isArray(score) && typeof score === "object") {
59
- for (const [key, value] of Object.entries(score)) {
60
- const [keyScore, reasoning] = (0, exports.processScore)(key, value);
61
- results.push({ key, score: keyScore, comment: reasoning });
62
- }
63
- }
64
- else {
65
- if (Array.isArray(score)) {
66
- reasoning = score[1];
67
- score = score[0];
68
- }
69
- results.push({ key: feedbackKey, score, comment: reasoning });
70
- }
71
- if (results.length === 1) {
72
- return results[0];
73
- }
74
- else {
75
- return results;
76
- }
77
- };
78
- if ((0, jestlike_1.isInTestContext)()) {
79
- const res = await (0, jestlike_1.wrapEvaluator)(runScorer)(extra ?? {}, {
80
- name: runName,
81
- });
82
- return res;
83
- }
84
- else {
85
- const res = await runScorer(extra ?? {});
86
- return res;
87
- }
54
+ return (0, utils_1._runEvaluator)(runName, scorer, feedbackKey, extra, "agentevals");
88
55
  };
89
56
  exports._runEvaluator = _runEvaluator;
package/dist/utils.d.ts CHANGED
@@ -1,5 +1,5 @@
1
1
  import { BaseMessage } from "@langchain/core/messages";
2
- import { SimpleEvaluationResult } from "langsmith/utils/jestlike";
2
+ import { EvaluationResultType } from "openevals/utils";
3
3
  import { ChatCompletionMessage, MultiResultScorerReturnType, SingleResultScorerReturnType } from "./types.js";
4
4
  export declare const _convertToOpenAIMessage: (message: BaseMessage | ChatCompletionMessage) => ChatCompletionMessage;
5
5
  export declare const _normalizeToOpenAIMessagesList: (messages?: (BaseMessage | ChatCompletionMessage)[] | {
@@ -9,5 +9,4 @@ export declare const processScore: (_: string, value: boolean | number | {
9
9
  score: boolean | number;
10
10
  reasoning?: string;
11
11
  }) => readonly [number | boolean, string | undefined] | readonly [number | boolean];
12
- export type EvaluationResultType<O> = O extends MultiResultScorerReturnType ? SimpleEvaluationResult[] : SimpleEvaluationResult;
13
- export declare const _runEvaluator: <T extends Record<string, unknown>, O extends MultiResultScorerReturnType | SingleResultScorerReturnType | Promise<MultiResultScorerReturnType | SingleResultScorerReturnType>>(runName: string, scorer: (params: T) => O, feedbackKey: string, extra?: T | undefined) => Promise<EvaluationResultType<O>>;
12
+ export declare const _runEvaluator: <T extends Record<string, unknown>, O extends SingleResultScorerReturnType | MultiResultScorerReturnType | Promise<SingleResultScorerReturnType | MultiResultScorerReturnType>>(runName: string, scorer: (params: T) => O, feedbackKey: string, extra?: T | undefined) => Promise<EvaluationResultType<O>>;
package/dist/utils.js CHANGED
@@ -1,6 +1,6 @@
1
1
  import { isBaseMessage } from "@langchain/core/messages";
2
2
  import { _convertMessagesToOpenAIParams } from "@langchain/openai";
3
- import { wrapEvaluator, isInTestContext, } from "langsmith/utils/jestlike";
3
+ import { _runEvaluator as baseRunEvaluator, } from "openevals/utils";
4
4
  export const _convertToOpenAIMessage = (message) => {
5
5
  if (isBaseMessage(message)) {
6
6
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
@@ -45,38 +45,5 @@ export const processScore = (_, value) => {
45
45
  return [value];
46
46
  };
47
47
  export const _runEvaluator = async (runName, scorer, feedbackKey, extra) => {
48
- const runScorer = async (params) => {
49
- let score = await scorer(params);
50
- let reasoning;
51
- const results = [];
52
- if (!Array.isArray(score) && typeof score === "object") {
53
- for (const [key, value] of Object.entries(score)) {
54
- const [keyScore, reasoning] = processScore(key, value);
55
- results.push({ key, score: keyScore, comment: reasoning });
56
- }
57
- }
58
- else {
59
- if (Array.isArray(score)) {
60
- reasoning = score[1];
61
- score = score[0];
62
- }
63
- results.push({ key: feedbackKey, score, comment: reasoning });
64
- }
65
- if (results.length === 1) {
66
- return results[0];
67
- }
68
- else {
69
- return results;
70
- }
71
- };
72
- if (isInTestContext()) {
73
- const res = await wrapEvaluator(runScorer)(extra ?? {}, {
74
- name: runName,
75
- });
76
- return res;
77
- }
78
- else {
79
- const res = await runScorer(extra ?? {});
80
- return res;
81
- }
48
+ return baseRunEvaluator(runName, scorer, feedbackKey, extra, "agentevals");
82
49
  };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "agentevals",
3
- "version": "0.0.1-rc.3",
3
+ "version": "0.0.2",
4
4
  "packageManager": "yarn@3.5.1",
5
5
  "type": "module",
6
6
  "scripts": {
@@ -16,8 +16,8 @@
16
16
  "dependencies": {
17
17
  "@langchain/openai": "^0.4.4",
18
18
  "langchain": "^0.3.18",
19
- "langsmith": "^0.3.10",
20
- "openevals": "^0.0.1-rc.8"
19
+ "langsmith": "^0.3.11",
20
+ "openevals": "^0.0.3"
21
21
  },
22
22
  "peerDependencies": {
23
23
  "@langchain/core": "^0.3.40",