agentevals 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +141 -36
- package/dist/index.cjs +3 -1
- package/dist/index.d.ts +1 -0
- package/dist/index.js +1 -0
- package/dist/trajectory/match.cjs +84 -0
- package/dist/trajectory/match.d.ts +61 -0
- package/dist/trajectory/match.js +80 -0
- package/dist/trajectory/strict.cjs +42 -42
- package/dist/trajectory/strict.d.ts +23 -2
- package/dist/trajectory/strict.js +40 -41
- package/dist/trajectory/subset.cjs +13 -9
- package/dist/trajectory/subset.d.ts +8 -1
- package/dist/trajectory/subset.js +11 -8
- package/dist/trajectory/superset.cjs +13 -9
- package/dist/trajectory/superset.d.ts +8 -1
- package/dist/trajectory/superset.js +11 -8
- package/dist/trajectory/unordered.cjs +14 -10
- package/dist/trajectory/unordered.d.ts +8 -1
- package/dist/trajectory/unordered.js +12 -9
- package/dist/trajectory/utils.cjs +107 -18
- package/dist/trajectory/utils.d.ts +3 -2
- package/dist/trajectory/utils.js +105 -17
- package/dist/types.d.ts +3 -0
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -86,7 +86,6 @@ You can see that despite the small difference in the final response and tool cal
|
|
|
86
86
|
- [Graph Trajectory](#graph-trajectory)
|
|
87
87
|
- [Graph trajectory LLM-as-judge](#graph-trajectory-llm-as-judge)
|
|
88
88
|
- [Graph trajectory strict match](#graph-trajectory-strict-match)
|
|
89
|
-
- [Python Async Support](#python-async-support)
|
|
90
89
|
- [LangSmith Integration](#langsmith-integration)
|
|
91
90
|
- [Pytest or Vitest/Jest](#pytest-or-vitestjest)
|
|
92
91
|
- [Evaluate](#evaluate)
|
|
@@ -106,7 +105,7 @@ npm install openai
|
|
|
106
105
|
```
|
|
107
106
|
|
|
108
107
|
It is also helpful to be familiar with some [evaluation concepts](https://docs.smith.langchain.com/evaluation/concepts) and
|
|
109
|
-
LangSmith's
|
|
108
|
+
LangSmith's pytest integration for running evals, which is documented [here](https://docs.smith.langchain.com/evaluation/how_to_guides/pytest).
|
|
110
109
|
|
|
111
110
|
## Evaluators
|
|
112
111
|
|
|
@@ -116,21 +115,105 @@ Agent trajectory evaluators are used to judge the trajectory of an agent's execu
|
|
|
116
115
|
These evaluators expect you to format your agent's trajectory as a list of OpenAI format dicts or as a list of LangChain `BaseMessage` classes, and handle message formatting
|
|
117
116
|
under the hood.
|
|
118
117
|
|
|
118
|
+
AgentEvals offers the `create_trajectory_match_evaluator`/`createTrajectoryMatchEvaluator` and `create_async_trajectory_match_evaluator` methods for this task.
|
|
119
|
+
|
|
120
|
+
#### Checking tool call equality
|
|
121
|
+
|
|
122
|
+
When checking equality between tool calls, these matchers will require that all tool call arguments are the same. You can configure this behavior to ignore tool call arguments by setting `tool_args_match_mode="ignore"` (Python) or `toolArgsMatchMode: "ignore"` (JS), or by only checking specific properties within the call using the `tool_args_match_overrides`/`toolArgsMatchOverrides` param.
|
|
123
|
+
|
|
124
|
+
`tool_args_match_overrides`/`toolArgsMatchOverrides` takes a dictionary whose keys are tool names and whose values are either `"exact"`, `"ignore"`, a list of fields within the tool call that must match exactly, or a comparator function that takes two arguments and returns whether they are equal:
|
|
125
|
+
|
|
126
|
+
```python
|
|
127
|
+
ToolArgsMatchMode = Literal["exact", "ignore"]
|
|
128
|
+
|
|
129
|
+
ToolArgsMatchOverrides = dict[str, Union[ToolArgsMatchMode, list[str], Callable[[dict, dict], bool]]]
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
Here's an example that allows case insensitivity for the arguments to a tool named `get_weather`:
|
|
133
|
+
|
|
134
|
+
```ts
|
|
135
|
+
import { createTrajectoryMatchEvaluator } from "agentevals";
|
|
136
|
+
|
|
137
|
+
const outputs = [
|
|
138
|
+
{ role: "user", content: "What is the weather in SF?" },
|
|
139
|
+
{
|
|
140
|
+
role: "assistant",
|
|
141
|
+
tool_calls: [{
|
|
142
|
+
function: {
|
|
143
|
+
name: "get_weather",
|
|
144
|
+
arguments: JSON.stringify({ city: "san francisco" })
|
|
145
|
+
},
|
|
146
|
+
}]
|
|
147
|
+
},
|
|
148
|
+
{ role: "tool", content: "It's 80 degrees and sunny in SF." },
|
|
149
|
+
{ role: "assistant", content: "The weather in SF is 80 degrees and sunny." },
|
|
150
|
+
];
|
|
151
|
+
|
|
152
|
+
const referenceOutputs = [
|
|
153
|
+
{ role: "user", content: "What is the weather in San Francisco?" },
|
|
154
|
+
{
|
|
155
|
+
role: "assistant",
|
|
156
|
+
tool_calls: [{
|
|
157
|
+
function: {
|
|
158
|
+
name: "get_weather",
|
|
159
|
+
arguments: JSON.stringify({ city: "San Francisco" })
|
|
160
|
+
}
|
|
161
|
+
}]
|
|
162
|
+
},
|
|
163
|
+
{ role: "tool", content: "It's 80 degrees and sunny in San Francisco." },
|
|
164
|
+
];
|
|
165
|
+
|
|
166
|
+
const evaluator = createTrajectoryMatchEvaluator({
|
|
167
|
+
trajectoryMatchMode: "strict",
|
|
168
|
+
toolArgsMatchMode: "exact", // Default value
|
|
169
|
+
toolArgsMatchOverrides: {
|
|
170
|
+
get_weather: (x, y) => {
|
|
171
|
+
return typeof x.city === "string" &&
|
|
172
|
+
typeof y.city === "string" &&
|
|
173
|
+
x.city.toLowerCase() === y.city.toLowerCase();
|
|
174
|
+
},
|
|
175
|
+
}
|
|
176
|
+
});
|
|
177
|
+
|
|
178
|
+
const result = await evaluator({
|
|
179
|
+
outputs,
|
|
180
|
+
referenceOutputs,
|
|
181
|
+
});
|
|
182
|
+
|
|
183
|
+
console.log(result);
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
```
|
|
187
|
+
{
|
|
188
|
+
'key': 'trajectory_strict_match',
|
|
189
|
+
'score': true,
|
|
190
|
+
}
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
This flexibility allows you to handle cases where you want looser equality for LLM generated arguments (`"san francisco"` to equal `"San Francisco"`) for only specific tool calls.
|
|
194
|
+
|
|
119
195
|
#### Strict match
|
|
120
196
|
|
|
121
|
-
The `
|
|
122
|
-
in the same order with the same tool calls.
|
|
123
|
-
but requires that the selected tools at each step are the same.
|
|
197
|
+
The `"strict"` `trajectory_match_mode` compares two trajectories and ensures that they contain the same messages
|
|
198
|
+
in the same order with the same tool calls. Note that it does allow for differences in message content:
|
|
124
199
|
|
|
125
200
|
```ts
|
|
126
|
-
import {
|
|
201
|
+
import { createTrajectoryMatchEvaluator } from "agentevals";
|
|
127
202
|
|
|
128
203
|
const outputs = [
|
|
129
204
|
{ role: "user", content: "What is the weather in SF?" },
|
|
130
205
|
{
|
|
131
206
|
role: "assistant",
|
|
132
207
|
tool_calls: [{
|
|
133
|
-
function: {
|
|
208
|
+
function: {
|
|
209
|
+
name: "get_weather",
|
|
210
|
+
arguments: JSON.stringify({ city: "San Francisco" })
|
|
211
|
+
},
|
|
212
|
+
}, {
|
|
213
|
+
function: {
|
|
214
|
+
name: "accuweather_forecast",
|
|
215
|
+
arguments: JSON.stringify({"city": "San Francisco"}),
|
|
216
|
+
},
|
|
134
217
|
}]
|
|
135
218
|
},
|
|
136
219
|
{ role: "tool", content: "It's 80 degrees and sunny in SF." },
|
|
@@ -143,7 +226,11 @@ const referenceOutputs = [
|
|
|
143
226
|
{ role: "tool", content: "It's 80 degrees and sunny in San Francisco." },
|
|
144
227
|
];
|
|
145
228
|
|
|
146
|
-
const
|
|
229
|
+
const evaluator = createTrajectoryMatchEvaluator({
|
|
230
|
+
trajectoryMatchMode: "strict",
|
|
231
|
+
})
|
|
232
|
+
|
|
233
|
+
const result = await evaluator({
|
|
147
234
|
outputs,
|
|
148
235
|
referenceOutputs,
|
|
149
236
|
});
|
|
@@ -153,17 +240,21 @@ console.log(result);
|
|
|
153
240
|
|
|
154
241
|
```
|
|
155
242
|
{
|
|
156
|
-
'key': '
|
|
157
|
-
'score':
|
|
243
|
+
'key': 'trajectory_strict_match',
|
|
244
|
+
'score': false,
|
|
158
245
|
}
|
|
159
246
|
```
|
|
160
247
|
|
|
248
|
+
`"strict"` is useful if you want to ensure that tools are always called in the same order for a given query (e.g. a company policy lookup tool before a tool that requests vacation time for an employee).
|
|
249
|
+
|
|
250
|
+
**Note:** If you would like to configure the way this evaluator checks for tool call equality, see [this section](#checking-tool-call-equality).
|
|
251
|
+
|
|
161
252
|
#### Unordered match
|
|
162
253
|
|
|
163
|
-
The `
|
|
254
|
+
The `"unordered"` `trajectory_match_mode` compares two trajectories and ensures that they contain the same tool calls in any order. This is useful if you want to allow flexibility in how an agent obtains the proper information, but still do care that all information was retrieved.
|
|
164
255
|
|
|
165
256
|
```ts
|
|
166
|
-
import {
|
|
257
|
+
import { createTrajectoryMatchEvaluator } from "agentevals";
|
|
167
258
|
|
|
168
259
|
const outputs = [
|
|
169
260
|
{ role: "user", content: "What is the weather in SF and is there anything fun happening?" },
|
|
@@ -214,7 +305,11 @@ const referenceOutputs = [
|
|
|
214
305
|
{ role: "assistant", content: "In SF, it's 80˚ and sunny, but there is nothing fun happening." },
|
|
215
306
|
];
|
|
216
307
|
|
|
217
|
-
const
|
|
308
|
+
const evaluator = createTrajectoryMatchEvaluator({
|
|
309
|
+
trajectoryMatchMode: "unordered",
|
|
310
|
+
});
|
|
311
|
+
|
|
312
|
+
const result = await evaluator({
|
|
218
313
|
outputs,
|
|
219
314
|
referenceOutputs,
|
|
220
315
|
});
|
|
@@ -229,13 +324,16 @@ console.log(result)
|
|
|
229
324
|
}
|
|
230
325
|
```
|
|
231
326
|
|
|
327
|
+
`"unordered"` is useful if you want to ensure that specific tools are called at some point in the trajectory, but you don't necessarily need them to be in message order (e.g. the agent called a company policy retrieval tool at an arbitrary point in an interaction before authorizing spend for a pizza party).
|
|
328
|
+
|
|
329
|
+
**Note:** If you would like to configure the way this evaluator checks for tool call equality, see [this section](#checking-tool-call-equality).
|
|
330
|
+
|
|
232
331
|
#### Subset and superset match
|
|
233
332
|
|
|
234
|
-
|
|
333
|
+
The `"subset"` and `"superset"` modes match partial trajectories (ensuring that a trajectory contains a subset/superset of tool calls contained in a reference trajectory).
|
|
235
334
|
|
|
236
335
|
```ts
|
|
237
|
-
import {
|
|
238
|
-
// import { trajectorySuperset } from "agentevals";
|
|
336
|
+
import { createTrajectoryMatchEvaluator } from "agentevals";
|
|
239
337
|
|
|
240
338
|
const outputs = [
|
|
241
339
|
{ role: "user", content: "What is the weather in SF and London?" },
|
|
@@ -246,9 +344,15 @@ const outputs = [
|
|
|
246
344
|
name: "get_weather",
|
|
247
345
|
arguments: JSON.stringify({ city: "SF and London" }),
|
|
248
346
|
}
|
|
347
|
+
}, {
|
|
348
|
+
"function": {
|
|
349
|
+
name: "accuweather_forecast",
|
|
350
|
+
arguments: JSON.stringify({"city": "SF and London"}),
|
|
351
|
+
}
|
|
249
352
|
}],
|
|
250
353
|
},
|
|
251
354
|
{ role: "tool", content: "It's 80 degrees and sunny in SF, and 90 degrees and rainy in London." },
|
|
355
|
+
{ role: "tool", content: "Unknown." },
|
|
252
356
|
{ role: "assistant", content: "The weather in SF is 80 degrees and sunny. In London, it's 90 degrees and rainy."},
|
|
253
357
|
];
|
|
254
358
|
|
|
@@ -260,23 +364,20 @@ const referenceOutputs = [
|
|
|
260
364
|
{
|
|
261
365
|
function: {
|
|
262
366
|
name: "get_weather",
|
|
263
|
-
arguments: JSON.stringify({ city: "
|
|
264
|
-
}
|
|
265
|
-
},
|
|
266
|
-
{
|
|
267
|
-
function: {
|
|
268
|
-
name: "get_weather",
|
|
269
|
-
arguments: JSON.stringify({ city: "London" }),
|
|
367
|
+
arguments: JSON.stringify({ city: "SF and London" }),
|
|
270
368
|
}
|
|
271
369
|
},
|
|
272
370
|
],
|
|
273
371
|
},
|
|
274
|
-
{ role: "tool", content: "It's 80 degrees and sunny in San Francisco." },
|
|
275
|
-
{ role: "tool", content: "It's 90 degrees and rainy in London." },
|
|
372
|
+
{ role: "tool", content: "It's 80 degrees and sunny in San Francisco, and 90 degrees and rainy in London." },
|
|
276
373
|
{ role: "assistant", content: "The weather in SF is 80˚ and sunny. In London, it's 90˚ and rainy." },
|
|
277
374
|
];
|
|
278
375
|
|
|
279
|
-
const
|
|
376
|
+
const evaluator = createTrajectoryMatchEvaluator({
|
|
377
|
+
trajectoryMatchMode: "superset", // or "subset"
|
|
378
|
+
});
|
|
379
|
+
|
|
380
|
+
const result = await evaluator({
|
|
280
381
|
outputs,
|
|
281
382
|
referenceOutputs,
|
|
282
383
|
});
|
|
@@ -286,11 +387,15 @@ console.log(result)
|
|
|
286
387
|
|
|
287
388
|
```
|
|
288
389
|
{
|
|
289
|
-
'key': '
|
|
390
|
+
'key': 'trajectory_superset_match',
|
|
290
391
|
'score': true,
|
|
291
392
|
}
|
|
292
393
|
```
|
|
293
394
|
|
|
395
|
+
`"superset"` is useful if you want to ensure that some key tools were called at some point in the trajectory, but an agent calling extra tools is still acceptable. `"subset"` is the inverse and is useful if you want to ensure that the agent did not call any tools beyond the expected ones.
|
|
396
|
+
|
|
397
|
+
**Note:** If you would like to configure the way this evaluator checks for tool call equality, see [this section](#checking-tool-call-equality).
|
|
398
|
+
|
|
294
399
|
#### Trajectory LLM-as-judge
|
|
295
400
|
|
|
296
401
|
The LLM-as-judge trajectory evaluator uses an LLM to evaluate the trajectory. Unlike the other trajectory evaluators, it doesn't require a reference trajectory,
|
|
@@ -514,7 +619,7 @@ console.log(res);
|
|
|
514
619
|
}
|
|
515
620
|
```
|
|
516
621
|
|
|
517
|
-
Note that though this evaluator takes the typical `inputs`, `outputs`, and `
|
|
622
|
+
Note that though this evaluator takes the typical `inputs`, `outputs`, and `reference_outputs` parameters, it internally combines `inputs` and `outputs` to form a `thread`. Therefore, if you want to customize the prompt, your prompt should also contain a `thread` input variable:
|
|
518
623
|
|
|
519
624
|
```ts
|
|
520
625
|
const CUSTOM_PROMPT = `You are an expert data labeler.
|
|
@@ -546,18 +651,18 @@ const graphTrajectoryEvaluator = createGraphTrajectoryLLMAsJudge({
|
|
|
546
651
|
model: "openai:o3-mini",
|
|
547
652
|
})
|
|
548
653
|
res = await graphTrajectoryEvaluator(
|
|
549
|
-
inputs
|
|
550
|
-
outputs
|
|
654
|
+
inputs: extractedTrajectory.inputs,
|
|
655
|
+
outputs: extractedTrajectory.outputs,
|
|
551
656
|
)
|
|
552
657
|
```
|
|
553
658
|
|
|
554
|
-
In order to format them properly into the prompt, `
|
|
659
|
+
In order to format them properly into the prompt, `reference_outputs` should be passed in as a `GraphTrajectory` object like `outputs`.
|
|
555
660
|
|
|
556
661
|
Also note that like other LLM-as-judge evaluators, you can pass extra kwargs into the evaluator to format them into the prompt.
|
|
557
662
|
|
|
558
663
|
#### Graph trajectory strict match
|
|
559
664
|
|
|
560
|
-
The `
|
|
665
|
+
The `graph_trajectory_strict_match` evaluator is a simple evaluator that checks if the steps in the provided graph trajectory match the reference trajectory exactly.
|
|
561
666
|
|
|
562
667
|
```ts
|
|
563
668
|
import { tool } from "@langchain/core/tools";
|
|
@@ -626,23 +731,24 @@ console.log(result);
|
|
|
626
731
|
'score': True,
|
|
627
732
|
}
|
|
628
733
|
```
|
|
734
|
+
|
|
629
735
|
## LangSmith Integration
|
|
630
736
|
|
|
631
737
|
For tracking experiments over time, you can log evaluator results to [LangSmith](https://smith.langchain.com/), a platform for building production-grade LLM applications that includes tracing, evaluation, and experimentation tools.
|
|
632
738
|
|
|
633
|
-
LangSmith currently offers two ways to run evals. We'll give a quick example of how to run evals using both.
|
|
739
|
+
LangSmith currently offers two ways to run evals: a [pytest](https://docs.smith.langchain.com/evaluation/how_to_guides/pytest) (Python) or [Vitest/Jest](https://docs.smith.langchain.com/evaluation/how_to_guides/vitest_jest) integration and the `evaluate` function. We'll give a quick example of how to run evals using both.
|
|
634
740
|
|
|
635
741
|
### Pytest or Vitest/Jest
|
|
636
742
|
|
|
637
|
-
First, follow [these instructions](https://docs.smith.langchain.com/evaluation/how_to_guides/
|
|
743
|
+
First, follow [these instructions](https://docs.smith.langchain.com/evaluation/how_to_guides/pytest) to set up LangSmith's pytest runner, or these to set up [Vitest or Jest](https://docs.smith.langchain.com/evaluation/how_to_guides/vitest_jest),
|
|
638
744
|
setting appropriate environment variables:
|
|
639
745
|
|
|
746
|
+
|
|
640
747
|
```bash
|
|
641
748
|
export LANGSMITH_API_KEY="your_langsmith_api_key"
|
|
642
749
|
export LANGSMITH_TRACING="true"
|
|
643
750
|
```
|
|
644
751
|
|
|
645
|
-
|
|
646
752
|
Then, set up a file named `test_trajectory.eval.ts` with the following contents:
|
|
647
753
|
|
|
648
754
|
```ts
|
|
@@ -717,7 +823,6 @@ Now, run the eval with your runner of choice:
|
|
|
717
823
|
vitest run test_trajectory.eval.ts
|
|
718
824
|
```
|
|
719
825
|
|
|
720
|
-
|
|
721
826
|
Feedback from the prebuilt evaluator will be automatically logged in LangSmith as a table of results like this in your terminal:
|
|
722
827
|
|
|
723
828
|

|
package/dist/index.cjs
CHANGED
|
@@ -14,7 +14,7 @@ var __exportStar = (this && this.__exportStar) || function(m, exports) {
|
|
|
14
14
|
for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
|
|
15
15
|
};
|
|
16
16
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
17
|
-
exports.GRAPH_TRAJECTORY_ACCURACY_PROMPT = exports.createGraphTrajectoryLLMAsJudge = exports.TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE = exports.TRAJECTORY_ACCURACY_PROMPT = exports.createTrajectoryLLMAsJudge = exports.trajectoryUnorderedMatch = exports.trajectorySuperset = exports.trajectorySubset = exports.trajectoryStrictMatch = void 0;
|
|
17
|
+
exports.GRAPH_TRAJECTORY_ACCURACY_PROMPT = exports.createGraphTrajectoryLLMAsJudge = exports.TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE = exports.TRAJECTORY_ACCURACY_PROMPT = exports.createTrajectoryLLMAsJudge = exports.createTrajectoryMatchEvaluator = exports.trajectoryUnorderedMatch = exports.trajectorySuperset = exports.trajectorySubset = exports.trajectoryStrictMatch = void 0;
|
|
18
18
|
var strict_js_1 = require("./trajectory/strict.cjs");
|
|
19
19
|
Object.defineProperty(exports, "trajectoryStrictMatch", { enumerable: true, get: function () { return strict_js_1.trajectoryStrictMatch; } });
|
|
20
20
|
var subset_js_1 = require("./trajectory/subset.cjs");
|
|
@@ -23,6 +23,8 @@ var superset_js_1 = require("./trajectory/superset.cjs");
|
|
|
23
23
|
Object.defineProperty(exports, "trajectorySuperset", { enumerable: true, get: function () { return superset_js_1.trajectorySuperset; } });
|
|
24
24
|
var unordered_js_1 = require("./trajectory/unordered.cjs");
|
|
25
25
|
Object.defineProperty(exports, "trajectoryUnorderedMatch", { enumerable: true, get: function () { return unordered_js_1.trajectoryUnorderedMatch; } });
|
|
26
|
+
var match_js_1 = require("./trajectory/match.cjs");
|
|
27
|
+
Object.defineProperty(exports, "createTrajectoryMatchEvaluator", { enumerable: true, get: function () { return match_js_1.createTrajectoryMatchEvaluator; } });
|
|
26
28
|
var llm_js_1 = require("./trajectory/llm.cjs");
|
|
27
29
|
Object.defineProperty(exports, "createTrajectoryLLMAsJudge", { enumerable: true, get: function () { return llm_js_1.createTrajectoryLLMAsJudge; } });
|
|
28
30
|
Object.defineProperty(exports, "TRAJECTORY_ACCURACY_PROMPT", { enumerable: true, get: function () { return llm_js_1.TRAJECTORY_ACCURACY_PROMPT; } });
|
package/dist/index.d.ts
CHANGED
|
@@ -2,6 +2,7 @@ export { trajectoryStrictMatch } from "./trajectory/strict.js";
|
|
|
2
2
|
export { trajectorySubset } from "./trajectory/subset.js";
|
|
3
3
|
export { trajectorySuperset } from "./trajectory/superset.js";
|
|
4
4
|
export { trajectoryUnorderedMatch } from "./trajectory/unordered.js";
|
|
5
|
+
export { createTrajectoryMatchEvaluator, type TrajectoryMatchMode, } from "./trajectory/match.js";
|
|
5
6
|
export { createTrajectoryLLMAsJudge, TRAJECTORY_ACCURACY_PROMPT, TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE, } from "./trajectory/llm.js";
|
|
6
7
|
export { createGraphTrajectoryLLMAsJudge, GRAPH_TRAJECTORY_ACCURACY_PROMPT, } from "./graph_trajectory/llm.js";
|
|
7
8
|
export * from "./types.js";
|
package/dist/index.js
CHANGED
|
@@ -2,6 +2,7 @@ export { trajectoryStrictMatch } from "./trajectory/strict.js";
|
|
|
2
2
|
export { trajectorySubset } from "./trajectory/subset.js";
|
|
3
3
|
export { trajectorySuperset } from "./trajectory/superset.js";
|
|
4
4
|
export { trajectoryUnorderedMatch } from "./trajectory/unordered.js";
|
|
5
|
+
export { createTrajectoryMatchEvaluator, } from "./trajectory/match.js";
|
|
5
6
|
export { createTrajectoryLLMAsJudge, TRAJECTORY_ACCURACY_PROMPT, TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE, } from "./trajectory/llm.js";
|
|
6
7
|
export { createGraphTrajectoryLLMAsJudge, GRAPH_TRAJECTORY_ACCURACY_PROMPT, } from "./graph_trajectory/llm.js";
|
|
7
8
|
export * from "./types.js";
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.createTrajectoryMatchEvaluator = void 0;
|
|
4
|
+
const utils_js_1 = require("../utils.cjs");
|
|
5
|
+
const strict_js_1 = require("./strict.cjs");
|
|
6
|
+
const unordered_js_1 = require("./unordered.cjs");
|
|
7
|
+
const subset_js_1 = require("./subset.cjs");
|
|
8
|
+
const superset_js_1 = require("./superset.cjs");
|
|
9
|
+
/**
|
|
10
|
+
* Creates an evaluator that compares trajectories between model outputs and reference outputs.
|
|
11
|
+
*
|
|
12
|
+
* @param options - The configuration options
|
|
13
|
+
* @param options.trajectoryMatchMode - The mode for matching trajectories:
|
|
14
|
+
* - `"strict"`: Requires the same messages in the same order with the same tool calls (message content may differ)
|
|
15
|
+
* - `"unordered"`: Allows matching in any order
|
|
16
|
+
* - `"subset"`: Accepts if output trajectory is a subset of reference
|
|
17
|
+
* - `"superset"`: Accepts if output trajectory is a superset of reference
|
|
18
|
+
* @param options.toolArgsMatchMode - Mode for matching tool arguments ("exact" by default, can be "ignore")
|
|
19
|
+
* @param options.toolArgsMatchOverrides - Object containing custom overrides for tool argument matching.
|
|
20
|
+
* Each key should be a tool name, and each value should be either a match mode or a matcher function.
|
|
21
|
+
* Matchers should be a function that takes two sets of tool call args and returns whether they are equal.
|
|
22
|
+
*
|
|
23
|
+
* @returns An async function that evaluates trajectory matches between outputs and references.
|
|
24
|
+
* The returned evaluator accepts:
|
|
25
|
+
* - outputs: List of messages or dict representing the model output trajectory
|
|
26
|
+
* - referenceOutputs: List of messages or dict representing the reference trajectory
|
|
27
|
+
* - Additional arguments passed to the underlying evaluator
|
|
28
|
+
*
|
|
29
|
+
* @example
|
|
30
|
+
* ```typescript
|
|
31
|
+
* const matcher = (
|
|
32
|
+
* outputToolCallArgs: Record<string, any>,
|
|
33
|
+
* referenceToolCallArgs: Record<string, any>
|
|
34
|
+
* ): boolean => {
|
|
35
|
+
* const outputArgs = (outputToolCallArgs.query ?? "").toLowerCase();
|
|
36
|
+
* const referenceArgs = (referenceToolCallArgs.query ?? "").toLowerCase();
|
|
37
|
+
* return outputArgs === referenceArgs;
|
|
38
|
+
* };
|
|
39
|
+
*
|
|
40
|
+
* const evaluator = createTrajectoryMatchEvaluator({
|
|
41
|
+
* trajectoryMatchMode: "strict",
|
|
42
|
+
* toolArgsMatchMode: "exact",
|
|
43
|
+
* toolArgsMatchOverrides: {
|
|
44
|
+
* myToolName: matcher,
|
|
45
|
+
* },
|
|
46
|
+
* });
|
|
47
|
+
*
|
|
48
|
+
* const result = await evaluator({
|
|
49
|
+
* outputs: [...],
|
|
50
|
+
* referenceOutputs: [...],
|
|
51
|
+
* });
|
|
52
|
+
* ```
|
|
53
|
+
*/
|
|
54
|
+
function createTrajectoryMatchEvaluator({ trajectoryMatchMode = "strict", toolArgsMatchMode = "exact", toolArgsMatchOverrides, }) {
|
|
55
|
+
let scorer;
|
|
56
|
+
switch (trajectoryMatchMode) {
|
|
57
|
+
case "strict":
|
|
58
|
+
scorer = strict_js_1._scorer;
|
|
59
|
+
break;
|
|
60
|
+
case "unordered":
|
|
61
|
+
scorer = unordered_js_1._scorer;
|
|
62
|
+
break;
|
|
63
|
+
case "subset":
|
|
64
|
+
scorer = subset_js_1._scorer;
|
|
65
|
+
break;
|
|
66
|
+
case "superset":
|
|
67
|
+
scorer = superset_js_1._scorer;
|
|
68
|
+
break;
|
|
69
|
+
default:
|
|
70
|
+
throw new Error(`Invalid trajectory match type: ${trajectoryMatchMode}`);
|
|
71
|
+
}
|
|
72
|
+
return async function _wrappedEvaluator({ outputs, referenceOutputs, ...extra }) {
|
|
73
|
+
const normalizedOutputs = (0, utils_js_1._normalizeToOpenAIMessagesList)(outputs);
|
|
74
|
+
const normalizedReferenceOutputs = (0, utils_js_1._normalizeToOpenAIMessagesList)(referenceOutputs);
|
|
75
|
+
return (0, utils_js_1._runEvaluator)(`trajectory_${trajectoryMatchMode}_match`, scorer, `trajectory_${trajectoryMatchMode}_match`, {
|
|
76
|
+
outputs: normalizedOutputs,
|
|
77
|
+
referenceOutputs: normalizedReferenceOutputs,
|
|
78
|
+
toolArgsMatchMode,
|
|
79
|
+
toolArgsMatchOverrides,
|
|
80
|
+
...extra,
|
|
81
|
+
});
|
|
82
|
+
};
|
|
83
|
+
}
|
|
84
|
+
exports.createTrajectoryMatchEvaluator = createTrajectoryMatchEvaluator;
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
import { BaseMessage } from "@langchain/core/messages";
|
|
2
|
+
import { ChatCompletionMessage, ToolArgsMatchMode, ToolArgsMatchOverrides } from "../types.js";
|
|
3
|
+
export type TrajectoryMatchMode = "strict" | "unordered" | "subset" | "superset";
|
|
4
|
+
/**
|
|
5
|
+
* Creates an evaluator that compares trajectories between model outputs and reference outputs.
|
|
6
|
+
*
|
|
7
|
+
* @param options - The configuration options
|
|
8
|
+
* @param options.trajectoryMatchMode - The mode for matching trajectories:
|
|
9
|
+
* - `"strict"`: Requires the same messages in the same order with the same tool calls (message content may differ)
|
|
10
|
+
* - `"unordered"`: Allows matching in any order
|
|
11
|
+
* - `"subset"`: Accepts if output trajectory is a subset of reference
|
|
12
|
+
* - `"superset"`: Accepts if output trajectory is a superset of reference
|
|
13
|
+
* @param options.toolArgsMatchMode - Mode for matching tool arguments ("exact" by default, can be "ignore")
|
|
14
|
+
* @param options.toolArgsMatchOverrides - Object containing custom overrides for tool argument matching.
|
|
15
|
+
* Each key should be a tool name, and each value should be either a match mode or a matcher function.
|
|
16
|
+
* Matchers should be a function that takes two sets of tool call args and returns whether they are equal.
|
|
17
|
+
*
|
|
18
|
+
* @returns An async function that evaluates trajectory matches between outputs and references.
|
|
19
|
+
* The returned evaluator accepts:
|
|
20
|
+
* - outputs: List of messages or dict representing the model output trajectory
|
|
21
|
+
* - referenceOutputs: List of messages or dict representing the reference trajectory
|
|
22
|
+
* - Additional arguments passed to the underlying evaluator
|
|
23
|
+
*
|
|
24
|
+
* @example
|
|
25
|
+
* ```typescript
|
|
26
|
+
* const matcher = (
|
|
27
|
+
* outputToolCallArgs: Record<string, any>,
|
|
28
|
+
* referenceToolCallArgs: Record<string, any>
|
|
29
|
+
* ): boolean => {
|
|
30
|
+
* const outputArgs = (outputToolCallArgs.query ?? "").toLowerCase();
|
|
31
|
+
* const referenceArgs = (referenceToolCallArgs.query ?? "").toLowerCase();
|
|
32
|
+
* return outputArgs === referenceArgs;
|
|
33
|
+
* };
|
|
34
|
+
*
|
|
35
|
+
* const evaluator = createTrajectoryMatchEvaluator({
|
|
36
|
+
* trajectoryMatchMode: "strict",
|
|
37
|
+
* toolArgsMatchMode: "exact",
|
|
38
|
+
* toolArgsMatchOverrides: {
|
|
39
|
+
* myToolName: matcher,
|
|
40
|
+
* },
|
|
41
|
+
* });
|
|
42
|
+
*
|
|
43
|
+
* const result = await evaluator({
|
|
44
|
+
* outputs: [...],
|
|
45
|
+
* referenceOutputs: [...],
|
|
46
|
+
* });
|
|
47
|
+
* ```
|
|
48
|
+
*/
|
|
49
|
+
export declare function createTrajectoryMatchEvaluator({ trajectoryMatchMode, toolArgsMatchMode, toolArgsMatchOverrides, }: {
|
|
50
|
+
trajectoryMatchMode?: TrajectoryMatchMode;
|
|
51
|
+
toolArgsMatchMode?: ToolArgsMatchMode;
|
|
52
|
+
toolArgsMatchOverrides?: ToolArgsMatchOverrides;
|
|
53
|
+
}): ({ outputs, referenceOutputs, ...extra }: {
|
|
54
|
+
[key: string]: unknown;
|
|
55
|
+
outputs: ChatCompletionMessage[] | BaseMessage[] | {
|
|
56
|
+
messages: (BaseMessage | ChatCompletionMessage)[];
|
|
57
|
+
};
|
|
58
|
+
referenceOutputs: ChatCompletionMessage[] | BaseMessage[] | {
|
|
59
|
+
messages: (BaseMessage | ChatCompletionMessage)[];
|
|
60
|
+
};
|
|
61
|
+
}) => Promise<import("langsmith/vitest").SimpleEvaluationResult>;
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
import { _normalizeToOpenAIMessagesList, _runEvaluator } from "../utils.js";
|
|
2
|
+
import { _scorer as trajectoryStrictScorer } from "./strict.js";
|
|
3
|
+
import { _scorer as trajectoryUnorderedScorer } from "./unordered.js";
|
|
4
|
+
import { _scorer as trajectorySubsetScorer } from "./subset.js";
|
|
5
|
+
import { _scorer as trajectorySuperstScorer } from "./superset.js";
|
|
6
|
+
/**
|
|
7
|
+
* Creates an evaluator that compares trajectories between model outputs and reference outputs.
|
|
8
|
+
*
|
|
9
|
+
* @param options - The configuration options
|
|
10
|
+
* @param options.trajectoryMatchMode - The mode for matching trajectories:
|
|
11
|
+
* - `"strict"`: Requires the same messages in the same order with the same tool calls (message content may differ)
|
|
12
|
+
* - `"unordered"`: Allows matching in any order
|
|
13
|
+
* - `"subset"`: Accepts if output trajectory is a subset of reference
|
|
14
|
+
* - `"superset"`: Accepts if output trajectory is a superset of reference
|
|
15
|
+
* @param options.toolArgsMatchMode - Mode for matching tool arguments ("exact" by default, can be "ignore")
|
|
16
|
+
* @param options.toolArgsMatchOverrides - Object containing custom overrides for tool argument matching.
|
|
17
|
+
* Each key should be a tool name, and each value should be either a match mode or a matcher function.
|
|
18
|
+
* Matchers should be a function that takes two sets of tool call args and returns whether they are equal.
|
|
19
|
+
*
|
|
20
|
+
* @returns An async function that evaluates trajectory matches between outputs and references.
|
|
21
|
+
* The returned evaluator accepts:
|
|
22
|
+
* - outputs: List of messages or dict representing the model output trajectory
|
|
23
|
+
* - referenceOutputs: List of messages or dict representing the reference trajectory
|
|
24
|
+
* - Additional arguments passed to the underlying evaluator
|
|
25
|
+
*
|
|
26
|
+
* @example
|
|
27
|
+
* ```typescript
|
|
28
|
+
* const matcher = (
|
|
29
|
+
* outputToolCallArgs: Record<string, any>,
|
|
30
|
+
* referenceToolCallArgs: Record<string, any>
|
|
31
|
+
* ): boolean => {
|
|
32
|
+
* const outputArgs = (outputToolCallArgs.query ?? "").toLowerCase();
|
|
33
|
+
* const referenceArgs = (referenceToolCallArgs.query ?? "").toLowerCase();
|
|
34
|
+
* return outputArgs === referenceArgs;
|
|
35
|
+
* };
|
|
36
|
+
*
|
|
37
|
+
* const evaluator = createTrajectoryMatchEvaluator({
|
|
38
|
+
* trajectoryMatchMode: "strict",
|
|
39
|
+
* toolArgsMatchMode: "exact",
|
|
40
|
+
* toolArgsMatchOverrides: {
|
|
41
|
+
* myToolName: matcher,
|
|
42
|
+
* },
|
|
43
|
+
* });
|
|
44
|
+
*
|
|
45
|
+
* const result = await evaluator({
|
|
46
|
+
* outputs: [...],
|
|
47
|
+
* referenceOutputs: [...],
|
|
48
|
+
* });
|
|
49
|
+
* ```
|
|
50
|
+
*/
|
|
51
|
+
export function createTrajectoryMatchEvaluator({ trajectoryMatchMode = "strict", toolArgsMatchMode = "exact", toolArgsMatchOverrides, }) {
|
|
52
|
+
let scorer;
|
|
53
|
+
switch (trajectoryMatchMode) {
|
|
54
|
+
case "strict":
|
|
55
|
+
scorer = trajectoryStrictScorer;
|
|
56
|
+
break;
|
|
57
|
+
case "unordered":
|
|
58
|
+
scorer = trajectoryUnorderedScorer;
|
|
59
|
+
break;
|
|
60
|
+
case "subset":
|
|
61
|
+
scorer = trajectorySubsetScorer;
|
|
62
|
+
break;
|
|
63
|
+
case "superset":
|
|
64
|
+
scorer = trajectorySuperstScorer;
|
|
65
|
+
break;
|
|
66
|
+
default:
|
|
67
|
+
throw new Error(`Invalid trajectory match type: ${trajectoryMatchMode}`);
|
|
68
|
+
}
|
|
69
|
+
return async function _wrappedEvaluator({ outputs, referenceOutputs, ...extra }) {
|
|
70
|
+
const normalizedOutputs = _normalizeToOpenAIMessagesList(outputs);
|
|
71
|
+
const normalizedReferenceOutputs = _normalizeToOpenAIMessagesList(referenceOutputs);
|
|
72
|
+
return _runEvaluator(`trajectory_${trajectoryMatchMode}_match`, scorer, `trajectory_${trajectoryMatchMode}_match`, {
|
|
73
|
+
outputs: normalizedOutputs,
|
|
74
|
+
referenceOutputs: normalizedReferenceOutputs,
|
|
75
|
+
toolArgsMatchMode,
|
|
76
|
+
toolArgsMatchOverrides,
|
|
77
|
+
...extra,
|
|
78
|
+
});
|
|
79
|
+
};
|
|
80
|
+
}
|