agentevals 0.0.5 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -28,6 +28,7 @@ Once you've done this, you can run your first trajectory evaluator. We represent
28
28
  ```ts
29
29
  import {
30
30
  createTrajectoryLLMAsJudge,
31
+ type FlexibleChatCompletionMessage,
31
32
  TRAJECTORY_ACCURACY_PROMPT,
32
33
  } from "agentevals";
33
34
 
@@ -55,7 +56,7 @@ const outputs = [
55
56
  role: "assistant",
56
57
  content: "The weather in SF is 80 degrees and sunny.",
57
58
  },
58
- ];
59
+ ] satisfies FlexibleChatCompletionMessage[];
59
60
 
60
61
  const evalResult = await trajectoryEvaluator({
61
62
  outputs,
@@ -130,7 +131,10 @@ The `"strict"` `trajectory_match_mode` compares two trajectories and ensures tha
130
131
  in the same order with the same tool calls. Note that it does allow for differences in message content:
131
132
 
132
133
  ```ts
133
- import { createTrajectoryMatchEvaluator } from "agentevals";
134
+ import {
135
+ createTrajectoryMatchEvaluator,
136
+ type FlexibleChatCompletionMessage,
137
+ } from "agentevals";
134
138
 
135
139
  const outputs = [
136
140
  { role: "user", content: "What is the weather in SF?" },
@@ -151,7 +155,7 @@ const outputs = [
151
155
  },
152
156
  { role: "tool", content: "It's 80 degrees and sunny in SF." },
153
157
  { role: "assistant", content: "The weather in SF is 80 degrees and sunny." },
154
- ];
158
+ ] satisfies FlexibleChatCompletionMessage[];
155
159
 
156
160
  const referenceOutputs = [
157
161
  { role: "user", content: "What is the weather in San Francisco?" },
@@ -166,7 +170,7 @@ const referenceOutputs = [
166
170
  }]
167
171
  },
168
172
  { role: "tool", content: "It's 80 degrees and sunny in San Francisco." },
169
- ];
173
+ ] satisfies FlexibleChatCompletionMessage[];
170
174
 
171
175
  const evaluator = createTrajectoryMatchEvaluator({
172
176
  trajectoryMatchMode: "strict",
@@ -196,7 +200,10 @@ console.log(result);
196
200
  The `"unordered"` `trajectory_match_mode` compares two trajectories and ensures that they contain the same tool calls in any order. This is useful if you want to allow flexibility in how an agent obtains the proper information, but still do care that all information was retrieved.
197
201
 
198
202
  ```ts
199
- import { createTrajectoryMatchEvaluator } from "agentevals";
203
+ import {
204
+ createTrajectoryMatchEvaluator,
205
+ type FlexibleChatCompletionMessage,
206
+ } from "agentevals";
200
207
 
201
208
  const outputs = [
202
209
  { role: "user", content: "What is the weather in SF and is there anything fun happening?" },
@@ -223,7 +230,7 @@ const outputs = [
223
230
  },
224
231
  { role: "tool", content: "Nothing fun is happening, you should stay indoors and read!" },
225
232
  { role: "assistant", content: "The weather in SF is 80 degrees and sunny, but there is nothing fun happening." },
226
- ];
233
+ ] satisfies FlexibleChatCompletionMessage[];
227
234
 
228
235
  const referenceOutputs = [
229
236
  { role: "user", content: "What is the weather in SF and is there anything fun happening?" },
@@ -248,7 +255,7 @@ const referenceOutputs = [
248
255
  { role: "tool", content: "Nothing fun is happening, you should stay indoors and read!" },
249
256
  { role: "tool", content: "It's 80 degrees and sunny in SF." },
250
257
  { role: "assistant", content: "In SF, it's 80˚ and sunny, but there is nothing fun happening." },
251
- ];
258
+ ] satisfies FlexibleChatCompletionMessage[];
252
259
 
253
260
  const evaluator = createTrajectoryMatchEvaluator({
254
261
  trajectoryMatchMode: "unordered",
@@ -278,7 +285,10 @@ console.log(result)
278
285
  The `"subset"` and `"superset"` modes match partial trajectories (ensuring that a trajectory contains a subset/superset of tool calls contained in a reference trajectory).
279
286
 
280
287
  ```ts
281
- import { createTrajectoryMatchEvaluator } from "agentevals";
288
+ import {
289
+ createTrajectoryMatchEvaluator,
290
+ type FlexibleChatCompletionMessage
291
+ } from "agentevals";
282
292
 
283
293
  const outputs = [
284
294
  { role: "user", content: "What is the weather in SF and London?" },
@@ -300,7 +310,7 @@ const outputs = [
300
310
  { role: "tool", content: "It's 80 degrees and sunny in SF, and 90 degrees and rainy in London." },
301
311
  { role: "tool", content: "Unknown." },
302
312
  { role: "assistant", content: "The weather in SF is 80 degrees and sunny. In London, it's 90 degrees and rainy."},
303
- ];
313
+ ] satisfies FlexibleChatCompletionMessage[];
304
314
 
305
315
  const referenceOutputs = [
306
316
  { role: "user", content: "What is the weather in SF and London?" },
@@ -318,7 +328,7 @@ const referenceOutputs = [
318
328
  },
319
329
  { role: "tool", content: "It's 80 degrees and sunny in San Francisco, and 90 degrees and rainy in London." },
320
330
  { role: "assistant", content: "The weather in SF is 80˚ and sunny. In London, it's 90˚ and rainy." },
321
- ];
331
+ ] satisfies FlexibleChatCompletionMessage[];
322
332
 
323
333
  const evaluator = createTrajectoryMatchEvaluator({
324
334
  trajectoryMatchMode: "superset", // or "subset"
@@ -364,7 +374,10 @@ ToolArgsMatchOverrides = dict[str, Union[ToolArgsMatchMode, list[str], Callable
364
374
  Here's an example that allows case insensitivity for the arguments to a tool named `get_weather`:
365
375
 
366
376
  ```ts
367
- import { createTrajectoryMatchEvaluator } from "agentevals";
377
+ import {
378
+ createTrajectoryMatchEvaluator,
379
+ type FlexibleChatCompletionMessage,
380
+ } from "agentevals";
368
381
 
369
382
  const outputs = [
370
383
  { role: "user", content: "What is the weather in SF?" },
@@ -380,7 +393,7 @@ const outputs = [
380
393
  },
381
394
  { role: "tool", content: "It's 80 degrees and sunny in SF." },
382
395
  { role: "assistant", content: "The weather in SF is 80 degrees and sunny." },
383
- ];
396
+ ] satisfies FlexibleChatCompletionMessage[];
384
397
 
385
398
  const referenceOutputs = [
386
399
  { role: "user", content: "What is the weather in San Francisco?" },
@@ -395,7 +408,7 @@ const referenceOutputs = [
395
408
  }]
396
409
  },
397
410
  { role: "tool", content: "It's 80 degrees and sunny in San Francisco." },
398
- ];
411
+ ] satisfies FlexibleChatCompletionMessage[];
399
412
 
400
413
  const evaluator = createTrajectoryMatchEvaluator({
401
414
  trajectoryMatchMode: "strict",
@@ -434,6 +447,7 @@ The LLM-as-judge trajectory evaluator that uses an LLM to evaluate the trajector
434
447
  import {
435
448
  createTrajectoryLLMAsJudge,
436
449
  TRAJECTORY_ACCURACY_PROMPT,
450
+ type FlexibleChatCompletionMessage,
437
451
  } from "agentevals";
438
452
 
439
453
  const evaluator = createTrajectoryLLMAsJudge({
@@ -457,7 +471,7 @@ const outputs = [
457
471
  },
458
472
  {role: "tool", content: "It's 80 degrees and sunny in SF."},
459
473
  {role: "assistant", content: "The weather in SF is 80 degrees and sunny."},
460
- ];
474
+ ] satisfies FlexibleChatCompletionMessage[];
461
475
 
462
476
  const result = await evaluator({ outputs });
463
477
 
@@ -477,7 +491,8 @@ If you have a reference trajectory, you can add an extra variable to your prompt
477
491
  ```ts
478
492
  import {
479
493
  createTrajectoryLLMAsJudge,
480
- TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE
494
+ TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE,
495
+ type FlexibleChatCompletionMessage,
481
496
  } from "agentevals";
482
497
 
483
498
  const evaluator = createTrajectoryLLMAsJudge({
@@ -501,7 +516,8 @@ const outputs = [
501
516
  },
502
517
  {role: "tool", content: "It's 80 degrees and sunny in SF."},
503
518
  {role: "assistant", content: "The weather in SF is 80 degrees and sunny."},
504
- ]
519
+ ] satisfies FlexibleChatCompletionMessage[];
520
+
505
521
  const referenceOutputs = [
506
522
  {role: "user", content: "What is the weather in SF?"},
507
523
  {
@@ -518,7 +534,7 @@ const referenceOutputs = [
518
534
  },
519
535
  {role: "tool", content: "It's 80 degrees and sunny in San Francisco."},
520
536
  {role: "assistant", content: "The weather in SF is 80˚ and sunny."},
521
- ]
537
+ ] satisfies FlexibleChatCompletionMessage[];
522
538
 
523
539
  const result = await evaluator({
524
540
  outputs,
@@ -677,10 +693,10 @@ const graphTrajectoryEvaluator = createGraphTrajectoryLLMAsJudge({
677
693
  model: "openai:o3-mini",
678
694
  })
679
695
 
680
- const res = await graphTrajectoryEvaluator(
681
- inputs=extractedTrajectory.inputs,
682
- outputs=extractedTrajectory.outputs,
683
- )
696
+ const res = await graphTrajectoryEvaluator({
697
+ inputs: extractedTrajectory.inputs,
698
+ outputs: extractedTrajectory.outputs,
699
+ });
684
700
 
685
701
  console.log(res);
686
702
  ```
@@ -724,10 +740,10 @@ const graphTrajectoryEvaluator = createGraphTrajectoryLLMAsJudge({
724
740
  prompt: CUSTOM_PROMPT,
725
741
  model: "openai:o3-mini",
726
742
  })
727
- res = await graphTrajectoryEvaluator(
743
+ const res = await graphTrajectoryEvaluator({
728
744
  inputs: extractedTrajectory.inputs,
729
745
  outputs: extractedTrajectory.outputs,
730
- )
746
+ });
731
747
  ```
732
748
 
733
749
  In order to format them properly into the prompt, `reference_outputs` should be passed in as a `GraphTrajectory` object like `outputs`.
@@ -27,4 +27,4 @@ export declare const createGraphTrajectoryLLMAsJudge: ({ prompt, model, feedback
27
27
  };
28
28
  outputs: GraphTrajectory;
29
29
  referenceOutputs?: GraphTrajectory | undefined;
30
- }) => Promise<import("langsmith/vitest").SimpleEvaluationResult>;
30
+ }) => Promise<import("../types.js").EvaluatorResult>;
@@ -11,4 +11,4 @@ import { GraphTrajectory } from "../types.js";
11
11
  export declare const graphTrajectoryStrictMatch: ({ outputs, referenceOutputs, }: {
12
12
  outputs: GraphTrajectory;
13
13
  referenceOutputs: GraphTrajectory;
14
- }) => Promise<import("langsmith/vitest").SimpleEvaluationResult>;
14
+ }) => Promise<import("../types.js").EvaluatorResult>;
@@ -56,7 +56,14 @@ const extractLangGraphTrajectoryFromSnapshots = (snapshots) => {
56
56
  }
57
57
  if (isAccumulatingSteps) {
58
58
  if (snapshot.metadata != null && snapshot.metadata.source === "input") {
59
- inputs.push(snapshot.metadata.writes);
59
+ if ("writes" in snapshot.metadata &&
60
+ snapshot.metadata.writes != null &&
61
+ typeof snapshot.metadata.writes === "object") {
62
+ inputs.push(snapshot.metadata.writes);
63
+ }
64
+ else {
65
+ inputs.push(...snapshot.tasks.map((task) => ({ [task.name]: task.result })));
66
+ }
60
67
  }
61
68
  else if (i + 1 < snapshots.length &&
62
69
  snapshots[i + 1].tasks?.find((task) => task.interrupts?.length > 0)) {
@@ -2,11 +2,11 @@ import type { StateSnapshot, Pregel } from "@langchain/langgraph/web";
2
2
  import type { RunnableConfig } from "@langchain/core/runnables";
3
3
  import type { GraphTrajectory } from "../types.js";
4
4
  export declare const extractLangGraphTrajectoryFromSnapshots: (snapshots: StateSnapshot[]) => {
5
- inputs: (string | Record<string, unknown> | null)[];
5
+ inputs: (string | Record<string, unknown>)[];
6
6
  outputs: GraphTrajectory;
7
7
  };
8
8
  export declare const _getLangGraphStateHistoryRecursive: (graph: Pregel<any, any>, config: RunnableConfig) => Promise<StateSnapshot[]>;
9
9
  export declare const extractLangGraphTrajectoryFromThread: (graph: Pregel<any, any>, config: RunnableConfig) => Promise<{
10
- inputs: (string | Record<string, unknown> | null)[];
10
+ inputs: (string | Record<string, unknown>)[];
11
11
  outputs: GraphTrajectory;
12
12
  }>;
@@ -53,7 +53,14 @@ export const extractLangGraphTrajectoryFromSnapshots = (snapshots) => {
53
53
  }
54
54
  if (isAccumulatingSteps) {
55
55
  if (snapshot.metadata != null && snapshot.metadata.source === "input") {
56
- inputs.push(snapshot.metadata.writes);
56
+ if ("writes" in snapshot.metadata &&
57
+ snapshot.metadata.writes != null &&
58
+ typeof snapshot.metadata.writes === "object") {
59
+ inputs.push(snapshot.metadata.writes);
60
+ }
61
+ else {
62
+ inputs.push(...snapshot.tasks.map((task) => ({ [task.name]: task.result })));
63
+ }
57
64
  }
58
65
  else if (i + 1 < snapshots.length &&
59
66
  snapshots[i + 1].tasks?.find((task) => task.interrupts?.length > 0)) {
@@ -1,5 +1,5 @@
1
1
  import { BaseMessage } from "@langchain/core/messages";
2
- import { ChatCompletionMessage, EvaluatorResult, TrajectoryLLMAsJudgeParams } from "../types.js";
2
+ import { ChatCompletionMessage, FlexibleChatCompletionMessage, EvaluatorResult, TrajectoryLLMAsJudgeParams } from "../types.js";
3
3
  export declare const TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE = "You are an expert data labeler.\nYour task is to grade the accuracy of an AI agent's internal trajectory.\n\n<Rubric>\n An accurate trajectory:\n - Makes logical sense between steps\n - Shows clear progression\n - Is relatively efficient, though it does not need to be perfectly efficient\n - Is semantically equivalent to the provided reference trajectory\n</Rubric>\n\nBased on the following reference trajectory:\n\n<reference_trajectory>\n{reference_outputs}\n</reference_trajectory>\n\nGrade this actual trajectory:\n\n<trajectory>\n{outputs}\n</trajectory>\n";
4
4
  export declare const TRAJECTORY_ACCURACY_PROMPT = "You are an expert data labeler.\nYour task is to grade the accuracy of an AI agent's internal trajectory.\n\n<Rubric>\n An accurate trajectory:\n - Makes logical sense between steps\n - Shows clear progression\n - Is relatively efficient, though it does not need to be perfectly efficient\n</Rubric>\n\nFirst, try to understand the goal of the trajectory by looking at the input\n(if the input is not present try to infer it from the content of the first message),\nas well as the output of the final message. Once you understand the goal, grade the trajectory\nas it relates to achieving that goal.\n\nGrade the following trajectory:\n\n<trajectory>\n{outputs}\n</trajectory>";
5
5
  /**
@@ -25,10 +25,10 @@ export declare const TRAJECTORY_ACCURACY_PROMPT = "You are an expert data labele
25
25
  */
26
26
  export declare const createTrajectoryLLMAsJudge: ({ prompt, feedbackKey, model, system, judge, continuous, choices, useReasoning, fewShotExamples, }: TrajectoryLLMAsJudgeParams) => ({ inputs, outputs, referenceOutputs, ...extra }: {
27
27
  [key: string]: unknown;
28
- outputs: ChatCompletionMessage[] | BaseMessage[] | {
29
- messages: (BaseMessage | ChatCompletionMessage)[];
28
+ outputs: ChatCompletionMessage[] | FlexibleChatCompletionMessage[] | BaseMessage[] | {
29
+ messages: (BaseMessage | ChatCompletionMessage | FlexibleChatCompletionMessage)[];
30
30
  };
31
- referenceOutputs?: BaseMessage[] | ChatCompletionMessage[] | {
32
- messages: (BaseMessage | ChatCompletionMessage)[];
31
+ referenceOutputs?: ChatCompletionMessage[] | BaseMessage[] | FlexibleChatCompletionMessage[] | {
32
+ messages: (BaseMessage | ChatCompletionMessage | FlexibleChatCompletionMessage)[];
33
33
  } | undefined;
34
34
  }) => Promise<EvaluatorResult>;
@@ -1,5 +1,5 @@
1
1
  import { BaseMessage } from "@langchain/core/messages";
2
- import { ChatCompletionMessage, ToolArgsMatchMode, ToolArgsMatchOverrides } from "../types.js";
2
+ import { ChatCompletionMessage, FlexibleChatCompletionMessage, ToolArgsMatchMode, ToolArgsMatchOverrides } from "../types.js";
3
3
  export type TrajectoryMatchMode = "strict" | "unordered" | "subset" | "superset";
4
4
  /**
5
5
  * Creates an evaluator that compares trajectories between model outputs and reference outputs.
@@ -52,10 +52,10 @@ export declare function createTrajectoryMatchEvaluator({ trajectoryMatchMode, to
52
52
  toolArgsMatchOverrides?: ToolArgsMatchOverrides;
53
53
  }): ({ outputs, referenceOutputs, ...extra }: {
54
54
  [key: string]: unknown;
55
- outputs: ChatCompletionMessage[] | BaseMessage[] | {
56
- messages: (BaseMessage | ChatCompletionMessage)[];
55
+ outputs: ChatCompletionMessage[] | FlexibleChatCompletionMessage[] | BaseMessage[] | {
56
+ messages: (BaseMessage | ChatCompletionMessage | FlexibleChatCompletionMessage)[];
57
57
  };
58
- referenceOutputs: ChatCompletionMessage[] | BaseMessage[] | {
59
- messages: (BaseMessage | ChatCompletionMessage)[];
58
+ referenceOutputs: ChatCompletionMessage[] | FlexibleChatCompletionMessage[] | BaseMessage[] | {
59
+ messages: (BaseMessage | ChatCompletionMessage | FlexibleChatCompletionMessage)[];
60
60
  };
61
- }) => Promise<import("langsmith/vitest").SimpleEvaluationResult>;
61
+ }) => Promise<import("../types.js").EvaluatorResult>;
@@ -5,8 +5,8 @@ const utils_js_1 = require("../utils.cjs");
5
5
  const utils_js_2 = require("./utils.cjs");
6
6
  async function _scorer(params) {
7
7
  const { outputs, referenceOutputs, toolArgsMatchMode, toolArgsMatchOverrides, } = params;
8
- const normalizedOutputs = (0, utils_js_1._normalizeToOpenAIMessagesList)(outputs);
9
- const normalizedReferenceOutputs = (0, utils_js_1._normalizeToOpenAIMessagesList)(referenceOutputs);
8
+ const normalizedOutputs = outputs;
9
+ const normalizedReferenceOutputs = referenceOutputs;
10
10
  if (!normalizedOutputs || !normalizedReferenceOutputs) {
11
11
  throw new Error("Strict trajectory match requires both outputs and reference_outputs");
12
12
  }
@@ -66,8 +66,11 @@ exports._scorer = _scorer;
66
66
  * @returns EvaluatorResult containing a score of true if trajectory (including called tools) matches, false otherwise
67
67
  */
68
68
  async function trajectoryStrictMatch(params) {
69
+ const normalizedOutputs = (0, utils_js_1._normalizeToOpenAIMessagesList)(params.outputs);
70
+ const normalizedReferenceOutputs = (0, utils_js_1._normalizeToOpenAIMessagesList)(params.referenceOutputs);
69
71
  return (0, utils_js_1._runEvaluator)("trajectory_strict_match", _scorer, "trajectory_strict_match", {
70
- ...params,
72
+ outputs: normalizedOutputs,
73
+ referenceOutputs: normalizedReferenceOutputs,
71
74
  toolArgsMatchMode: params.toolCallArgsExactMatch ? "exact" : "ignore",
72
75
  });
73
76
  }
@@ -1,12 +1,8 @@
1
1
  import { BaseMessage } from "@langchain/core/messages";
2
- import { ChatCompletionMessage, EvaluatorResult, ToolArgsMatchMode, ToolArgsMatchOverrides } from "../types.js";
2
+ import { ChatCompletionMessage, FlexibleChatCompletionMessage, EvaluatorResult, ToolArgsMatchMode, ToolArgsMatchOverrides } from "../types.js";
3
3
  export declare function _scorer(params: {
4
- outputs: ChatCompletionMessage[] | BaseMessage[] | {
5
- messages: (BaseMessage | ChatCompletionMessage)[];
6
- };
7
- referenceOutputs: ChatCompletionMessage[] | BaseMessage[] | {
8
- messages: (BaseMessage | ChatCompletionMessage)[];
9
- };
4
+ outputs: ChatCompletionMessage[];
5
+ referenceOutputs: ChatCompletionMessage[];
10
6
  toolArgsMatchMode: ToolArgsMatchMode;
11
7
  toolArgsMatchOverrides?: ToolArgsMatchOverrides;
12
8
  }): Promise<boolean>;
@@ -23,11 +19,11 @@ export declare function _scorer(params: {
23
19
  * @returns EvaluatorResult containing a score of true if trajectory (including called tools) matches, false otherwise
24
20
  */
25
21
  export declare function trajectoryStrictMatch(params: {
26
- outputs: ChatCompletionMessage[] | BaseMessage[] | {
27
- messages: (BaseMessage | ChatCompletionMessage)[];
22
+ outputs: ChatCompletionMessage[] | FlexibleChatCompletionMessage[] | BaseMessage[] | {
23
+ messages: (BaseMessage | ChatCompletionMessage | FlexibleChatCompletionMessage)[];
28
24
  };
29
- referenceOutputs: ChatCompletionMessage[] | BaseMessage[] | {
30
- messages: (BaseMessage | ChatCompletionMessage)[];
25
+ referenceOutputs: ChatCompletionMessage[] | FlexibleChatCompletionMessage[] | BaseMessage[] | {
26
+ messages: (BaseMessage | ChatCompletionMessage | FlexibleChatCompletionMessage)[];
31
27
  };
32
28
  toolCallArgsExactMatch: boolean;
33
29
  }): Promise<EvaluatorResult>;
@@ -2,8 +2,8 @@ import { _normalizeToOpenAIMessagesList, _runEvaluator } from "../utils.js";
2
2
  import { _getMatcherForToolName } from "./utils.js";
3
3
  export async function _scorer(params) {
4
4
  const { outputs, referenceOutputs, toolArgsMatchMode, toolArgsMatchOverrides, } = params;
5
- const normalizedOutputs = _normalizeToOpenAIMessagesList(outputs);
6
- const normalizedReferenceOutputs = _normalizeToOpenAIMessagesList(referenceOutputs);
5
+ const normalizedOutputs = outputs;
6
+ const normalizedReferenceOutputs = referenceOutputs;
7
7
  if (!normalizedOutputs || !normalizedReferenceOutputs) {
8
8
  throw new Error("Strict trajectory match requires both outputs and reference_outputs");
9
9
  }
@@ -62,8 +62,11 @@ export async function _scorer(params) {
62
62
  * @returns EvaluatorResult containing a score of true if trajectory (including called tools) matches, false otherwise
63
63
  */
64
64
  export async function trajectoryStrictMatch(params) {
65
+ const normalizedOutputs = _normalizeToOpenAIMessagesList(params.outputs);
66
+ const normalizedReferenceOutputs = _normalizeToOpenAIMessagesList(params.referenceOutputs);
65
67
  return _runEvaluator("trajectory_strict_match", _scorer, "trajectory_strict_match", {
66
- ...params,
68
+ outputs: normalizedOutputs,
69
+ referenceOutputs: normalizedReferenceOutputs,
67
70
  toolArgsMatchMode: params.toolCallArgsExactMatch ? "exact" : "ignore",
68
71
  });
69
72
  }
@@ -1,5 +1,5 @@
1
1
  import { BaseMessage } from "@langchain/core/messages";
2
- import { ChatCompletionMessage, EvaluatorResult, ToolArgsMatchMode, ToolArgsMatchOverrides } from "../types.js";
2
+ import { ChatCompletionMessage, FlexibleChatCompletionMessage, EvaluatorResult, ToolArgsMatchMode, ToolArgsMatchOverrides } from "../types.js";
3
3
  export declare const _scorer: (params: {
4
4
  outputs: ChatCompletionMessage[];
5
5
  referenceOutputs: ChatCompletionMessage[];
@@ -21,10 +21,10 @@ export declare const _scorer: (params: {
21
21
  * @returns EvaluatorResult containing a score of true if trajectory (including called tools) matches, false otherwise
22
22
  */
23
23
  export declare function trajectorySubset(params: {
24
- outputs: ChatCompletionMessage[] | BaseMessage[] | {
25
- messages: (BaseMessage | ChatCompletionMessage)[];
24
+ outputs: FlexibleChatCompletionMessage[] | BaseMessage[] | {
25
+ messages: (BaseMessage | ChatCompletionMessage | FlexibleChatCompletionMessage)[];
26
26
  };
27
- referenceOutputs: ChatCompletionMessage[] | BaseMessage[] | {
28
- messages: (BaseMessage | ChatCompletionMessage)[];
27
+ referenceOutputs: FlexibleChatCompletionMessage[] | BaseMessage[] | {
28
+ messages: (BaseMessage | ChatCompletionMessage | FlexibleChatCompletionMessage)[];
29
29
  };
30
30
  }): Promise<EvaluatorResult>;
@@ -1,5 +1,5 @@
1
1
  import { BaseMessage } from "@langchain/core/messages";
2
- import { ChatCompletionMessage, EvaluatorResult, ToolArgsMatchMode, ToolArgsMatchOverrides } from "../types.js";
2
+ import { ChatCompletionMessage, FlexibleChatCompletionMessage, EvaluatorResult, ToolArgsMatchMode, ToolArgsMatchOverrides } from "../types.js";
3
3
  export declare const _scorer: (params: {
4
4
  outputs: ChatCompletionMessage[];
5
5
  referenceOutputs: ChatCompletionMessage[];
@@ -21,10 +21,10 @@ export declare const _scorer: (params: {
21
21
  * @returns EvaluatorResult containing a score of true if trajectory (including called tools) matches, false otherwise
22
22
  */
23
23
  export declare function trajectorySuperset(params: {
24
- outputs: ChatCompletionMessage[] | BaseMessage[] | {
25
- messages: (BaseMessage | ChatCompletionMessage)[];
24
+ outputs: FlexibleChatCompletionMessage[] | BaseMessage[] | {
25
+ messages: (BaseMessage | ChatCompletionMessage | FlexibleChatCompletionMessage)[];
26
26
  };
27
- referenceOutputs: ChatCompletionMessage[] | BaseMessage[] | {
28
- messages: (BaseMessage | ChatCompletionMessage)[];
27
+ referenceOutputs: FlexibleChatCompletionMessage[] | BaseMessage[] | {
28
+ messages: (BaseMessage | ChatCompletionMessage | FlexibleChatCompletionMessage)[];
29
29
  };
30
30
  }): Promise<EvaluatorResult>;
@@ -1,5 +1,5 @@
1
1
  import { BaseMessage } from "@langchain/core/messages";
2
- import { ChatCompletionMessage, EvaluatorResult, ToolArgsMatchMode, ToolArgsMatchOverrides } from "../types.js";
2
+ import { ChatCompletionMessage, FlexibleChatCompletionMessage, EvaluatorResult, ToolArgsMatchMode, ToolArgsMatchOverrides } from "../types.js";
3
3
  export declare const _scorer: (params: {
4
4
  outputs: ChatCompletionMessage[];
5
5
  referenceOutputs: ChatCompletionMessage[];
@@ -21,10 +21,10 @@ export declare const _scorer: (params: {
21
21
  * @returns EvaluatorResult containing a score of true if trajectory (including called tools) matches, false otherwise
22
22
  */
23
23
  export declare function trajectoryUnorderedMatch(params: {
24
- outputs: ChatCompletionMessage[] | BaseMessage[] | {
25
- messages: (BaseMessage | ChatCompletionMessage)[];
24
+ outputs: FlexibleChatCompletionMessage[] | BaseMessage[] | {
25
+ messages: (BaseMessage | ChatCompletionMessage | FlexibleChatCompletionMessage)[];
26
26
  };
27
- referenceOutputs: ChatCompletionMessage[] | BaseMessage[] | {
28
- messages: (BaseMessage | ChatCompletionMessage)[];
27
+ referenceOutputs: FlexibleChatCompletionMessage[] | BaseMessage[] | {
28
+ messages: (BaseMessage | ChatCompletionMessage | FlexibleChatCompletionMessage)[];
29
29
  };
30
30
  }): Promise<EvaluatorResult>;
package/dist/types.d.ts CHANGED
@@ -1,5 +1,20 @@
1
1
  import { createLLMAsJudge } from "openevals/llm";
2
2
  export * from "openevals/types";
3
+ export type FlexibleChatCompletionMessage = Record<string, any> & ({
4
+ content: any;
5
+ role: "user" | "system" | "developer";
6
+ id?: string;
7
+ } | {
8
+ role: "assistant";
9
+ content: any;
10
+ tool_calls?: any[];
11
+ id?: string;
12
+ } | {
13
+ role: "tool";
14
+ content: any;
15
+ tool_call_id?: string;
16
+ id?: string;
17
+ });
3
18
  export type GraphTrajectory = {
4
19
  inputs?: (Record<string, unknown> | null)[];
5
20
  results: Record<string, unknown>[];
@@ -9,8 +24,8 @@ export type ExtractedLangGraphThreadTrajectory = {
9
24
  inputs: (Record<string, unknown> | null)[][];
10
25
  outputs: GraphTrajectory;
11
26
  };
12
- export type TrajectoryLLMAsJudgeParams = Omit<Parameters<typeof createLLMAsJudge>[0], "prompt"> & {
13
- prompt?: string;
27
+ export type TrajectoryLLMAsJudgeParams = Partial<Omit<Parameters<typeof createLLMAsJudge>[0], "prompt">> & {
28
+ prompt?: Parameters<typeof createLLMAsJudge>[0]["prompt"];
14
29
  };
15
30
  export type ToolArgsMatchMode = "exact" | "ignore" | "subset" | "superset";
16
31
  export type ToolArgsMatcher = (toolCall: Record<string, unknown>, referenceToolCall: Record<string, unknown>) => boolean | Promise<boolean>;
package/dist/utils.cjs CHANGED
@@ -1,6 +1,6 @@
1
1
  "use strict";
2
2
  Object.defineProperty(exports, "__esModule", { value: true });
3
- exports._runEvaluator = exports.processScore = exports._normalizeToOpenAIMessagesList = exports._convertToOpenAIMessage = void 0;
3
+ exports._runEvaluator = exports.processScore = exports._normalizeToOpenAIMessagesList = exports._convertToChatCompletionMessage = exports._convertToOpenAIMessage = void 0;
4
4
  const messages_1 = require("@langchain/core/messages");
5
5
  const openai_1 = require("@langchain/openai");
6
6
  const utils_1 = require("openevals/utils");
@@ -14,6 +14,25 @@ const _convertToOpenAIMessage = (message) => {
14
14
  }
15
15
  };
16
16
  exports._convertToOpenAIMessage = _convertToOpenAIMessage;
17
+ const _convertToChatCompletionMessage = (message) => {
18
+ let converted;
19
+ if ((0, messages_1.isBaseMessage)(message)) {
20
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
21
+ converted = (0, openai_1._convertMessagesToOpenAIParams)([message])[0];
22
+ }
23
+ else {
24
+ converted = message;
25
+ }
26
+ // For tool messages without tool_call_id, generate one for compatibility
27
+ if (converted.role === "tool" && !converted.tool_call_id) {
28
+ converted = {
29
+ ...converted,
30
+ tool_call_id: `generated-${Math.random().toString(36).substring(2)}`,
31
+ };
32
+ }
33
+ return converted;
34
+ };
35
+ exports._convertToChatCompletionMessage = _convertToChatCompletionMessage;
17
36
  const _normalizeToOpenAIMessagesList = (messages) => {
18
37
  if (!messages) {
19
38
  return [];
@@ -30,7 +49,7 @@ const _normalizeToOpenAIMessagesList = (messages) => {
30
49
  else {
31
50
  messagesList = messages;
32
51
  }
33
- return messagesList.map(exports._convertToOpenAIMessage);
52
+ return messagesList.map(exports._convertToChatCompletionMessage);
34
53
  };
35
54
  exports._normalizeToOpenAIMessagesList = _normalizeToOpenAIMessagesList;
36
55
  const processScore = (_, value) => {
package/dist/utils.d.ts CHANGED
@@ -1,9 +1,10 @@
1
1
  import { BaseMessage } from "@langchain/core/messages";
2
2
  import { EvaluationResultType } from "openevals/utils";
3
- import { ChatCompletionMessage, MultiResultScorerReturnType, SingleResultScorerReturnType } from "./types.js";
3
+ import { ChatCompletionMessage, FlexibleChatCompletionMessage, MultiResultScorerReturnType, SingleResultScorerReturnType } from "./types.js";
4
4
  export declare const _convertToOpenAIMessage: (message: BaseMessage | ChatCompletionMessage) => ChatCompletionMessage;
5
- export declare const _normalizeToOpenAIMessagesList: (messages?: (BaseMessage | ChatCompletionMessage)[] | {
6
- messages: (BaseMessage | ChatCompletionMessage)[];
5
+ export declare const _convertToChatCompletionMessage: (message: BaseMessage | ChatCompletionMessage | FlexibleChatCompletionMessage) => ChatCompletionMessage;
6
+ export declare const _normalizeToOpenAIMessagesList: (messages?: (FlexibleChatCompletionMessage | ChatCompletionMessage | BaseMessage)[] | {
7
+ messages: (BaseMessage | ChatCompletionMessage | FlexibleChatCompletionMessage)[];
7
8
  } | undefined) => ChatCompletionMessage[];
8
9
  export declare const processScore: (_: string, value: boolean | number | {
9
10
  score: boolean | number;
package/dist/utils.js CHANGED
@@ -10,6 +10,24 @@ export const _convertToOpenAIMessage = (message) => {
10
10
  return message;
11
11
  }
12
12
  };
13
+ export const _convertToChatCompletionMessage = (message) => {
14
+ let converted;
15
+ if (isBaseMessage(message)) {
16
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
17
+ converted = _convertMessagesToOpenAIParams([message])[0];
18
+ }
19
+ else {
20
+ converted = message;
21
+ }
22
+ // For tool messages without tool_call_id, generate one for compatibility
23
+ if (converted.role === "tool" && !converted.tool_call_id) {
24
+ converted = {
25
+ ...converted,
26
+ tool_call_id: `generated-${Math.random().toString(36).substring(2)}`,
27
+ };
28
+ }
29
+ return converted;
30
+ };
13
31
  export const _normalizeToOpenAIMessagesList = (messages) => {
14
32
  if (!messages) {
15
33
  return [];
@@ -26,7 +44,7 @@ export const _normalizeToOpenAIMessagesList = (messages) => {
26
44
  else {
27
45
  messagesList = messages;
28
46
  }
29
- return messagesList.map(_convertToOpenAIMessage);
47
+ return messagesList.map(_convertToChatCompletionMessage);
30
48
  };
31
49
  export const processScore = (_, value) => {
32
50
  if (typeof value === "object") {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "agentevals",
3
- "version": "0.0.5",
3
+ "version": "0.0.6",
4
4
  "packageManager": "yarn@3.5.1",
5
5
  "type": "module",
6
6
  "scripts": {
@@ -14,18 +14,18 @@
14
14
  "test": "vitest run"
15
15
  },
16
16
  "dependencies": {
17
- "@langchain/openai": "^0.4.4",
18
- "langchain": "^0.3.18",
19
- "langsmith": "^0.3.11",
20
- "openevals": "^0.0.3"
17
+ "@langchain/openai": ">=0.4.4",
18
+ "langchain": ">=0.3.18",
19
+ "langsmith": ">=0.3.11",
20
+ "openevals": "^0.1.0"
21
21
  },
22
22
  "peerDependencies": {
23
- "@langchain/core": "^0.3.40",
24
- "@langchain/langgraph": "^0.2.46"
23
+ "@langchain/core": ">=0.3.73",
24
+ "@langchain/langgraph": ">=0.2.46"
25
25
  },
26
26
  "devDependencies": {
27
- "@langchain/core": "^0.3.40",
28
- "@langchain/langgraph": "^0.2.46",
27
+ "@langchain/core": "^0.3.73",
28
+ "@langchain/langgraph": "^0.4.9",
29
29
  "@langchain/scripts": "0.1.3",
30
30
  "@tsconfig/recommended": "^1.0.8",
31
31
  "@typescript-eslint/eslint-plugin": "^8.24.1",
@@ -43,7 +43,7 @@
43
43
  "prettier": "^3.5.1",
44
44
  "typescript": "~5.1.6",
45
45
  "vitest": "^3.0.5",
46
- "zod": "^3.24.2"
46
+ "zod": "^4.1.5"
47
47
  },
48
48
  "files": [
49
49
  "dist/",