agentevals 0.0.6 → 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/graph_trajectory/utils.cjs +4 -4
- package/dist/graph_trajectory/utils.js +4 -4
- package/dist/trajectory/llm.d.ts +12 -10
- package/dist/utils.cjs +43 -5
- package/dist/utils.d.ts +6 -4
- package/dist/utils.js +20 -5
- package/package.json +23 -8
package/dist/graph_trajectory/utils.cjs
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
3
|
exports.extractLangGraphTrajectoryFromThread = exports._getLangGraphStateHistoryRecursive = exports.extractLangGraphTrajectoryFromSnapshots = void 0;
|
|
4
4
|
const messages_1 = require("@langchain/core/messages");
|
|
5
|
-
const
|
|
5
|
+
const utils_js_1 = require("../utils.cjs");
|
|
6
6
|
const extractLangGraphTrajectoryFromSnapshots = (snapshots) => {
|
|
7
7
|
const inputs = [];
|
|
8
8
|
const trajectory = {
|
|
@@ -29,7 +29,7 @@ const extractLangGraphTrajectoryFromSnapshots = (snapshots) => {
|
|
|
29
29
|
if ((0, messages_1.isBaseMessage)(lastMessage)) {
|
|
30
30
|
// Just append the last message in the output to the results to reduce context size
|
|
31
31
|
trajectory.results.push({
|
|
32
|
-
messages: (0,
|
|
32
|
+
messages: [(0, utils_js_1._convertToOpenAIMessage)(lastMessage)],
|
|
33
33
|
});
|
|
34
34
|
}
|
|
35
35
|
else {
|
|
@@ -49,9 +49,9 @@ const extractLangGraphTrajectoryFromSnapshots = (snapshots) => {
|
|
|
49
49
|
}
|
|
50
50
|
for (const task of snapshot.tasks) {
|
|
51
51
|
if (task.interrupts?.length) {
|
|
52
|
-
trajectory.steps.
|
|
52
|
+
trajectory.steps[trajectory.steps.length - 1]?.push("__interrupt__");
|
|
53
53
|
}
|
|
54
|
-
trajectory.steps.
|
|
54
|
+
trajectory.steps[trajectory.steps.length - 1]?.push(`${subgraphPath}${task.name}`);
|
|
55
55
|
}
|
|
56
56
|
}
|
|
57
57
|
if (isAccumulatingSteps) {
|
|
package/dist/graph_trajectory/utils.js
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { isBaseMessage } from "@langchain/core/messages";
|
|
2
|
-
import {
|
|
2
|
+
import { _convertToOpenAIMessage } from "../utils.js";
|
|
3
3
|
export const extractLangGraphTrajectoryFromSnapshots = (snapshots) => {
|
|
4
4
|
const inputs = [];
|
|
5
5
|
const trajectory = {
|
|
@@ -26,7 +26,7 @@ export const extractLangGraphTrajectoryFromSnapshots = (snapshots) => {
|
|
|
26
26
|
if (isBaseMessage(lastMessage)) {
|
|
27
27
|
// Just append the last message in the output to the results to reduce context size
|
|
28
28
|
trajectory.results.push({
|
|
29
|
-
messages:
|
|
29
|
+
messages: [_convertToOpenAIMessage(lastMessage)],
|
|
30
30
|
});
|
|
31
31
|
}
|
|
32
32
|
else {
|
|
@@ -46,9 +46,9 @@ export const extractLangGraphTrajectoryFromSnapshots = (snapshots) => {
|
|
|
46
46
|
}
|
|
47
47
|
for (const task of snapshot.tasks) {
|
|
48
48
|
if (task.interrupts?.length) {
|
|
49
|
-
trajectory.steps.
|
|
49
|
+
trajectory.steps[trajectory.steps.length - 1]?.push("__interrupt__");
|
|
50
50
|
}
|
|
51
|
-
trajectory.steps.
|
|
51
|
+
trajectory.steps[trajectory.steps.length - 1]?.push(`${subgraphPath}${task.name}`);
|
|
52
52
|
}
|
|
53
53
|
}
|
|
54
54
|
if (isAccumulatingSteps) {
|
package/dist/trajectory/llm.d.ts
CHANGED
|
@@ -1,5 +1,14 @@
|
|
|
1
|
-
import { BaseMessage } from "@langchain/core/messages";
|
|
1
|
+
import type { BaseMessage } from "@langchain/core/messages";
|
|
2
2
|
import { ChatCompletionMessage, FlexibleChatCompletionMessage, EvaluatorResult, TrajectoryLLMAsJudgeParams } from "../types.js";
|
|
3
|
+
type TrajectoryEvaluatorFunction = (params: {
|
|
4
|
+
outputs: ChatCompletionMessage[] | FlexibleChatCompletionMessage[] | BaseMessage[] | {
|
|
5
|
+
messages: (BaseMessage | ChatCompletionMessage | FlexibleChatCompletionMessage)[];
|
|
6
|
+
};
|
|
7
|
+
referenceOutputs?: ChatCompletionMessage[] | FlexibleChatCompletionMessage[] | BaseMessage[] | {
|
|
8
|
+
messages: (BaseMessage | ChatCompletionMessage | FlexibleChatCompletionMessage)[];
|
|
9
|
+
};
|
|
10
|
+
[key: string]: unknown;
|
|
11
|
+
}) => Promise<EvaluatorResult>;
|
|
3
12
|
export declare const TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE = "You are an expert data labeler.\nYour task is to grade the accuracy of an AI agent's internal trajectory.\n\n<Rubric>\n An accurate trajectory:\n - Makes logical sense between steps\n - Shows clear progression\n - Is relatively efficient, though it does not need to be perfectly efficient\n - Is semantically equivalent to the provided reference trajectory\n</Rubric>\n\nBased on the following reference trajectory:\n\n<reference_trajectory>\n{reference_outputs}\n</reference_trajectory>\n\nGrade this actual trajectory:\n\n<trajectory>\n{outputs}\n</trajectory>\n";
|
|
4
13
|
export declare const TRAJECTORY_ACCURACY_PROMPT = "You are an expert data labeler.\nYour task is to grade the accuracy of an AI agent's internal trajectory.\n\n<Rubric>\n An accurate trajectory:\n - Makes logical sense between steps\n - Shows clear progression\n - Is relatively efficient, though it does not need to be perfectly efficient\n</Rubric>\n\nFirst, try to understand the goal of the trajectory by looking at the input\n(if the input is not present try to infer it from the content of the first message),\nas well as the output of the final message. Once you understand the goal, grade the trajectory\nas it relates to achieving that goal.\n\nGrade the following trajectory:\n\n<trajectory>\n{outputs}\n</trajectory>";
|
|
5
14
|
/**
|
|
@@ -23,12 +32,5 @@ export declare const TRAJECTORY_ACCURACY_PROMPT = "You are an expert data labele
|
|
|
23
32
|
* @param options.fewShotExamples - Optional list of example evaluations to append to the prompt.
|
|
24
33
|
* @returns A function that evaluates agent trajectories using the configured LLM judge.
|
|
25
34
|
*/
|
|
26
|
-
export declare const createTrajectoryLLMAsJudge: ({ prompt, feedbackKey, model, system, judge, continuous, choices, useReasoning, fewShotExamples, }: TrajectoryLLMAsJudgeParams) =>
|
|
27
|
-
|
|
28
|
-
outputs: ChatCompletionMessage[] | FlexibleChatCompletionMessage[] | BaseMessage[] | {
|
|
29
|
-
messages: (BaseMessage | ChatCompletionMessage | FlexibleChatCompletionMessage)[];
|
|
30
|
-
};
|
|
31
|
-
referenceOutputs?: ChatCompletionMessage[] | BaseMessage[] | FlexibleChatCompletionMessage[] | {
|
|
32
|
-
messages: (BaseMessage | ChatCompletionMessage | FlexibleChatCompletionMessage)[];
|
|
33
|
-
} | undefined;
|
|
34
|
-
}) => Promise<EvaluatorResult>;
|
|
35
|
+
export declare const createTrajectoryLLMAsJudge: ({ prompt, feedbackKey, model, system, judge, continuous, choices, useReasoning, fewShotExamples, }: TrajectoryLLMAsJudgeParams) => TrajectoryEvaluatorFunction;
|
|
36
|
+
export {};
|
package/dist/utils.cjs
CHANGED
|
@@ -1,13 +1,52 @@
|
|
|
1
1
|
"use strict";
|
|
2
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
+
}) : function(o, v) {
|
|
16
|
+
o["default"] = v;
|
|
17
|
+
});
|
|
18
|
+
var __importStar = (this && this.__importStar) || function (mod) {
|
|
19
|
+
if (mod && mod.__esModule) return mod;
|
|
20
|
+
var result = {};
|
|
21
|
+
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
|
|
22
|
+
__setModuleDefault(result, mod);
|
|
23
|
+
return result;
|
|
24
|
+
};
|
|
2
25
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
26
|
exports._runEvaluator = exports.processScore = exports._normalizeToOpenAIMessagesList = exports._convertToChatCompletionMessage = exports._convertToOpenAIMessage = void 0;
|
|
4
27
|
const messages_1 = require("@langchain/core/messages");
|
|
5
|
-
const openai_1 = require("@langchain/openai");
|
|
28
|
+
const openAIImports = __importStar(require("@langchain/openai"));
|
|
6
29
|
const utils_1 = require("openevals/utils");
|
|
30
|
+
const {
|
|
31
|
+
// @ts-expect-error Shim for older versions of @langchain/openai
|
|
32
|
+
_convertMessagesToOpenAIParams, convertMessagesToCompletionsMessageParams, } = openAIImports;
|
|
33
|
+
function _convertMessagesShim(message) {
|
|
34
|
+
if (typeof _convertMessagesToOpenAIParams === "function") {
|
|
35
|
+
return _convertMessagesToOpenAIParams([
|
|
36
|
+
message,
|
|
37
|
+
])[0];
|
|
38
|
+
}
|
|
39
|
+
return convertMessagesToCompletionsMessageParams({
|
|
40
|
+
messages: [message],
|
|
41
|
+
})[0];
|
|
42
|
+
}
|
|
7
43
|
const _convertToOpenAIMessage = (message) => {
|
|
8
44
|
if ((0, messages_1.isBaseMessage)(message)) {
|
|
9
|
-
|
|
10
|
-
|
|
45
|
+
const converted = _convertMessagesShim(message);
|
|
46
|
+
if (message.id && !converted.id) {
|
|
47
|
+
converted.id = message.id;
|
|
48
|
+
}
|
|
49
|
+
return converted;
|
|
11
50
|
}
|
|
12
51
|
else {
|
|
13
52
|
return message;
|
|
@@ -17,8 +56,7 @@ exports._convertToOpenAIMessage = _convertToOpenAIMessage;
|
|
|
17
56
|
const _convertToChatCompletionMessage = (message) => {
|
|
18
57
|
let converted;
|
|
19
58
|
if ((0, messages_1.isBaseMessage)(message)) {
|
|
20
|
-
|
|
21
|
-
converted = (0, openai_1._convertMessagesToOpenAIParams)([message])[0];
|
|
59
|
+
converted = _convertMessagesShim(message);
|
|
22
60
|
}
|
|
23
61
|
else {
|
|
24
62
|
converted = message;
|
package/dist/utils.d.ts
CHANGED
|
@@ -1,13 +1,15 @@
|
|
|
1
|
-
import { BaseMessage } from "@langchain/core/messages";
|
|
1
|
+
import type { BaseMessage } from "@langchain/core/messages";
|
|
2
2
|
import { EvaluationResultType } from "openevals/utils";
|
|
3
3
|
import { ChatCompletionMessage, FlexibleChatCompletionMessage, MultiResultScorerReturnType, SingleResultScorerReturnType } from "./types.js";
|
|
4
|
+
type NormalizeToOpenAIMessagesListFunction = (messages?: (BaseMessage | ChatCompletionMessage | FlexibleChatCompletionMessage)[] | {
|
|
5
|
+
messages: (BaseMessage | ChatCompletionMessage | FlexibleChatCompletionMessage)[];
|
|
6
|
+
}) => ChatCompletionMessage[];
|
|
4
7
|
export declare const _convertToOpenAIMessage: (message: BaseMessage | ChatCompletionMessage) => ChatCompletionMessage;
|
|
5
8
|
export declare const _convertToChatCompletionMessage: (message: BaseMessage | ChatCompletionMessage | FlexibleChatCompletionMessage) => ChatCompletionMessage;
|
|
6
|
-
export declare const _normalizeToOpenAIMessagesList:
|
|
7
|
-
messages: (BaseMessage | ChatCompletionMessage | FlexibleChatCompletionMessage)[];
|
|
8
|
-
} | undefined) => ChatCompletionMessage[];
|
|
9
|
+
export declare const _normalizeToOpenAIMessagesList: NormalizeToOpenAIMessagesListFunction;
|
|
9
10
|
export declare const processScore: (_: string, value: boolean | number | {
|
|
10
11
|
score: boolean | number;
|
|
11
12
|
reasoning?: string;
|
|
12
13
|
}) => readonly [number | boolean, string | undefined] | readonly [number | boolean];
|
|
13
14
|
export declare const _runEvaluator: <T extends Record<string, unknown>, O extends SingleResultScorerReturnType | MultiResultScorerReturnType | Promise<SingleResultScorerReturnType | MultiResultScorerReturnType>>(runName: string, scorer: (params: T) => O, feedbackKey: string, extra?: T | undefined) => Promise<EvaluationResultType<O>>;
|
|
15
|
+
export {};
|
package/dist/utils.js
CHANGED
|
@@ -1,10 +1,26 @@
|
|
|
1
1
|
import { isBaseMessage } from "@langchain/core/messages";
|
|
2
|
-
import { _convertMessagesToOpenAIParams } from "@langchain/openai";
|
|
2
|
+
import * as openAIImports from "@langchain/openai";
|
|
3
3
|
import { _runEvaluator as baseRunEvaluator, } from "openevals/utils";
|
|
4
|
+
const {
|
|
5
|
+
// @ts-expect-error Shim for older versions of @langchain/openai
|
|
6
|
+
_convertMessagesToOpenAIParams, convertMessagesToCompletionsMessageParams, } = openAIImports;
|
|
7
|
+
function _convertMessagesShim(message) {
|
|
8
|
+
if (typeof _convertMessagesToOpenAIParams === "function") {
|
|
9
|
+
return _convertMessagesToOpenAIParams([
|
|
10
|
+
message,
|
|
11
|
+
])[0];
|
|
12
|
+
}
|
|
13
|
+
return convertMessagesToCompletionsMessageParams({
|
|
14
|
+
messages: [message],
|
|
15
|
+
})[0];
|
|
16
|
+
}
|
|
4
17
|
export const _convertToOpenAIMessage = (message) => {
|
|
5
18
|
if (isBaseMessage(message)) {
|
|
6
|
-
|
|
7
|
-
|
|
19
|
+
const converted = _convertMessagesShim(message);
|
|
20
|
+
if (message.id && !converted.id) {
|
|
21
|
+
converted.id = message.id;
|
|
22
|
+
}
|
|
23
|
+
return converted;
|
|
8
24
|
}
|
|
9
25
|
else {
|
|
10
26
|
return message;
|
|
@@ -13,8 +29,7 @@ export const _convertToOpenAIMessage = (message) => {
|
|
|
13
29
|
export const _convertToChatCompletionMessage = (message) => {
|
|
14
30
|
let converted;
|
|
15
31
|
if (isBaseMessage(message)) {
|
|
16
|
-
|
|
17
|
-
converted = _convertMessagesToOpenAIParams([message])[0];
|
|
32
|
+
converted = _convertMessagesShim(message);
|
|
18
33
|
}
|
|
19
34
|
else {
|
|
20
35
|
converted = message;
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "agentevals",
|
|
3
|
-
"version": "0.0.6",
|
|
3
|
+
"version": "0.0.7",
|
|
4
4
|
"packageManager": "yarn@3.5.1",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"scripts": {
|
|
@@ -15,17 +15,18 @@
|
|
|
15
15
|
},
|
|
16
16
|
"dependencies": {
|
|
17
17
|
"@langchain/openai": ">=0.4.4",
|
|
18
|
-
"langchain": ">=
|
|
19
|
-
"langsmith": ">=0.
|
|
20
|
-
"openevals": "^0.1.
|
|
18
|
+
"langchain": ">=1.2.28",
|
|
19
|
+
"langsmith": ">=0.4.6",
|
|
20
|
+
"openevals": "^0.1.4"
|
|
21
21
|
},
|
|
22
22
|
"peerDependencies": {
|
|
23
|
-
"@langchain/core": ">=0.3.
|
|
23
|
+
"@langchain/core": ">=0.3.80",
|
|
24
24
|
"@langchain/langgraph": ">=0.2.46"
|
|
25
25
|
},
|
|
26
26
|
"devDependencies": {
|
|
27
|
-
"@langchain/core": "^
|
|
28
|
-
"@langchain/langgraph": "^
|
|
27
|
+
"@langchain/core": "^1.1.29",
|
|
28
|
+
"@langchain/langgraph": "^1.2.0",
|
|
29
|
+
"@langchain/openai": "^1.2.11",
|
|
29
30
|
"@langchain/scripts": "0.1.3",
|
|
30
31
|
"@tsconfig/recommended": "^1.0.8",
|
|
31
32
|
"@typescript-eslint/eslint-plugin": "^8.24.1",
|
|
@@ -39,12 +40,26 @@
|
|
|
39
40
|
"eslint-plugin-jest": "^27.6.0",
|
|
40
41
|
"eslint-plugin-no-instanceof": "^1.0.1",
|
|
41
42
|
"eslint-plugin-prettier": "^4.2.1",
|
|
42
|
-
"openai": "^
|
|
43
|
+
"openai": "^6.25.0",
|
|
43
44
|
"prettier": "^3.5.1",
|
|
44
45
|
"typescript": "~5.1.6",
|
|
45
46
|
"vitest": "^3.0.5",
|
|
46
47
|
"zod": "^4.1.5"
|
|
47
48
|
},
|
|
49
|
+
"resolutions": {
|
|
50
|
+
"form-data": "^4.0.4",
|
|
51
|
+
"tar": "^7.5.8",
|
|
52
|
+
"axios": "^1.8.2",
|
|
53
|
+
"lodash": "^4.17.23",
|
|
54
|
+
"js-yaml": "^4.1.1",
|
|
55
|
+
"vite": "^6.4.1",
|
|
56
|
+
"esbuild": "^0.25.0",
|
|
57
|
+
"@langchain/scripts/glob": "^10.5.0",
|
|
58
|
+
"cacache/glob": "^10.5.0",
|
|
59
|
+
"dpdm/glob": "^10.5.0",
|
|
60
|
+
"node-gyp/glob": "^10.5.0",
|
|
61
|
+
"rimraf@5.0.10/glob": "^10.5.0"
|
|
62
|
+
},
|
|
48
63
|
"files": [
|
|
49
64
|
"dist/",
|
|
50
65
|
"index.cjs",
|