agentevals 0.0.1 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/graph_trajectory/llm.cjs +113 -0
- package/dist/graph_trajectory/llm.d.ts +30 -0
- package/dist/graph_trajectory/llm.js +109 -0
- package/dist/graph_trajectory/strict.cjs +40 -0
- package/dist/graph_trajectory/strict.d.ts +14 -0
- package/dist/graph_trajectory/strict.js +36 -0
- package/dist/graph_trajectory/utils.cjs +105 -0
- package/dist/graph_trajectory/utils.d.ts +12 -0
- package/dist/graph_trajectory/utils.js +99 -0
- package/dist/index.cjs +35 -0
- package/dist/index.d.ts +4 -1
- package/dist/index.js +4 -1
- package/dist/trajectory/llm.cjs +106 -0
- package/dist/trajectory/llm.d.ts +5 -19
- package/dist/trajectory/llm.js +43 -28
- package/dist/trajectory/strict.cjs +74 -0
- package/dist/trajectory/strict.d.ts +2 -0
- package/dist/trajectory/strict.js +18 -2
- package/dist/trajectory/subset.cjs +32 -0
- package/dist/trajectory/superset.cjs +32 -0
- package/dist/trajectory/unordered.cjs +33 -0
- package/dist/trajectory/utils.cjs +69 -0
- package/dist/trajectory/utils.js +1 -0
- package/dist/types.cjs +17 -0
- package/dist/types.d.ts +11 -41
- package/dist/types.js +1 -1
- package/dist/utils.cjs +56 -0
- package/dist/utils.d.ts +2 -3
- package/dist/utils.js +3 -35
- package/index.cjs +1 -0
- package/index.d.ts +1 -0
- package/index.js +1 -0
- package/package.json +2 -2
- package/dist/trajectory/tests/trajectory.test.d.ts +0 -1
- package/dist/trajectory/tests/trajectory.test.js +0 -724
- package/dist/trajectory/tests/trajectory_llm.test.d.ts +0 -1
- package/dist/trajectory/tests/trajectory_llm.test.js +0 -110
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
"use strict";
// CommonJS build of graph_trajectory/llm. Declares exports up front, then
// pulls the generic LLM-as-judge scorer from openevals and the local
// evaluator runner helper.
Object.defineProperty(exports, "__esModule", { value: true });
exports.createGraphTrajectoryLLMAsJudge = exports.GRAPH_TRAJECTORY_ACCURACY_PROMPT = void 0;
const llm_1 = require("openevals/llm");
const utils_js_1 = require("../utils.cjs");
// Default grading prompt. "{thread}" and "{reference_outputs}" are template
// placeholders filled in by the evaluator at call time (see
// createGraphTrajectoryLLMAsJudge below).
// NOTE(review): "resolving a user queries" is a grammar slip in the published
// prompt text; left byte-identical here since the string is runtime behavior.
exports.GRAPH_TRAJECTORY_ACCURACY_PROMPT = `You are an expert data labeler.
Your task is to grade the accuracy of an AI agent's internal steps in resolving a user queries.

<Rubric>
  An accurate trajectory:
  - Makes logical sense between steps
  - Shows clear progression
  - Is relatively efficient, though it does not need to be perfectly efficient
  - Is semantically equivalent to the provided reference trajectory, if present
</Rubric>

<Instructions>
  Grade the following thread, evaluating whether the agent's overall steps are logical and relatively efficient.
  For the trajectory, "__start__" denotes an initial entrypoint to the agent, and "__interrupt__" corresponds to the agent
  interrupting to await additional data from another source ("human-in-the-loop").
  
  Steps containing a colon represent steps within subagents (e.g. "graph:step_name").
</Instructions>

<thread>
{thread}
</thread>

{reference_outputs}
`;
|
|
31
|
+
/**
 * Renders paired agent inputs, steps, and results as an XML-ish transcript
 * for inclusion in the grading prompt.
 *
 * @param inputs - One entry per turn; `null`/`undefined` means the turn had
 *   no external input (e.g. a resumed run) and its `<input>` section is omitted.
 * @param outputs - Graph trajectory with parallel `results` and `steps` arrays.
 * @returns The concatenated `<input>/<trajectory>/<result>` sections.
 */
function _formatThread(inputs, outputs) {
    let formattedThread = "";
    const zippedData = inputs.map((input, i) => ({
        // Map absent inputs to "" so the guard below genuinely skips them.
        // (Previously this was JSON.stringify(input ?? ""), which yields the
        // truthy string '""' for null input, so the conditional was dead and
        // an empty <input>""</input> section was always emitted.)
        input: input != null ? JSON.stringify(input) : "",
        result: JSON.stringify(outputs.results[i]),
        step: JSON.stringify(outputs.steps[i]),
    }));
    for (const { input, result, step } of zippedData) {
        formattedThread += input ? `\n<input>\n${input}\n</input>\n` : "";
        formattedThread += `\n<trajectory>\n${step}\n</trajectory>\n`;
        formattedThread += `\n<result>\n${result}\n</result>\n`;
    }
    return formattedThread;
}
|
|
45
|
+
/**
 * Normalizes evaluator inputs and formats both the graded thread and the
 * optional reference thread for prompt interpolation.
 *
 * @param inputs - Either an array of per-turn inputs, or an object wrapping
 *   that array under an "inputs" key.
 * @param outputs - Graph trajectory whose `results` and `steps` must be the
 *   same length as the input list.
 * @param referenceOutputs - Optional reference trajectory; when present it is
 *   rendered inside a <reference_thread> preamble.
 * @throws If `inputs` is neither shape, or if lengths disagree.
 */
function _formatInputs(inputs, outputs, referenceOutputs) {
    let processedInputs;
    if (Array.isArray(inputs)) {
        processedInputs = inputs;
    }
    else {
        if (!("inputs" in inputs)) {
            throw new Error("inputs must be an array or an object with an 'inputs' key");
        }
        processedInputs = inputs.inputs;
    }
    // Both parallel arrays must line up with the input list, turn for turn.
    if (processedInputs.length !== outputs.results.length) {
        throw new Error("Provided `inputs` and `results` within provided `outputs` must have the same length");
    }
    if (processedInputs.length !== outputs.steps.length) {
        throw new Error("Provided `inputs` and `steps` within provided `outputs` must have the same length");
    }
    const formattedThread = _formatThread(processedInputs, outputs);
    // The reference trajectory may carry its own inputs; fall back to [] since
    // GraphTrajectory does not require them.
    const formattedReferenceOutputs = referenceOutputs
        ? `\nUse the following trajectory as an example reference when grading:\n<reference_thread>\n${_formatThread(referenceOutputs.inputs ?? [], referenceOutputs)}\n</reference_thread>\n`
        : "";
    return {
        formattedThread,
        formattedReferenceOutputs,
    };
}
|
|
71
|
+
/**
 * Creates an evaluator that uses an LLM to judge agent trajectories.
 * @param options Configuration options
 * @param [options.prompt] - The evaluation prompt. Can be a string template,
 * LangChain prompt template, or callable that returns a list of chat messages. Note that the default prompt allows a rubric
 * in addition to the typical "inputs", "outputs", and "reference_outputs" parameters.
 * @param [options.feedbackKey="graph_trajectory_accuracy"] - Key used to store the evaluation result
 * @param [options.judge] - The LLM used for evaluation. Can be an OpenAI client
 * or a LangChain chat model. If an OpenAI client, must specify "model" as well.
 * If omitted, "model" will be used to instantiate a LangChain model instance by model string.
 * @param [options.model] - Model identifier to use. If "judge" is an OpenAI client,
 * this argument should be a model name directly. If "judge" is omitted, must be a valid
 * LangChain model identifier. See `init_chat_model` docs for more details:
 * https://python.langchain.com/docs/how_to/chat_models_universal_init/
 * @param [options.continuous=false] - If true, score will be a float between 0 and 1. If false, score will be boolean.
 * @param [options.choices] - Optional list of specific float values the score must be chosen from
 * @param [options.useReasoning=true] - If true, includes explanation for the score in the output
 * @param [options.fewShotExamples] - Optional list of example evaluations to append to the prompt
 * @returns A function that evaluates agent trajectories using the configured LLM judge
 */
const createGraphTrajectoryLLMAsJudge = ({ prompt = exports.GRAPH_TRAJECTORY_ACCURACY_PROMPT, model, feedbackKey = "graph_trajectory_accuracy", judge, continuous = false, choices, useReasoning = true, fewShotExamples, }) => {
    // Build the generic scorer once; the wrapper below re-uses it per call.
    const scorer = (0, llm_1._createLLMAsJudgeScorer)({
        prompt,
        judge,
        model,
        continuous,
        choices,
        useReasoning,
        fewShotExamples,
    });
    const _wrappedEvaluator = async ({ inputs, outputs, referenceOutputs, ...extra }) => {
        const { formattedThread, formattedReferenceOutputs } = _formatInputs(inputs, outputs, referenceOutputs);
        // "thread" / "referenceOutputs" map onto the {thread} / {reference_outputs}
        // placeholders of the default prompt; extra kwargs pass through untouched.
        return (0, utils_js_1._runEvaluator)(`llm_as_${feedbackKey}_judge`, scorer, feedbackKey, {
            outputs,
            inputs,
            thread: formattedThread,
            referenceOutputs: formattedReferenceOutputs,
            ...extra,
        });
    };
    return _wrappedEvaluator;
};
exports.createGraphTrajectoryLLMAsJudge = createGraphTrajectoryLLMAsJudge;
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
// Type declarations for graph_trajectory/llm. The prompt constant is inlined
// as a string literal type so consumers see the exact default prompt text.
import type { GraphTrajectory, TrajectoryLLMAsJudgeParams } from "../types.js";
export declare const GRAPH_TRAJECTORY_ACCURACY_PROMPT = "You are an expert data labeler.\nYour task is to grade the accuracy of an AI agent's internal steps in resolving a user queries.\n\n<Rubric>\n  An accurate trajectory:\n  - Makes logical sense between steps\n  - Shows clear progression\n  - Is relatively efficient, though it does not need to be perfectly efficient\n  - Is semantically equivalent to the provided reference trajectory, if present\n</Rubric>\n\n<Instructions>\n  Grade the following thread, evaluating whether the agent's overall steps are logical and relatively efficient.\n  For the trajectory, \"__start__\" denotes an initial entrypoint to the agent, and \"__interrupt__\" corresponds to the agent\n  interrupting to await additional data from another source (\"human-in-the-loop\").\n  \n  Steps containing a colon represent steps within subagents (e.g. \"graph:step_name\").\n</Instructions>\n\n<thread>\n{thread}\n</thread>\n\n{reference_outputs}\n";
/**
 * Creates an evaluator that uses an LLM to judge agent trajectories.
 * @param options Configuration options
 * @param [options.prompt] - The evaluation prompt. Can be a string template,
 * LangChain prompt template, or callable that returns a list of chat messages. Note that the default prompt allows a rubric
 * in addition to the typical "inputs", "outputs", and "reference_outputs" parameters.
 * @param [options.feedbackKey="graph_trajectory_accuracy"] - Key used to store the evaluation result
 * @param [options.judge] - The LLM used for evaluation. Can be an OpenAI client
 * or a LangChain chat model. If an OpenAI client, must specify "model" as well.
 * If omitted, "model" will be used to instantiate a LangChain model instance by model string.
 * @param [options.model] - Model identifier to use. If "judge" is an OpenAI client,
 * this argument should be a model name directly. If "judge" is omitted, must be a valid
 * LangChain model identifier. See `init_chat_model` docs for more details:
 * https://python.langchain.com/docs/how_to/chat_models_universal_init/
 * @param [options.continuous=false] - If true, score will be a float between 0 and 1. If false, score will be boolean.
 * @param [options.choices] - Optional list of specific float values the score must be chosen from
 * @param [options.useReasoning=true] - If true, includes explanation for the score in the output
 * @param [options.fewShotExamples] - Optional list of example evaluations to append to the prompt
 * @returns A function that evaluates agent trajectories using the configured LLM judge
 */
export declare const createGraphTrajectoryLLMAsJudge: ({ prompt, model, feedbackKey, judge, continuous, choices, useReasoning, fewShotExamples, }: TrajectoryLLMAsJudgeParams) => ({ inputs, outputs, referenceOutputs, ...extra }: {
    [key: string]: unknown;
    inputs: (string | Record<string, unknown> | null)[] | {
        inputs: (string | Record<string, unknown> | null)[];
    };
    outputs: GraphTrajectory;
    referenceOutputs?: GraphTrajectory | undefined;
}) => Promise<import("langsmith/vitest").SimpleEvaluationResult>;
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
// ESM build of graph_trajectory/llm; mirrors the .cjs module byte-for-byte in
// behavior.
import { _createLLMAsJudgeScorer } from "openevals/llm";
import { _runEvaluator } from "../utils.js";
// Default grading prompt. "{thread}" and "{reference_outputs}" are template
// placeholders filled in at call time by createGraphTrajectoryLLMAsJudge.
export const GRAPH_TRAJECTORY_ACCURACY_PROMPT = `You are an expert data labeler.
Your task is to grade the accuracy of an AI agent's internal steps in resolving a user queries.

<Rubric>
  An accurate trajectory:
  - Makes logical sense between steps
  - Shows clear progression
  - Is relatively efficient, though it does not need to be perfectly efficient
  - Is semantically equivalent to the provided reference trajectory, if present
</Rubric>

<Instructions>
  Grade the following thread, evaluating whether the agent's overall steps are logical and relatively efficient.
  For the trajectory, "__start__" denotes an initial entrypoint to the agent, and "__interrupt__" corresponds to the agent
  interrupting to await additional data from another source ("human-in-the-loop").
  
  Steps containing a colon represent steps within subagents (e.g. "graph:step_name").
</Instructions>

<thread>
{thread}
</thread>

{reference_outputs}
`;
|
|
28
|
+
/**
 * Renders paired agent inputs, steps, and results as an XML-ish transcript
 * for inclusion in the grading prompt.
 *
 * @param inputs - One entry per turn; `null`/`undefined` means the turn had
 *   no external input (e.g. a resumed run) and its `<input>` section is omitted.
 * @param outputs - Graph trajectory with parallel `results` and `steps` arrays.
 * @returns The concatenated `<input>/<trajectory>/<result>` sections.
 */
function _formatThread(inputs, outputs) {
    let formattedThread = "";
    const zippedData = inputs.map((input, i) => ({
        // Map absent inputs to "" so the guard below genuinely skips them.
        // (Previously this was JSON.stringify(input ?? ""), which yields the
        // truthy string '""' for null input, so the conditional was dead and
        // an empty <input>""</input> section was always emitted.)
        input: input != null ? JSON.stringify(input) : "",
        result: JSON.stringify(outputs.results[i]),
        step: JSON.stringify(outputs.steps[i]),
    }));
    for (const { input, result, step } of zippedData) {
        formattedThread += input ? `\n<input>\n${input}\n</input>\n` : "";
        formattedThread += `\n<trajectory>\n${step}\n</trajectory>\n`;
        formattedThread += `\n<result>\n${result}\n</result>\n`;
    }
    return formattedThread;
}
|
|
42
|
+
/**
 * Normalizes evaluator inputs and formats both the graded thread and the
 * optional reference thread for prompt interpolation.
 *
 * @param inputs - Either an array of per-turn inputs, or an object wrapping
 *   that array under an "inputs" key.
 * @param outputs - Graph trajectory whose `results` and `steps` must be the
 *   same length as the input list.
 * @param referenceOutputs - Optional reference trajectory; when present it is
 *   rendered inside a <reference_thread> preamble.
 * @throws If `inputs` is neither shape, or if lengths disagree.
 */
function _formatInputs(inputs, outputs, referenceOutputs) {
    let processedInputs;
    if (Array.isArray(inputs)) {
        processedInputs = inputs;
    }
    else {
        if (!("inputs" in inputs)) {
            throw new Error("inputs must be an array or an object with an 'inputs' key");
        }
        processedInputs = inputs.inputs;
    }
    // Both parallel arrays must line up with the input list, turn for turn.
    if (processedInputs.length !== outputs.results.length) {
        throw new Error("Provided `inputs` and `results` within provided `outputs` must have the same length");
    }
    if (processedInputs.length !== outputs.steps.length) {
        throw new Error("Provided `inputs` and `steps` within provided `outputs` must have the same length");
    }
    const formattedThread = _formatThread(processedInputs, outputs);
    // The reference trajectory may carry its own inputs; fall back to [] since
    // GraphTrajectory does not require them.
    const formattedReferenceOutputs = referenceOutputs
        ? `\nUse the following trajectory as an example reference when grading:\n<reference_thread>\n${_formatThread(referenceOutputs.inputs ?? [], referenceOutputs)}\n</reference_thread>\n`
        : "";
    return {
        formattedThread,
        formattedReferenceOutputs,
    };
}
|
|
68
|
+
/**
 * Creates an evaluator that uses an LLM to judge agent trajectories.
 * @param options Configuration options
 * @param [options.prompt] - The evaluation prompt. Can be a string template,
 * LangChain prompt template, or callable that returns a list of chat messages. Note that the default prompt allows a rubric
 * in addition to the typical "inputs", "outputs", and "reference_outputs" parameters.
 * @param [options.feedbackKey="graph_trajectory_accuracy"] - Key used to store the evaluation result
 * @param [options.judge] - The LLM used for evaluation. Can be an OpenAI client
 * or a LangChain chat model. If an OpenAI client, must specify "model" as well.
 * If omitted, "model" will be used to instantiate a LangChain model instance by model string.
 * @param [options.model] - Model identifier to use. If "judge" is an OpenAI client,
 * this argument should be a model name directly. If "judge" is omitted, must be a valid
 * LangChain model identifier. See `init_chat_model` docs for more details:
 * https://python.langchain.com/docs/how_to/chat_models_universal_init/
 * @param [options.continuous=false] - If true, score will be a float between 0 and 1. If false, score will be boolean.
 * @param [options.choices] - Optional list of specific float values the score must be chosen from
 * @param [options.useReasoning=true] - If true, includes explanation for the score in the output
 * @param [options.fewShotExamples] - Optional list of example evaluations to append to the prompt
 * @returns A function that evaluates agent trajectories using the configured LLM judge
 */
export const createGraphTrajectoryLLMAsJudge = ({ prompt = GRAPH_TRAJECTORY_ACCURACY_PROMPT, model, feedbackKey = "graph_trajectory_accuracy", judge, continuous = false, choices, useReasoning = true, fewShotExamples, }) => {
    // Build the generic scorer once; the wrapper below re-uses it per call.
    const scorer = _createLLMAsJudgeScorer({
        prompt,
        judge,
        model,
        continuous,
        choices,
        useReasoning,
        fewShotExamples,
    });
    const _wrappedEvaluator = async ({ inputs, outputs, referenceOutputs, ...extra }) => {
        const { formattedThread, formattedReferenceOutputs } = _formatInputs(inputs, outputs, referenceOutputs);
        // "thread" / "referenceOutputs" map onto the {thread} / {reference_outputs}
        // placeholders of the default prompt; extra kwargs pass through untouched.
        return _runEvaluator(`llm_as_${feedbackKey}_judge`, scorer, feedbackKey, {
            outputs,
            inputs,
            thread: formattedThread,
            referenceOutputs: formattedReferenceOutputs,
            ...extra,
        });
    };
    return _wrappedEvaluator;
};
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"use strict";
// CommonJS build of graph_trajectory/strict: exact step-for-step trajectory
// comparison, run through the shared evaluator harness.
Object.defineProperty(exports, "__esModule", { value: true });
exports.graphTrajectoryStrictMatch = void 0;
const utils_js_1 = require("../utils.cjs");
|
|
5
|
+
// Scorer for strict matching: the two trajectories must contain the same
// number of turns, and every turn must list the same steps in the same order.
// Throws if either side is missing; returns a boolean score otherwise.
const _scorer = (params) => {
    const { outputs, referenceOutputs } = params;
    if (!outputs || !referenceOutputs) {
        throw new Error("Strict trajectory match requires both outputs and referenceOutputs");
    }
    const actualSteps = outputs.steps;
    const expectedSteps = referenceOutputs.steps;
    if (actualSteps.length !== expectedSteps.length) {
        return false;
    }
    // Compare turn-by-turn, then step-by-step within each turn.
    return actualSteps.every((turn, turnIdx) => {
        const expectedTurn = expectedSteps[turnIdx];
        if (turn.length !== expectedTurn.length) {
            return false;
        }
        return turn.every((step, stepIdx) => step === expectedTurn[stepIdx]);
    });
};
|
|
25
|
+
/**
 * Evaluate whether an input graph trajectory strictly matches a reference graph trajectory.
 * This means that at each step, the agent took the same steps in the same order as specified in the reference trajectory.
 *
 * @param params - The parameters object
 * @param params.outputs - Actual trajectory the agent followed
 * @param params.referenceOutputs - Ideal reference trajectory the agent should have followed
 * @returns Contains a score of true if trajectory (including called tools) matches, false otherwise
 */
const graphTrajectoryStrictMatch = ({ outputs, referenceOutputs, }) => {
    // Delegate to the shared runner so results are traced/logged consistently
    // with the other evaluators in this package.
    return (0, utils_js_1._runEvaluator)("graph_trajectory_strict_match", _scorer, "graph_trajectory_strict_match", {
        outputs,
        referenceOutputs,
    });
};
exports.graphTrajectoryStrictMatch = graphTrajectoryStrictMatch;
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
// Type declarations for graph_trajectory/strict.
import { GraphTrajectory } from "../types.js";
/**
 * Evaluate whether an input graph trajectory strictly matches a reference graph trajectory.
 * This means that at each step, the agent took the same steps in the same order as specified in the reference trajectory.
 *
 * @param params - The parameters object
 * @param params.outputs - Actual trajectory the agent followed
 * @param params.referenceOutputs - Ideal reference trajectory the agent should have followed
 * @returns Contains a score of true if trajectory (including called tools) matches, false otherwise
 */
export declare const graphTrajectoryStrictMatch: ({ outputs, referenceOutputs, }: {
    outputs: GraphTrajectory;
    referenceOutputs: GraphTrajectory;
}) => Promise<import("langsmith/vitest").SimpleEvaluationResult>;
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import { _runEvaluator } from "../utils.js";
|
|
2
|
+
// Scorer for strict matching: the two trajectories must contain the same
// number of turns, and every turn must list the same steps in the same order.
// Throws if either side is missing; returns a boolean score otherwise.
const _scorer = (params) => {
    const { outputs, referenceOutputs } = params;
    if (!outputs || !referenceOutputs) {
        throw new Error("Strict trajectory match requires both outputs and referenceOutputs");
    }
    const actualSteps = outputs.steps;
    const expectedSteps = referenceOutputs.steps;
    if (actualSteps.length !== expectedSteps.length) {
        return false;
    }
    // Compare turn-by-turn, then step-by-step within each turn.
    return actualSteps.every((turn, turnIdx) => {
        const expectedTurn = expectedSteps[turnIdx];
        if (turn.length !== expectedTurn.length) {
            return false;
        }
        return turn.every((step, stepIdx) => step === expectedTurn[stepIdx]);
    });
};
|
|
22
|
+
/**
 * Evaluate whether an input graph trajectory strictly matches a reference graph trajectory.
 * This means that at each step, the agent took the same steps in the same order as specified in the reference trajectory.
 *
 * @param params - The parameters object
 * @param params.outputs - Actual trajectory the agent followed
 * @param params.referenceOutputs - Ideal reference trajectory the agent should have followed
 * @returns Contains a score of true if trajectory (including called tools) matches, false otherwise
 */
export const graphTrajectoryStrictMatch = ({ outputs, referenceOutputs, }) => {
    // Delegate to the shared runner so results are traced/logged consistently
    // with the other evaluators in this package.
    return _runEvaluator("graph_trajectory_strict_match", _scorer, "graph_trajectory_strict_match", {
        outputs,
        referenceOutputs,
    });
};
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
"use strict";
// CommonJS build of graph_trajectory/utils: helpers for extracting a
// GraphTrajectory from LangGraph state-history snapshots.
Object.defineProperty(exports, "__esModule", { value: true });
exports.extractLangGraphTrajectoryFromThread = exports._getLangGraphStateHistoryRecursive = exports.extractLangGraphTrajectoryFromSnapshots = void 0;
const messages_1 = require("@langchain/core/messages");
const openai_1 = require("@langchain/openai");
|
|
6
|
+
/**
 * Converts a list of LangGraph state snapshots (newest-first, as yielded by
 * getStateHistory) into per-turn `inputs` plus a GraphTrajectory of `results`
 * and `steps`. Accumulation starts at the first terminal or interrupted
 * snapshot; everything before that is skipped. All collected arrays are
 * reversed at the end so the returned data reads oldest-first.
 */
const extractLangGraphTrajectoryFromSnapshots = (snapshots) => {
    const inputs = [];
    const trajectory = {
        results: [],
        steps: [],
    };
    let isAccumulatingSteps = false;
    for (let i = 0; i < snapshots.length; i += 1) {
        const snapshot = snapshots[i];
        // Truthy when any task on this snapshot carries interrupts.
        const hasInterrupts = snapshot.tasks?.find((task) => {
            return task.interrupts?.length;
        });
        // A snapshot with no "next" nodes (run finished) or with interrupts
        // marks a turn boundary: open a new result + step bucket.
        if (!snapshot.next?.length || hasInterrupts) {
            isAccumulatingSteps = true;
            if (hasInterrupts) {
                // Interrupted turns have no final result yet.
                trajectory.results.push({});
            }
            else if (snapshot.values != null &&
                typeof snapshot.values === "object" &&
                !Array.isArray(snapshot.values) &&
                "messages" in snapshot.values &&
                Array.isArray(snapshot.values.messages)) {
                const lastMessage = snapshot.values.messages.at(-1);
                if ((0, messages_1.isBaseMessage)(lastMessage)) {
                    // Just append the last message in the output to the results to reduce context size
                    trajectory.results.push({
                        messages: (0, openai_1._convertMessagesToOpenAIParams)([lastMessage]),
                    });
                }
                else {
                    trajectory.results.push({ messages: [lastMessage] });
                }
            }
            else {
                trajectory.results.push(snapshot.values);
            }
            trajectory.steps.push([]);
        }
        if (isAccumulatingSteps && snapshot.tasks?.length) {
            // checkpoint_ns like "parent:child" means this task ran inside a
            // subgraph; prefix the parent name so steps read "graph:step_name".
            const checkpointNs = snapshot.config?.configurable?.checkpoint_ns ?? "";
            let subgraphPath = "";
            if (checkpointNs.split(":").length > 1) {
                subgraphPath = `${checkpointNs.split(":")[0]}:`;
            }
            for (const task of snapshot.tasks) {
                if (task.interrupts?.length) {
                    // NOTE(review): "__interrupt__" is pushed before the task name,
                    // and each step list is reversed below, so it ends up AFTER the
                    // task name in the final ordering — presumably intentional
                    // (interrupt follows the step that raised it); confirm.
                    trajectory.steps.at(-1)?.push("__interrupt__");
                }
                trajectory.steps.at(-1)?.push(`${subgraphPath}${task.name}`);
            }
        }
        if (isAccumulatingSteps) {
            if (snapshot.metadata != null && snapshot.metadata.source === "input") {
                // Explicit user/system input recorded for this checkpoint.
                inputs.push(snapshot.metadata.writes);
            }
            else if (i + 1 < snapshots.length &&
                snapshots[i + 1].tasks?.find((task) => task.interrupts?.length > 0)) {
                // The (chronologically earlier) next snapshot was interrupted, so
                // this turn began by resuming from human-in-the-loop input.
                inputs.push("__resuming__");
            }
        }
    }
    // Snapshots arrive newest-first; flip everything to chronological order.
    inputs.reverse();
    trajectory.results.reverse();
    trajectory.steps.reverse();
    for (const stepList of trajectory.steps) {
        stepList.reverse();
    }
    // NOTE(review): because of the `else if`, the steps-length warning is
    // suppressed whenever the results-length warning fires — verify whether
    // both warnings were meant to be independent.
    if (inputs.length !== trajectory.results.length) {
        console.warn("Trajectory parsing may be incomplete: inputs and results have different lengths");
    }
    else if (inputs.length !== trajectory.steps.length) {
        console.warn("Trajectory parsing may be incomplete: inputs and steps have different lengths");
    }
    return { inputs, outputs: trajectory };
};
exports.extractLangGraphTrajectoryFromSnapshots = extractLangGraphTrajectoryFromSnapshots;
|
|
82
|
+
/**
 * Walks a LangGraph graph's state history for the given config, recursing
 * into any task whose state points at a subgraph checkpoint namespace.
 * Subgraph snapshots are pushed before their parent snapshot, preserving
 * the newest-first ordering expected by the snapshot extractor above.
 */
const _getLangGraphStateHistoryRecursive = async (
// eslint-disable-next-line @typescript-eslint/no-explicit-any
graph, config) => {
    const stateHistory = [];
    for await (const history of graph.getStateHistory(config)) {
        if (history.tasks?.length) {
            for (const task of history.tasks) {
                // A populated checkpoint_ns marks a subgraph; descend into it.
                if (task.state?.configurable?.checkpoint_ns) {
                    // Recursion goes through `exports` so the published binding is used.
                    stateHistory.push(...(await (0, exports._getLangGraphStateHistoryRecursive)(graph, task.state)));
                }
            }
        }
        stateHistory.push(history);
    }
    return stateHistory;
};
exports._getLangGraphStateHistoryRecursive = _getLangGraphStateHistoryRecursive;
|
|
99
|
+
/**
 * Convenience wrapper: fetches the full (recursive) state history for a
 * thread config and converts it into inputs plus a GraphTrajectory.
 */
const extractLangGraphTrajectoryFromThread = async (
// eslint-disable-next-line @typescript-eslint/no-explicit-any
graph, config) => {
    const history = await (0, exports._getLangGraphStateHistoryRecursive)(graph, config);
    return (0, exports.extractLangGraphTrajectoryFromSnapshots)(history);
};
exports.extractLangGraphTrajectoryFromThread = extractLangGraphTrajectoryFromThread;
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
// Type declarations for graph_trajectory/utils.
import type { StateSnapshot, Pregel } from "@langchain/langgraph/web";
import type { RunnableConfig } from "@langchain/core/runnables";
import type { GraphTrajectory } from "../types.js";
// Converts newest-first snapshots into chronological inputs + trajectory.
export declare const extractLangGraphTrajectoryFromSnapshots: (snapshots: StateSnapshot[]) => {
    inputs: (string | Record<string, unknown> | null)[];
    outputs: GraphTrajectory;
};
// Recursively collects state history, descending into subgraph tasks.
export declare const _getLangGraphStateHistoryRecursive: (graph: Pregel<any, any>, config: RunnableConfig) => Promise<StateSnapshot[]>;
// History fetch + snapshot extraction in one call.
export declare const extractLangGraphTrajectoryFromThread: (graph: Pregel<any, any>, config: RunnableConfig) => Promise<{
    inputs: (string | Record<string, unknown> | null)[];
    outputs: GraphTrajectory;
}>;
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
import { isBaseMessage } from "@langchain/core/messages";
|
|
2
|
+
import { _convertMessagesToOpenAIParams } from "@langchain/openai";
|
|
3
|
+
/**
 * Converts a list of LangGraph state snapshots (newest-first, as yielded by
 * getStateHistory) into per-turn `inputs` plus a GraphTrajectory of `results`
 * and `steps`. Accumulation starts at the first terminal or interrupted
 * snapshot; everything before that is skipped. All collected arrays are
 * reversed at the end so the returned data reads oldest-first.
 */
export const extractLangGraphTrajectoryFromSnapshots = (snapshots) => {
    const inputs = [];
    const trajectory = {
        results: [],
        steps: [],
    };
    let isAccumulatingSteps = false;
    for (let i = 0; i < snapshots.length; i += 1) {
        const snapshot = snapshots[i];
        // Truthy when any task on this snapshot carries interrupts.
        const hasInterrupts = snapshot.tasks?.find((task) => {
            return task.interrupts?.length;
        });
        // A snapshot with no "next" nodes (run finished) or with interrupts
        // marks a turn boundary: open a new result + step bucket.
        if (!snapshot.next?.length || hasInterrupts) {
            isAccumulatingSteps = true;
            if (hasInterrupts) {
                // Interrupted turns have no final result yet.
                trajectory.results.push({});
            }
            else if (snapshot.values != null &&
                typeof snapshot.values === "object" &&
                !Array.isArray(snapshot.values) &&
                "messages" in snapshot.values &&
                Array.isArray(snapshot.values.messages)) {
                const lastMessage = snapshot.values.messages.at(-1);
                if (isBaseMessage(lastMessage)) {
                    // Just append the last message in the output to the results to reduce context size
                    trajectory.results.push({
                        messages: _convertMessagesToOpenAIParams([lastMessage]),
                    });
                }
                else {
                    trajectory.results.push({ messages: [lastMessage] });
                }
            }
            else {
                trajectory.results.push(snapshot.values);
            }
            trajectory.steps.push([]);
        }
        if (isAccumulatingSteps && snapshot.tasks?.length) {
            // checkpoint_ns like "parent:child" means this task ran inside a
            // subgraph; prefix the parent name so steps read "graph:step_name".
            const checkpointNs = snapshot.config?.configurable?.checkpoint_ns ?? "";
            let subgraphPath = "";
            if (checkpointNs.split(":").length > 1) {
                subgraphPath = `${checkpointNs.split(":")[0]}:`;
            }
            for (const task of snapshot.tasks) {
                if (task.interrupts?.length) {
                    // NOTE(review): "__interrupt__" is pushed before the task name,
                    // and each step list is reversed below, so it ends up AFTER the
                    // task name in the final ordering — presumably intentional
                    // (interrupt follows the step that raised it); confirm.
                    trajectory.steps.at(-1)?.push("__interrupt__");
                }
                trajectory.steps.at(-1)?.push(`${subgraphPath}${task.name}`);
            }
        }
        if (isAccumulatingSteps) {
            if (snapshot.metadata != null && snapshot.metadata.source === "input") {
                // Explicit user/system input recorded for this checkpoint.
                inputs.push(snapshot.metadata.writes);
            }
            else if (i + 1 < snapshots.length &&
                snapshots[i + 1].tasks?.find((task) => task.interrupts?.length > 0)) {
                // The (chronologically earlier) next snapshot was interrupted, so
                // this turn began by resuming from human-in-the-loop input.
                inputs.push("__resuming__");
            }
        }
    }
    // Snapshots arrive newest-first; flip everything to chronological order.
    inputs.reverse();
    trajectory.results.reverse();
    trajectory.steps.reverse();
    for (const stepList of trajectory.steps) {
        stepList.reverse();
    }
    // NOTE(review): because of the `else if`, the steps-length warning is
    // suppressed whenever the results-length warning fires — verify whether
    // both warnings were meant to be independent.
    if (inputs.length !== trajectory.results.length) {
        console.warn("Trajectory parsing may be incomplete: inputs and results have different lengths");
    }
    else if (inputs.length !== trajectory.steps.length) {
        console.warn("Trajectory parsing may be incomplete: inputs and steps have different lengths");
    }
    return { inputs, outputs: trajectory };
};
|
|
78
|
+
/**
 * Walks a LangGraph graph's state history for the given config, recursing
 * into any task whose state points at a subgraph checkpoint namespace.
 * Subgraph snapshots are pushed before their parent snapshot, preserving
 * the newest-first ordering expected by the snapshot extractor.
 */
export const _getLangGraphStateHistoryRecursive = async (
// eslint-disable-next-line @typescript-eslint/no-explicit-any
graph, config) => {
    const collected = [];
    for await (const snapshot of graph.getStateHistory(config)) {
        for (const task of snapshot.tasks ?? []) {
            // A populated checkpoint_ns marks a subgraph; descend into it first
            // so child snapshots precede their parent in the result.
            if (task.state?.configurable?.checkpoint_ns) {
                const childHistory = await _getLangGraphStateHistoryRecursive(graph, task.state);
                collected.push(...childHistory);
            }
        }
        collected.push(snapshot);
    }
    return collected;
};
|
|
94
|
+
/**
 * Extracts a graph trajectory from a persisted LangGraph thread.
 *
 * Walks the thread's complete state history — including subgraph
 * checkpoints — via `_getLangGraphStateHistoryRecursive`, then converts the
 * collected snapshots into trajectory form.
 *
 * @param graph - Compiled LangGraph whose checkpointer holds the thread.
 * @param config - Runnable config identifying the thread to extract.
 * @returns Promise resolving to the parsed `{ inputs, outputs }` trajectory.
 */
export const extractLangGraphTrajectoryFromThread = async (
// eslint-disable-next-line @typescript-eslint/no-explicit-any
graph, config) => {
    const snapshots = await _getLangGraphStateHistoryRecursive(graph, config);
    return extractLangGraphTrajectoryFromSnapshots(snapshots);
};
|
package/dist/index.cjs
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
var __exportStar = (this && this.__exportStar) || function(m, exports) {
|
|
14
|
+
for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
|
|
15
|
+
};
|
|
16
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
17
|
+
exports.GRAPH_TRAJECTORY_ACCURACY_PROMPT = exports.createGraphTrajectoryLLMAsJudge = exports.TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE = exports.TRAJECTORY_ACCURACY_PROMPT = exports.createTrajectoryLLMAsJudge = exports.trajectoryUnorderedMatch = exports.trajectorySuperset = exports.trajectorySubset = exports.trajectoryStrictMatch = void 0;
|
|
18
|
+
var strict_js_1 = require("./trajectory/strict.cjs");
|
|
19
|
+
Object.defineProperty(exports, "trajectoryStrictMatch", { enumerable: true, get: function () { return strict_js_1.trajectoryStrictMatch; } });
|
|
20
|
+
var subset_js_1 = require("./trajectory/subset.cjs");
|
|
21
|
+
Object.defineProperty(exports, "trajectorySubset", { enumerable: true, get: function () { return subset_js_1.trajectorySubset; } });
|
|
22
|
+
var superset_js_1 = require("./trajectory/superset.cjs");
|
|
23
|
+
Object.defineProperty(exports, "trajectorySuperset", { enumerable: true, get: function () { return superset_js_1.trajectorySuperset; } });
|
|
24
|
+
var unordered_js_1 = require("./trajectory/unordered.cjs");
|
|
25
|
+
Object.defineProperty(exports, "trajectoryUnorderedMatch", { enumerable: true, get: function () { return unordered_js_1.trajectoryUnorderedMatch; } });
|
|
26
|
+
var llm_js_1 = require("./trajectory/llm.cjs");
|
|
27
|
+
Object.defineProperty(exports, "createTrajectoryLLMAsJudge", { enumerable: true, get: function () { return llm_js_1.createTrajectoryLLMAsJudge; } });
|
|
28
|
+
Object.defineProperty(exports, "TRAJECTORY_ACCURACY_PROMPT", { enumerable: true, get: function () { return llm_js_1.TRAJECTORY_ACCURACY_PROMPT; } });
|
|
29
|
+
Object.defineProperty(exports, "TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE", { enumerable: true, get: function () { return llm_js_1.TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE; } });
|
|
30
|
+
var llm_js_2 = require("./graph_trajectory/llm.cjs");
|
|
31
|
+
Object.defineProperty(exports, "createGraphTrajectoryLLMAsJudge", { enumerable: true, get: function () { return llm_js_2.createGraphTrajectoryLLMAsJudge; } });
|
|
32
|
+
Object.defineProperty(exports, "GRAPH_TRAJECTORY_ACCURACY_PROMPT", { enumerable: true, get: function () { return llm_js_2.GRAPH_TRAJECTORY_ACCURACY_PROMPT; } });
|
|
33
|
+
__exportStar(require("./types.cjs"), exports);
|
|
34
|
+
__exportStar(require("./utils.cjs"), exports);
|
|
35
|
+
__exportStar(require("./graph_trajectory/utils.cjs"), exports);
|
package/dist/index.d.ts
CHANGED
|
@@ -2,5 +2,8 @@ export { trajectoryStrictMatch } from "./trajectory/strict.js";
|
|
|
2
2
|
export { trajectorySubset } from "./trajectory/subset.js";
|
|
3
3
|
export { trajectorySuperset } from "./trajectory/superset.js";
|
|
4
4
|
export { trajectoryUnorderedMatch } from "./trajectory/unordered.js";
|
|
5
|
-
export { createTrajectoryLLMAsJudge,
|
|
5
|
+
export { createTrajectoryLLMAsJudge, TRAJECTORY_ACCURACY_PROMPT, TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE, } from "./trajectory/llm.js";
|
|
6
|
+
export { createGraphTrajectoryLLMAsJudge, GRAPH_TRAJECTORY_ACCURACY_PROMPT, } from "./graph_trajectory/llm.js";
|
|
6
7
|
export * from "./types.js";
|
|
8
|
+
export * from "./utils.js";
|
|
9
|
+
export * from "./graph_trajectory/utils.js";
|
package/dist/index.js
CHANGED
|
@@ -2,5 +2,8 @@ export { trajectoryStrictMatch } from "./trajectory/strict.js";
|
|
|
2
2
|
export { trajectorySubset } from "./trajectory/subset.js";
|
|
3
3
|
export { trajectorySuperset } from "./trajectory/superset.js";
|
|
4
4
|
export { trajectoryUnorderedMatch } from "./trajectory/unordered.js";
|
|
5
|
-
export { createTrajectoryLLMAsJudge,
|
|
5
|
+
export { createTrajectoryLLMAsJudge, TRAJECTORY_ACCURACY_PROMPT, TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE, } from "./trajectory/llm.js";
|
|
6
|
+
export { createGraphTrajectoryLLMAsJudge, GRAPH_TRAJECTORY_ACCURACY_PROMPT, } from "./graph_trajectory/llm.js";
|
|
6
7
|
export * from "./types.js";
|
|
8
|
+
export * from "./utils.js";
|
|
9
|
+
export * from "./graph_trajectory/utils.js";
|