agentevals 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. package/dist/graph_trajectory/llm.cjs +113 -0
  2. package/dist/graph_trajectory/llm.d.ts +30 -0
  3. package/dist/graph_trajectory/llm.js +109 -0
  4. package/dist/graph_trajectory/strict.cjs +40 -0
  5. package/dist/graph_trajectory/strict.d.ts +14 -0
  6. package/dist/graph_trajectory/strict.js +36 -0
  7. package/dist/graph_trajectory/utils.cjs +105 -0
  8. package/dist/graph_trajectory/utils.d.ts +12 -0
  9. package/dist/graph_trajectory/utils.js +99 -0
  10. package/dist/index.cjs +35 -0
  11. package/dist/index.d.ts +4 -1
  12. package/dist/index.js +4 -1
  13. package/dist/trajectory/llm.cjs +117 -0
  14. package/dist/trajectory/llm.d.ts +5 -18
  15. package/dist/trajectory/llm.js +36 -10
  16. package/dist/trajectory/strict.cjs +74 -0
  17. package/dist/trajectory/strict.d.ts +2 -0
  18. package/dist/trajectory/strict.js +18 -2
  19. package/dist/trajectory/subset.cjs +32 -0
  20. package/dist/trajectory/superset.cjs +32 -0
  21. package/dist/trajectory/unordered.cjs +33 -0
  22. package/dist/trajectory/utils.cjs +69 -0
  23. package/dist/trajectory/utils.js +1 -0
  24. package/dist/types.cjs +17 -0
  25. package/dist/types.d.ts +11 -41
  26. package/dist/types.js +1 -1
  27. package/dist/utils.cjs +56 -0
  28. package/dist/utils.d.ts +2 -3
  29. package/dist/utils.js +3 -35
  30. package/index.cjs +1 -0
  31. package/index.d.ts +1 -0
  32. package/index.js +1 -0
  33. package/package.json +2 -2
  34. package/dist/trajectory/tests/trajectory.test.d.ts +0 -1
  35. package/dist/trajectory/tests/trajectory.test.js +0 -724
  36. package/dist/trajectory/tests/trajectory_llm.test.d.ts +0 -1
  37. package/dist/trajectory/tests/trajectory_llm.test.js +0 -110
@@ -0,0 +1,113 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.createGraphTrajectoryLLMAsJudge = exports.GRAPH_TRAJECTORY_ACCURACY_PROMPT = void 0;
4
+ const llm_1 = require("openevals/llm");
5
+ const utils_js_1 = require("../utils.cjs");
6
+ exports.GRAPH_TRAJECTORY_ACCURACY_PROMPT = `You are an expert data labeler.
7
+ Your task is to grade the accuracy of an AI agent's internal steps in resolving a user queries.
8
+
9
+ <Rubric>
10
+ An accurate trajectory:
11
+ - Makes logical sense between steps
12
+ - Shows clear progression
13
+ - Is relatively efficient, though it does not need to be perfectly efficient
14
+ - Is semantically equivalent to the provided reference trajectory, if present
15
+ </Rubric>
16
+
17
+ <Instructions>
18
+ Grade the following thread, evaluating whether the agent's overall steps are logical and relatively efficient.
19
+ For the trajectory, "__start__" denotes an initial entrypoint to the agent, and "__interrupt__" corresponds to the agent
20
+ interrupting to await additional data from another source ("human-in-the-loop").
21
+
22
+ Steps containing a colon represent steps within subagents (e.g. "graph:step_name").
23
+ </Instructions>
24
+
25
+ <thread>
26
+ {thread}
27
+ </thread>
28
+
29
+ {reference_outputs}
30
+ `;
31
/**
 * Renders a graph trajectory as a tagged text thread for the judge prompt.
 *
 * Each turn contributes an optional `<input>` block followed by a
 * `<trajectory>` block (the JSON-encoded step list) and a `<result>` block
 * (the JSON-encoded result).
 *
 * @param inputs - Per-turn inputs; an entry may be null/undefined/empty string,
 *   in which case no `<input>` block is written for that turn.
 * @param outputs - Object holding parallel `results` and `steps` arrays.
 * @returns The concatenated thread text ("" when there are no turns).
 */
function _formatThread(inputs, outputs) {
    // Drive the loop by the longest of the three arrays so that a trajectory
    // supplied without inputs still has its steps and results rendered.
    const turnCount = Math.max(inputs.length, outputs.steps.length, outputs.results.length);
    let formattedThread = "";
    for (let i = 0; i < turnCount; i += 1) {
        const input = inputs[i];
        // Test the raw value: JSON.stringify("") is the truthy string '""',
        // so checking the serialized form could never skip an empty input.
        if (input != null && input !== "") {
            formattedThread += `\n<input>\n${JSON.stringify(input)}\n</input>\n`;
        }
        formattedThread += `\n<trajectory>\n${JSON.stringify(outputs.steps[i])}\n</trajectory>\n`;
        formattedThread += `\n<result>\n${JSON.stringify(outputs.results[i])}\n</result>\n`;
    }
    return formattedThread;
}
45
/**
 * Validates the evaluator arguments and renders the two prompt fragments:
 * the thread under evaluation and the optional reference thread.
 *
 * @param inputs - Either an array of per-turn inputs or an object whose
 *   `inputs` property is such an array.
 * @param outputs - Trajectory with parallel `results` and `steps` arrays.
 * @param referenceOutputs - Optional reference trajectory; may carry its own
 *   `inputs` array.
 * @returns `{ formattedThread, formattedReferenceOutputs }`, where the latter
 *   is "" when no reference trajectory is given.
 * @throws Error when `inputs` has neither accepted shape, or when its length
 *   differs from `outputs.results` or `outputs.steps`.
 */
function _formatInputs(inputs, outputs, referenceOutputs) {
    let processedInputs;
    if (Array.isArray(inputs)) {
        processedInputs = inputs;
    }
    else if (inputs != null && typeof inputs === "object" && Array.isArray(inputs.inputs)) {
        // Checking the type first avoids the TypeError that the `in` operator
        // raises for null and primitive values.
        processedInputs = inputs.inputs;
    }
    else {
        throw new Error("inputs must be an array or an object with an 'inputs' key");
    }
    if (processedInputs.length !== outputs.results.length) {
        throw new Error("Provided `inputs` and `results` within provided `outputs` must have the same length");
    }
    if (processedInputs.length !== outputs.steps.length) {
        throw new Error("Provided `inputs` and `steps` within provided `outputs` must have the same length");
    }
    const formattedThread = _formatThread(processedInputs, outputs);
    let formattedReferenceOutputs = "";
    if (referenceOutputs) {
        // A reference trajectory without inputs gets one null placeholder per
        // step list, so that its steps and results are still rendered instead
        // of an empty reference thread.
        const referenceInputs = referenceOutputs.inputs ??
            Array.from({ length: referenceOutputs.steps?.length ?? 0 }, () => null);
        formattedReferenceOutputs = `\nUse the following trajectory as an example reference when grading:\n<reference_thread>\n${_formatThread(referenceInputs, referenceOutputs)}\n</reference_thread>\n`;
    }
    return {
        formattedThread,
        formattedReferenceOutputs,
    };
}
71
+ /**
72
+ * Creates an evaluator that uses an LLM to judge agent trajectories.
73
+ * @param options Configuration options
74
+ * @param [options.prompt] - The evaluation prompt. Can be a string template,
75
+ * LangChain prompt template, or callable that returns a list of chat messages. Note that the default prompt allows a rubric
76
+ * in addition to the typical "inputs", "outputs", and "reference_outputs" parameters.
77
+ * @param [options.feedbackKey="graph_trajectory_accuracy"] - Key used to store the evaluation result
78
+ * @param [options.judge] - The LLM used for evaluation. Can be an OpenAI client
79
+ * or a LangChain chat model. If an OpenAI client, must specify "model" as well.
80
+ * If omitted, "model" will be used to instantiate a LangChain model instance by model string.
81
+ * @param [options.model] - Model identifier to use. If "judge" is an OpenAI client,
82
+ * this argument should be a model name directly. If "judge" is omitted, must be a valid
83
+ * LangChain model identifier. See `init_chat_model` docs for more details:
84
+ * https://python.langchain.com/docs/how_to/chat_models_universal_init/
85
+ * @param [options.continuous=false] - If true, score will be a float between 0 and 1. If false, score will be boolean.
86
+ * @param [options.choices] - Optional list of specific float values the score must be chosen from
87
+ * @param [options.useReasoning=true] - If true, includes explanation for the score in the output
88
+ * @param [options.fewShotExamples] - Optional list of example evaluations to append to the prompt
89
+ * @returns A function that evaluates agent trajectories using the configured LLM judge
90
+ */
91
+ const createGraphTrajectoryLLMAsJudge = ({ prompt = exports.GRAPH_TRAJECTORY_ACCURACY_PROMPT, model, feedbackKey = "graph_trajectory_accuracy", judge, continuous = false, choices, useReasoning = true, fewShotExamples, }) => {
92
+ const scorer = (0, llm_1._createLLMAsJudgeScorer)({
93
+ prompt,
94
+ judge,
95
+ model,
96
+ continuous,
97
+ choices,
98
+ useReasoning,
99
+ fewShotExamples,
100
+ });
101
+ const _wrappedEvaluator = async ({ inputs, outputs, referenceOutputs, ...extra }) => {
102
+ const { formattedThread, formattedReferenceOutputs } = _formatInputs(inputs, outputs, referenceOutputs);
103
+ return (0, utils_js_1._runEvaluator)(`llm_as_${feedbackKey}_judge`, scorer, feedbackKey, {
104
+ outputs,
105
+ inputs,
106
+ thread: formattedThread,
107
+ referenceOutputs: formattedReferenceOutputs,
108
+ ...extra,
109
+ });
110
+ };
111
+ return _wrappedEvaluator;
112
+ };
113
+ exports.createGraphTrajectoryLLMAsJudge = createGraphTrajectoryLLMAsJudge;
@@ -0,0 +1,30 @@
1
+ import type { GraphTrajectory, TrajectoryLLMAsJudgeParams } from "../types.js";
2
+ export declare const GRAPH_TRAJECTORY_ACCURACY_PROMPT = "You are an expert data labeler.\nYour task is to grade the accuracy of an AI agent's internal steps in resolving a user queries.\n\n<Rubric>\n An accurate trajectory:\n - Makes logical sense between steps\n - Shows clear progression\n - Is relatively efficient, though it does not need to be perfectly efficient\n - Is semantically equivalent to the provided reference trajectory, if present\n</Rubric>\n\n<Instructions>\n Grade the following thread, evaluating whether the agent's overall steps are logical and relatively efficient.\n For the trajectory, \"__start__\" denotes an initial entrypoint to the agent, and \"__interrupt__\" corresponds to the agent\n interrupting to await additional data from another source (\"human-in-the-loop\").\n \n Steps containing a colon represent steps within subagents (e.g. \"graph:step_name\").\n</Instructions>\n\n<thread>\n{thread}\n</thread>\n\n{reference_outputs}\n";
3
+ /**
4
+ * Creates an evaluator that uses an LLM to judge agent trajectories.
5
+ * @param options Configuration options
6
+ * @param [options.prompt] - The evaluation prompt. Can be a string template,
7
+ * LangChain prompt template, or callable that returns a list of chat messages. Note that the default prompt allows a rubric
8
+ * in addition to the typical "inputs", "outputs", and "reference_outputs" parameters.
9
+ * @param [options.feedbackKey="graph_trajectory_accuracy"] - Key used to store the evaluation result
10
+ * @param [options.judge] - The LLM used for evaluation. Can be an OpenAI client
11
+ * or a LangChain chat model. If an OpenAI client, must specify "model" as well.
12
+ * If omitted, "model" will be used to instantiate a LangChain model instance by model string.
13
+ * @param [options.model] - Model identifier to use. If "judge" is an OpenAI client,
14
+ * this argument should be a model name directly. If "judge" is omitted, must be a valid
15
+ * LangChain model identifier. See `init_chat_model` docs for more details:
16
+ * https://python.langchain.com/docs/how_to/chat_models_universal_init/
17
+ * @param [options.continuous=false] - If true, score will be a float between 0 and 1. If false, score will be boolean.
18
+ * @param [options.choices] - Optional list of specific float values the score must be chosen from
19
+ * @param [options.useReasoning=true] - If true, includes explanation for the score in the output
20
+ * @param [options.fewShotExamples] - Optional list of example evaluations to append to the prompt
21
+ * @returns A function that evaluates agent trajectories using the configured LLM judge
22
+ */
23
+ export declare const createGraphTrajectoryLLMAsJudge: ({ prompt, model, feedbackKey, judge, continuous, choices, useReasoning, fewShotExamples, }: TrajectoryLLMAsJudgeParams) => ({ inputs, outputs, referenceOutputs, ...extra }: {
24
+ [key: string]: unknown;
25
+ inputs: (string | Record<string, unknown> | null)[] | {
26
+ inputs: (string | Record<string, unknown> | null)[];
27
+ };
28
+ outputs: GraphTrajectory;
29
+ referenceOutputs?: GraphTrajectory | undefined;
30
+ }) => Promise<import("langsmith/vitest").SimpleEvaluationResult>;
@@ -0,0 +1,109 @@
1
+ import { _createLLMAsJudgeScorer } from "openevals/llm";
2
+ import { _runEvaluator } from "../utils.js";
3
+ export const GRAPH_TRAJECTORY_ACCURACY_PROMPT = `You are an expert data labeler.
4
+ Your task is to grade the accuracy of an AI agent's internal steps in resolving a user queries.
5
+
6
+ <Rubric>
7
+ An accurate trajectory:
8
+ - Makes logical sense between steps
9
+ - Shows clear progression
10
+ - Is relatively efficient, though it does not need to be perfectly efficient
11
+ - Is semantically equivalent to the provided reference trajectory, if present
12
+ </Rubric>
13
+
14
+ <Instructions>
15
+ Grade the following thread, evaluating whether the agent's overall steps are logical and relatively efficient.
16
+ For the trajectory, "__start__" denotes an initial entrypoint to the agent, and "__interrupt__" corresponds to the agent
17
+ interrupting to await additional data from another source ("human-in-the-loop").
18
+
19
+ Steps containing a colon represent steps within subagents (e.g. "graph:step_name").
20
+ </Instructions>
21
+
22
+ <thread>
23
+ {thread}
24
+ </thread>
25
+
26
+ {reference_outputs}
27
+ `;
28
/**
 * Renders a graph trajectory as a tagged text thread for the judge prompt.
 *
 * Each turn contributes an optional `<input>` block followed by a
 * `<trajectory>` block (the JSON-encoded step list) and a `<result>` block
 * (the JSON-encoded result).
 *
 * @param inputs - Per-turn inputs; an entry may be null/undefined/empty string,
 *   in which case no `<input>` block is written for that turn.
 * @param outputs - Object holding parallel `results` and `steps` arrays.
 * @returns The concatenated thread text ("" when there are no turns).
 */
function _formatThread(inputs, outputs) {
    // Drive the loop by the longest of the three arrays so that a trajectory
    // supplied without inputs still has its steps and results rendered.
    const turnCount = Math.max(inputs.length, outputs.steps.length, outputs.results.length);
    let formattedThread = "";
    for (let i = 0; i < turnCount; i += 1) {
        const input = inputs[i];
        // Test the raw value: JSON.stringify("") is the truthy string '""',
        // so checking the serialized form could never skip an empty input.
        if (input != null && input !== "") {
            formattedThread += `\n<input>\n${JSON.stringify(input)}\n</input>\n`;
        }
        formattedThread += `\n<trajectory>\n${JSON.stringify(outputs.steps[i])}\n</trajectory>\n`;
        formattedThread += `\n<result>\n${JSON.stringify(outputs.results[i])}\n</result>\n`;
    }
    return formattedThread;
}
42
/**
 * Validates the evaluator arguments and renders the two prompt fragments:
 * the thread under evaluation and the optional reference thread.
 *
 * @param inputs - Either an array of per-turn inputs or an object whose
 *   `inputs` property is such an array.
 * @param outputs - Trajectory with parallel `results` and `steps` arrays.
 * @param referenceOutputs - Optional reference trajectory; may carry its own
 *   `inputs` array.
 * @returns `{ formattedThread, formattedReferenceOutputs }`, where the latter
 *   is "" when no reference trajectory is given.
 * @throws Error when `inputs` has neither accepted shape, or when its length
 *   differs from `outputs.results` or `outputs.steps`.
 */
function _formatInputs(inputs, outputs, referenceOutputs) {
    let processedInputs;
    if (Array.isArray(inputs)) {
        processedInputs = inputs;
    }
    else if (inputs != null && typeof inputs === "object" && Array.isArray(inputs.inputs)) {
        // Checking the type first avoids the TypeError that the `in` operator
        // raises for null and primitive values.
        processedInputs = inputs.inputs;
    }
    else {
        throw new Error("inputs must be an array or an object with an 'inputs' key");
    }
    if (processedInputs.length !== outputs.results.length) {
        throw new Error("Provided `inputs` and `results` within provided `outputs` must have the same length");
    }
    if (processedInputs.length !== outputs.steps.length) {
        throw new Error("Provided `inputs` and `steps` within provided `outputs` must have the same length");
    }
    const formattedThread = _formatThread(processedInputs, outputs);
    let formattedReferenceOutputs = "";
    if (referenceOutputs) {
        // A reference trajectory without inputs gets one null placeholder per
        // step list, so that its steps and results are still rendered instead
        // of an empty reference thread.
        const referenceInputs = referenceOutputs.inputs ??
            Array.from({ length: referenceOutputs.steps?.length ?? 0 }, () => null);
        formattedReferenceOutputs = `\nUse the following trajectory as an example reference when grading:\n<reference_thread>\n${_formatThread(referenceInputs, referenceOutputs)}\n</reference_thread>\n`;
    }
    return {
        formattedThread,
        formattedReferenceOutputs,
    };
}
68
+ /**
69
+ * Creates an evaluator that uses an LLM to judge agent trajectories.
70
+ * @param options Configuration options
71
+ * @param [options.prompt] - The evaluation prompt. Can be a string template,
72
+ * LangChain prompt template, or callable that returns a list of chat messages. Note that the default prompt allows a rubric
73
+ * in addition to the typical "inputs", "outputs", and "reference_outputs" parameters.
74
+ * @param [options.feedbackKey="graph_trajectory_accuracy"] - Key used to store the evaluation result
75
+ * @param [options.judge] - The LLM used for evaluation. Can be an OpenAI client
76
+ * or a LangChain chat model. If an OpenAI client, must specify "model" as well.
77
+ * If omitted, "model" will be used to instantiate a LangChain model instance by model string.
78
+ * @param [options.model] - Model identifier to use. If "judge" is an OpenAI client,
79
+ * this argument should be a model name directly. If "judge" is omitted, must be a valid
80
+ * LangChain model identifier. See `init_chat_model` docs for more details:
81
+ * https://python.langchain.com/docs/how_to/chat_models_universal_init/
82
+ * @param [options.continuous=false] - If true, score will be a float between 0 and 1. If false, score will be boolean.
83
+ * @param [options.choices] - Optional list of specific float values the score must be chosen from
84
+ * @param [options.useReasoning=true] - If true, includes explanation for the score in the output
85
+ * @param [options.fewShotExamples] - Optional list of example evaluations to append to the prompt
86
+ * @returns A function that evaluates agent trajectories using the configured LLM judge
87
+ */
88
+ export const createGraphTrajectoryLLMAsJudge = ({ prompt = GRAPH_TRAJECTORY_ACCURACY_PROMPT, model, feedbackKey = "graph_trajectory_accuracy", judge, continuous = false, choices, useReasoning = true, fewShotExamples, }) => {
89
+ const scorer = _createLLMAsJudgeScorer({
90
+ prompt,
91
+ judge,
92
+ model,
93
+ continuous,
94
+ choices,
95
+ useReasoning,
96
+ fewShotExamples,
97
+ });
98
+ const _wrappedEvaluator = async ({ inputs, outputs, referenceOutputs, ...extra }) => {
99
+ const { formattedThread, formattedReferenceOutputs } = _formatInputs(inputs, outputs, referenceOutputs);
100
+ return _runEvaluator(`llm_as_${feedbackKey}_judge`, scorer, feedbackKey, {
101
+ outputs,
102
+ inputs,
103
+ thread: formattedThread,
104
+ referenceOutputs: formattedReferenceOutputs,
105
+ ...extra,
106
+ });
107
+ };
108
+ return _wrappedEvaluator;
109
+ };
@@ -0,0 +1,40 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.graphTrajectoryStrictMatch = void 0;
4
+ const utils_js_1 = require("../utils.cjs");
5
+ const _scorer = (params) => {
6
+ const { outputs, referenceOutputs } = params;
7
+ if (!outputs || !referenceOutputs) {
8
+ throw new Error("Strict trajectory match requires both outputs and referenceOutputs");
9
+ }
10
+ if (outputs.steps.length !== referenceOutputs.steps.length) {
11
+ return false;
12
+ }
13
+ for (let i = 0; i < outputs.steps.length; i++) {
14
+ if (outputs.steps[i].length !== referenceOutputs.steps[i].length) {
15
+ return false;
16
+ }
17
+ for (let j = 0; j < outputs.steps[i].length; j++) {
18
+ if (outputs.steps[i][j] !== referenceOutputs.steps[i][j]) {
19
+ return false;
20
+ }
21
+ }
22
+ }
23
+ return true;
24
+ };
25
+ /**
26
+ * Evaluate whether an input graph trajectory strictly matches a reference graph trajectory.
27
+ * This means that at each step, the agent took the same steps in the same order as specified in the reference trajectory.
28
+ *
29
+ * @param params - The parameters object
30
+ * @param params.outputs - Actual trajectory the agent followed
31
+ * @param params.referenceOutputs - Ideal reference trajectory the agent should have followed
32
+ * @returns Contains a score of true if trajectory (including called tools) matches, false otherwise
33
+ */
34
+ const graphTrajectoryStrictMatch = ({ outputs, referenceOutputs, }) => {
35
+ return (0, utils_js_1._runEvaluator)("graph_trajectory_strict_match", _scorer, "graph_trajectory_strict_match", {
36
+ outputs,
37
+ referenceOutputs,
38
+ });
39
+ };
40
+ exports.graphTrajectoryStrictMatch = graphTrajectoryStrictMatch;
@@ -0,0 +1,14 @@
1
+ import { GraphTrajectory } from "../types.js";
2
+ /**
3
+ * Evaluate whether an input graph trajectory strictly matches a reference graph trajectory.
4
+ * This means that at each step, the agent took the same steps in the same order as specified in the reference trajectory.
5
+ *
6
+ * @param params - The parameters object
7
+ * @param params.outputs - Actual trajectory the agent followed
8
+ * @param params.referenceOutputs - Ideal reference trajectory the agent should have followed
9
+ * @returns Contains a score of true if trajectory (including called tools) matches, false otherwise
10
+ */
11
+ export declare const graphTrajectoryStrictMatch: ({ outputs, referenceOutputs, }: {
12
+ outputs: GraphTrajectory;
13
+ referenceOutputs: GraphTrajectory;
14
+ }) => Promise<import("langsmith/vitest").SimpleEvaluationResult>;
@@ -0,0 +1,36 @@
1
+ import { _runEvaluator } from "../utils.js";
2
+ const _scorer = (params) => {
3
+ const { outputs, referenceOutputs } = params;
4
+ if (!outputs || !referenceOutputs) {
5
+ throw new Error("Strict trajectory match requires both outputs and referenceOutputs");
6
+ }
7
+ if (outputs.steps.length !== referenceOutputs.steps.length) {
8
+ return false;
9
+ }
10
+ for (let i = 0; i < outputs.steps.length; i++) {
11
+ if (outputs.steps[i].length !== referenceOutputs.steps[i].length) {
12
+ return false;
13
+ }
14
+ for (let j = 0; j < outputs.steps[i].length; j++) {
15
+ if (outputs.steps[i][j] !== referenceOutputs.steps[i][j]) {
16
+ return false;
17
+ }
18
+ }
19
+ }
20
+ return true;
21
+ };
22
+ /**
23
+ * Evaluate whether an input graph trajectory strictly matches a reference graph trajectory.
24
+ * This means that at each step, the agent took the same steps in the same order as specified in the reference trajectory.
25
+ *
26
+ * @param params - The parameters object
27
+ * @param params.outputs - Actual trajectory the agent followed
28
+ * @param params.referenceOutputs - Ideal reference trajectory the agent should have followed
29
+ * @returns Contains a score of true if trajectory (including called tools) matches, false otherwise
30
+ */
31
+ export const graphTrajectoryStrictMatch = ({ outputs, referenceOutputs, }) => {
32
+ return _runEvaluator("graph_trajectory_strict_match", _scorer, "graph_trajectory_strict_match", {
33
+ outputs,
34
+ referenceOutputs,
35
+ });
36
+ };
@@ -0,0 +1,105 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.extractLangGraphTrajectoryFromThread = exports._getLangGraphStateHistoryRecursive = exports.extractLangGraphTrajectoryFromSnapshots = void 0;
4
+ const messages_1 = require("@langchain/core/messages");
5
+ const openai_1 = require("@langchain/openai");
6
/**
 * Builds a graph trajectory (inputs, results and per-turn step lists) from a
 * list of state snapshots.
 *
 * The snapshots are walked in the order given and every collected list is
 * reversed at the end, so the function presumably expects newest-first
 * history (TODO confirm against the `getStateHistory` ordering).
 *
 * A new turn starts at each snapshot that has no `next` nodes or that has an
 * interrupted task. Task names of subsequent snapshots are appended to that
 * turn, prefixed with "<segment>:" when the snapshot's `checkpoint_ns`
 * contains a colon. An interrupted task additionally records "__interrupt__".
 *
 * @param snapshots - State snapshots to convert.
 * @returns `{ inputs, outputs }` where `outputs` has `results` and `steps`.
 */
const extractLangGraphTrajectoryFromSnapshots = (snapshots) => {
    const inputs = [];
    const trajectory = {
        results: [],
        steps: [],
    };
    const taskHasInterrupts = (task) => (task.interrupts?.length ?? 0) > 0;
    let isAccumulatingSteps = false;
    for (let i = 0; i < snapshots.length; i += 1) {
        const snapshot = snapshots[i];
        const hasInterrupts = snapshot.tasks?.some(taskHasInterrupts) ?? false;
        if (!snapshot.next?.length || hasInterrupts) {
            isAccumulatingSteps = true;
            if (hasInterrupts) {
                trajectory.results.push({});
            }
            else if (snapshot.values != null &&
                typeof snapshot.values === "object" &&
                !Array.isArray(snapshot.values) &&
                "messages" in snapshot.values &&
                Array.isArray(snapshot.values.messages)) {
                const { messages } = snapshot.values;
                if (messages.length === 0) {
                    // No last message exists; avoid emitting `[undefined]`.
                    trajectory.results.push({ messages: [] });
                }
                else {
                    const lastMessage = messages.at(-1);
                    if ((0, messages_1.isBaseMessage)(lastMessage)) {
                        // Just append the last message in the output to the results to reduce context size
                        trajectory.results.push({
                            messages: (0, openai_1._convertMessagesToOpenAIParams)([lastMessage]),
                        });
                    }
                    else {
                        trajectory.results.push({ messages: [lastMessage] });
                    }
                }
            }
            else {
                trajectory.results.push(snapshot.values);
            }
            trajectory.steps.push([]);
        }
        if (isAccumulatingSteps && snapshot.tasks?.length) {
            const checkpointNs = snapshot.config?.configurable?.checkpoint_ns ?? "";
            const nsSegments = checkpointNs.split(":");
            // Only the first namespace segment is used as the subgraph prefix.
            const subgraphPath = nsSegments.length > 1 ? `${nsSegments[0]}:` : "";
            const currentSteps = trajectory.steps.at(-1);
            for (const task of snapshot.tasks) {
                if (taskHasInterrupts(task)) {
                    currentSteps?.push("__interrupt__");
                }
                currentSteps?.push(`${subgraphPath}${task.name}`);
            }
        }
        if (isAccumulatingSteps) {
            if (snapshot.metadata != null && snapshot.metadata.source === "input") {
                inputs.push(snapshot.metadata.writes);
            }
            else if (i + 1 < snapshots.length &&
                snapshots[i + 1].tasks?.some(taskHasInterrupts)) {
                // The following snapshot in the list was interrupted, so this
                // one is recorded as a resumption rather than a fresh input.
                inputs.push("__resuming__");
            }
        }
    }
    inputs.reverse();
    trajectory.results.reverse();
    trajectory.steps.reverse();
    for (const stepList of trajectory.steps) {
        stepList.reverse();
    }
    if (inputs.length !== trajectory.results.length) {
        console.warn("Trajectory parsing may be incomplete: inputs and results have different lengths");
    }
    else if (inputs.length !== trajectory.steps.length) {
        console.warn("Trajectory parsing may be incomplete: inputs and steps have different lengths");
    }
    return { inputs, outputs: trajectory };
};
81
+ exports.extractLangGraphTrajectoryFromSnapshots = extractLangGraphTrajectoryFromSnapshots;
82
+ const _getLangGraphStateHistoryRecursive = async (
83
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
84
+ graph, config) => {
85
+ const stateHistory = [];
86
+ for await (const history of graph.getStateHistory(config)) {
87
+ if (history.tasks?.length) {
88
+ for (const task of history.tasks) {
89
+ if (task.state?.configurable?.checkpoint_ns) {
90
+ stateHistory.push(...(await (0, exports._getLangGraphStateHistoryRecursive)(graph, task.state)));
91
+ }
92
+ }
93
+ }
94
+ stateHistory.push(history);
95
+ }
96
+ return stateHistory;
97
+ };
98
+ exports._getLangGraphStateHistoryRecursive = _getLangGraphStateHistoryRecursive;
99
+ const extractLangGraphTrajectoryFromThread = async (
100
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
101
+ graph, config) => {
102
+ const history = await (0, exports._getLangGraphStateHistoryRecursive)(graph, config);
103
+ return (0, exports.extractLangGraphTrajectoryFromSnapshots)(history);
104
+ };
105
+ exports.extractLangGraphTrajectoryFromThread = extractLangGraphTrajectoryFromThread;
@@ -0,0 +1,12 @@
1
+ import type { StateSnapshot, Pregel } from "@langchain/langgraph/web";
2
+ import type { RunnableConfig } from "@langchain/core/runnables";
3
+ import type { GraphTrajectory } from "../types.js";
4
+ export declare const extractLangGraphTrajectoryFromSnapshots: (snapshots: StateSnapshot[]) => {
5
+ inputs: (string | Record<string, unknown> | null)[];
6
+ outputs: GraphTrajectory;
7
+ };
8
+ export declare const _getLangGraphStateHistoryRecursive: (graph: Pregel<any, any>, config: RunnableConfig) => Promise<StateSnapshot[]>;
9
+ export declare const extractLangGraphTrajectoryFromThread: (graph: Pregel<any, any>, config: RunnableConfig) => Promise<{
10
+ inputs: (string | Record<string, unknown> | null)[];
11
+ outputs: GraphTrajectory;
12
+ }>;
@@ -0,0 +1,99 @@
1
+ import { isBaseMessage } from "@langchain/core/messages";
2
+ import { _convertMessagesToOpenAIParams } from "@langchain/openai";
3
/**
 * Builds a graph trajectory (inputs, results and per-turn step lists) from a
 * list of state snapshots.
 *
 * The snapshots are walked in the order given and every collected list is
 * reversed at the end, so the function presumably expects newest-first
 * history (TODO confirm against the `getStateHistory` ordering).
 *
 * A new turn starts at each snapshot that has no `next` nodes or that has an
 * interrupted task. Task names of subsequent snapshots are appended to that
 * turn, prefixed with "<segment>:" when the snapshot's `checkpoint_ns`
 * contains a colon. An interrupted task additionally records "__interrupt__".
 *
 * @param snapshots - State snapshots to convert.
 * @returns `{ inputs, outputs }` where `outputs` has `results` and `steps`.
 */
export const extractLangGraphTrajectoryFromSnapshots = (snapshots) => {
    const inputs = [];
    const trajectory = {
        results: [],
        steps: [],
    };
    const taskHasInterrupts = (task) => (task.interrupts?.length ?? 0) > 0;
    let isAccumulatingSteps = false;
    for (let i = 0; i < snapshots.length; i += 1) {
        const snapshot = snapshots[i];
        const hasInterrupts = snapshot.tasks?.some(taskHasInterrupts) ?? false;
        if (!snapshot.next?.length || hasInterrupts) {
            isAccumulatingSteps = true;
            if (hasInterrupts) {
                trajectory.results.push({});
            }
            else if (snapshot.values != null &&
                typeof snapshot.values === "object" &&
                !Array.isArray(snapshot.values) &&
                "messages" in snapshot.values &&
                Array.isArray(snapshot.values.messages)) {
                const { messages } = snapshot.values;
                if (messages.length === 0) {
                    // No last message exists; avoid emitting `[undefined]`.
                    trajectory.results.push({ messages: [] });
                }
                else {
                    const lastMessage = messages.at(-1);
                    if (isBaseMessage(lastMessage)) {
                        // Just append the last message in the output to the results to reduce context size
                        trajectory.results.push({
                            messages: _convertMessagesToOpenAIParams([lastMessage]),
                        });
                    }
                    else {
                        trajectory.results.push({ messages: [lastMessage] });
                    }
                }
            }
            else {
                trajectory.results.push(snapshot.values);
            }
            trajectory.steps.push([]);
        }
        if (isAccumulatingSteps && snapshot.tasks?.length) {
            const checkpointNs = snapshot.config?.configurable?.checkpoint_ns ?? "";
            const nsSegments = checkpointNs.split(":");
            // Only the first namespace segment is used as the subgraph prefix.
            const subgraphPath = nsSegments.length > 1 ? `${nsSegments[0]}:` : "";
            const currentSteps = trajectory.steps.at(-1);
            for (const task of snapshot.tasks) {
                if (taskHasInterrupts(task)) {
                    currentSteps?.push("__interrupt__");
                }
                currentSteps?.push(`${subgraphPath}${task.name}`);
            }
        }
        if (isAccumulatingSteps) {
            if (snapshot.metadata != null && snapshot.metadata.source === "input") {
                inputs.push(snapshot.metadata.writes);
            }
            else if (i + 1 < snapshots.length &&
                snapshots[i + 1].tasks?.some(taskHasInterrupts)) {
                // The following snapshot in the list was interrupted, so this
                // one is recorded as a resumption rather than a fresh input.
                inputs.push("__resuming__");
            }
        }
    }
    inputs.reverse();
    trajectory.results.reverse();
    trajectory.steps.reverse();
    for (const stepList of trajectory.steps) {
        stepList.reverse();
    }
    if (inputs.length !== trajectory.results.length) {
        console.warn("Trajectory parsing may be incomplete: inputs and results have different lengths");
    }
    else if (inputs.length !== trajectory.steps.length) {
        console.warn("Trajectory parsing may be incomplete: inputs and steps have different lengths");
    }
    return { inputs, outputs: trajectory };
};
78
+ export const _getLangGraphStateHistoryRecursive = async (
79
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
80
+ graph, config) => {
81
+ const stateHistory = [];
82
+ for await (const history of graph.getStateHistory(config)) {
83
+ if (history.tasks?.length) {
84
+ for (const task of history.tasks) {
85
+ if (task.state?.configurable?.checkpoint_ns) {
86
+ stateHistory.push(...(await _getLangGraphStateHistoryRecursive(graph, task.state)));
87
+ }
88
+ }
89
+ }
90
+ stateHistory.push(history);
91
+ }
92
+ return stateHistory;
93
+ };
94
+ export const extractLangGraphTrajectoryFromThread = async (
95
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
96
+ graph, config) => {
97
+ const history = await _getLangGraphStateHistoryRecursive(graph, config);
98
+ return extractLangGraphTrajectoryFromSnapshots(history);
99
+ };
package/dist/index.cjs ADDED
@@ -0,0 +1,35 @@
1
+ "use strict";
2
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
+ var __exportStar = (this && this.__exportStar) || function(m, exports) {
14
+ for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
15
+ };
16
+ Object.defineProperty(exports, "__esModule", { value: true });
17
+ exports.GRAPH_TRAJECTORY_ACCURACY_PROMPT = exports.createGraphTrajectoryLLMAsJudge = exports.TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE = exports.TRAJECTORY_ACCURACY_PROMPT = exports.createTrajectoryLLMAsJudge = exports.trajectoryUnorderedMatch = exports.trajectorySuperset = exports.trajectorySubset = exports.trajectoryStrictMatch = void 0;
18
+ var strict_js_1 = require("./trajectory/strict.cjs");
19
+ Object.defineProperty(exports, "trajectoryStrictMatch", { enumerable: true, get: function () { return strict_js_1.trajectoryStrictMatch; } });
20
+ var subset_js_1 = require("./trajectory/subset.cjs");
21
+ Object.defineProperty(exports, "trajectorySubset", { enumerable: true, get: function () { return subset_js_1.trajectorySubset; } });
22
+ var superset_js_1 = require("./trajectory/superset.cjs");
23
+ Object.defineProperty(exports, "trajectorySuperset", { enumerable: true, get: function () { return superset_js_1.trajectorySuperset; } });
24
+ var unordered_js_1 = require("./trajectory/unordered.cjs");
25
+ Object.defineProperty(exports, "trajectoryUnorderedMatch", { enumerable: true, get: function () { return unordered_js_1.trajectoryUnorderedMatch; } });
26
+ var llm_js_1 = require("./trajectory/llm.cjs");
27
+ Object.defineProperty(exports, "createTrajectoryLLMAsJudge", { enumerable: true, get: function () { return llm_js_1.createTrajectoryLLMAsJudge; } });
28
+ Object.defineProperty(exports, "TRAJECTORY_ACCURACY_PROMPT", { enumerable: true, get: function () { return llm_js_1.TRAJECTORY_ACCURACY_PROMPT; } });
29
+ Object.defineProperty(exports, "TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE", { enumerable: true, get: function () { return llm_js_1.TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE; } });
30
+ var llm_js_2 = require("./graph_trajectory/llm.cjs");
31
+ Object.defineProperty(exports, "createGraphTrajectoryLLMAsJudge", { enumerable: true, get: function () { return llm_js_2.createGraphTrajectoryLLMAsJudge; } });
32
+ Object.defineProperty(exports, "GRAPH_TRAJECTORY_ACCURACY_PROMPT", { enumerable: true, get: function () { return llm_js_2.GRAPH_TRAJECTORY_ACCURACY_PROMPT; } });
33
+ __exportStar(require("./types.cjs"), exports);
34
+ __exportStar(require("./utils.cjs"), exports);
35
+ __exportStar(require("./graph_trajectory/utils.cjs"), exports);
package/dist/index.d.ts CHANGED
@@ -2,5 +2,8 @@ export { trajectoryStrictMatch } from "./trajectory/strict.js";
2
2
  export { trajectorySubset } from "./trajectory/subset.js";
3
3
  export { trajectorySuperset } from "./trajectory/superset.js";
4
4
  export { trajectoryUnorderedMatch } from "./trajectory/unordered.js";
5
- export { createTrajectoryLLMAsJudge, DEFAULT_PROMPT, } from "./trajectory/llm.js";
5
+ export { createTrajectoryLLMAsJudge, TRAJECTORY_ACCURACY_PROMPT, TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE, } from "./trajectory/llm.js";
6
+ export { createGraphTrajectoryLLMAsJudge, GRAPH_TRAJECTORY_ACCURACY_PROMPT, } from "./graph_trajectory/llm.js";
6
7
  export * from "./types.js";
8
+ export * from "./utils.js";
9
+ export * from "./graph_trajectory/utils.js";
package/dist/index.js CHANGED
@@ -2,5 +2,8 @@ export { trajectoryStrictMatch } from "./trajectory/strict.js";
2
2
  export { trajectorySubset } from "./trajectory/subset.js";
3
3
  export { trajectorySuperset } from "./trajectory/superset.js";
4
4
  export { trajectoryUnorderedMatch } from "./trajectory/unordered.js";
5
- export { createTrajectoryLLMAsJudge, DEFAULT_PROMPT, } from "./trajectory/llm.js";
5
+ export { createTrajectoryLLMAsJudge, TRAJECTORY_ACCURACY_PROMPT, TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE, } from "./trajectory/llm.js";
6
+ export { createGraphTrajectoryLLMAsJudge, GRAPH_TRAJECTORY_ACCURACY_PROMPT, } from "./graph_trajectory/llm.js";
6
7
  export * from "./types.js";
8
+ export * from "./utils.js";
9
+ export * from "./graph_trajectory/utils.js";