npm - agentevals - Versions diffs - 0.0.2 → 0.0.3 - Mend

agentevals 0.0.2 → 0.0.3

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (4) [hide / show]

package/dist/trajectory/llm.cjs CHANGED Viewed

@@ -12,16 +12,20 @@ Your task is to grade the accuracy of an AI agent's internal trajectory.
   - Makes logical sense between steps
   - Shows clear progression
   - Is relatively efficient, though it does not need to be perfectly efficient
-  - Is semantically equivalent to the provided reference trajectory, if present
+  - Is semantically equivalent to the provided reference trajectory
 </Rubric>
-Grade the following trajectory:
+Based on the following reference trajectory:
+<reference_trajectory>
+{reference_outputs}
+</reference_trajectory>
+Grade this actual trajectory:
 <trajectory>
 {outputs}
 </trajectory>
-{inputs}
-{reference_outputs}
 `;
 exports.TRAJECTORY_ACCURACY_PROMPT = `You are an expert data labeler.
 Your task is to grade the accuracy of an AI agent's internal trajectory.
@@ -42,27 +46,16 @@ Grade the following trajectory:
 <trajectory>
 {outputs}
-</trajectory>
-{inputs}
-`;
+</trajectory>`;
 function _formatInputs(params) {
-    const { inputs, outputs, referenceOutputs } = params;
+    const { outputs, referenceOutputs } = params;
     const normalizedOutputs = (0, utils_js_1._normalizeToOpenAIMessagesList)(outputs);
     const normalizedReferenceOutputs = (0, utils_js_1._normalizeToOpenAIMessagesList)(referenceOutputs ?? []);
     const formattedReferenceOutputs = normalizedReferenceOutputs
-        ? `\nUse the following trajectory as an example reference when grading:\n<reference_trajectory>\n${(0, utils_js_2._chatCompletionMessagesToString)(normalizedReferenceOutputs)}\n</reference_trajectory>\n`
+        ? (0, utils_js_2._chatCompletionMessagesToString)(normalizedReferenceOutputs)
         : "";
-    const formattedInputs = inputs
-        ? `\nThe agent generated the trajectory from the following input:\n<input>\n${JSON.stringify(inputs)}\n</input>\n`
-        : "";
-    const formattedOutputs = typeof outputs === "object" && !Array.isArray(outputs)
-        ? outputs
-        : (0, utils_js_2._chatCompletionMessagesToString)(normalizedOutputs);
-    return [
-        formattedOutputs,
-        formattedReferenceOutputs,
-        formattedInputs,
-    ];
+    const formattedOutputs = (0, utils_js_2._chatCompletionMessagesToString)(normalizedOutputs);
+    return [formattedOutputs, formattedReferenceOutputs];
 }
 /**
  * Creates an evaluator that uses an LLM to judge agent trajectories.
@@ -97,18 +90,14 @@ const createTrajectoryLLMAsJudge = ({ prompt = exports.TRAJECTORY_ACCURACY_PROMP
         fewShotExamples,
     });
     const wrappedEvaluator = async ({ inputs, outputs, referenceOutputs, ...extra }) => {
-        const [formattedOutputs, formattedReferenceOutputs, formattedInputs] = prompt === exports.TRAJECTORY_ACCURACY_PROMPT ||
-            prompt === exports.TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE
-            ? _formatInputs({ inputs, outputs, referenceOutputs })
-            : [
-                inputs ? JSON.stringify(inputs) : "",
-                (0, utils_js_2._chatCompletionMessagesToString)((0, utils_js_1._normalizeToOpenAIMessagesList)(outputs)),
-                (0, utils_js_2._chatCompletionMessagesToString)((0, utils_js_1._normalizeToOpenAIMessagesList)(referenceOutputs)),
-            ];
+        const [formattedOutputs, formattedReferenceOutputs] = _formatInputs({
+            outputs,
+            referenceOutputs,
+        });
         return (0, utils_js_1._runEvaluator)(`llm_as_${feedbackKey}_judge`, scorer, feedbackKey, {
+            inputs,
             outputs: formattedOutputs,
             referenceOutputs: formattedReferenceOutputs,
-            inputs: formattedInputs,
             ...extra,
         });
     };

package/dist/trajectory/llm.d.ts CHANGED Viewed

@@ -1,7 +1,7 @@
 import { BaseMessage } from "@langchain/core/messages";
 import { ChatCompletionMessage, EvaluatorResult, TrajectoryLLMAsJudgeParams } from "../types.js";
-export declare const TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE = "You are an expert data labeler.\nYour task is to grade the accuracy of an AI agent's internal trajectory.\n\n<Rubric>\n  An accurate trajectory:\n  - Makes logical sense between steps\n  - Shows clear progression\n  - Is relatively efficient, though it does not need to be perfectly efficient\n  - Is semantically equivalent to the provided reference trajectory, if present\n</Rubric>\n\nGrade the following trajectory:\n\n<trajectory>\n{outputs}\n</trajectory>\n{inputs}\n{reference_outputs}\n";
-export declare const TRAJECTORY_ACCURACY_PROMPT = "You are an expert data labeler.\nYour task is to grade the accuracy of an AI agent's internal trajectory.\n\n<Rubric>\n  An accurate trajectory:\n  - Makes logical sense between steps\n  - Shows clear progression\n  - Is relatively efficient, though it does not need to be perfectly efficient\n</Rubric>\n\nFirst, try to understand the goal of the trajectory by looking at the input\n(if the input is not present try to infer it from the content of the first message),\nas well as the output of the final message. Once you understand the goal, grade the trajectory\nas it relates to achieving that goal.\n\nGrade the following trajectory:\n\n<trajectory>\n{outputs}\n</trajectory>\n{inputs}\n";
+export declare const TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE = "You are an expert data labeler.\nYour task is to grade the accuracy of an AI agent's internal trajectory.\n\n<Rubric>\n  An accurate trajectory:\n  - Makes logical sense between steps\n  - Shows clear progression\n  - Is relatively efficient, though it does not need to be perfectly efficient\n  - Is semantically equivalent to the provided reference trajectory\n</Rubric>\n\nBased on the following reference trajectory:\n\n<reference_trajectory>\n{reference_outputs}\n</reference_trajectory>\n\nGrade this actual trajectory:\n\n<trajectory>\n{outputs}\n</trajectory>\n";
+export declare const TRAJECTORY_ACCURACY_PROMPT = "You are an expert data labeler.\nYour task is to grade the accuracy of an AI agent's internal trajectory.\n\n<Rubric>\n  An accurate trajectory:\n  - Makes logical sense between steps\n  - Shows clear progression\n  - Is relatively efficient, though it does not need to be perfectly efficient\n</Rubric>\n\nFirst, try to understand the goal of the trajectory by looking at the input\n(if the input is not present try to infer it from the content of the first message),\nas well as the output of the final message. Once you understand the goal, grade the trajectory\nas it relates to achieving that goal.\n\nGrade the following trajectory:\n\n<trajectory>\n{outputs}\n</trajectory>";
 /**
  * Creates an evaluator that uses an LLM to judge agent trajectories.
  *
@@ -25,7 +25,6 @@ export declare const TRAJECTORY_ACCURACY_PROMPT = "You are an expert data labele
  */
 export declare const createTrajectoryLLMAsJudge: ({ prompt, feedbackKey, model, system, judge, continuous, choices, useReasoning, fewShotExamples, }: TrajectoryLLMAsJudgeParams) => ({ inputs, outputs, referenceOutputs, ...extra }: {
     [key: string]: unknown;
-    inputs?: Record<string, any> | undefined;
     outputs: ChatCompletionMessage[] | BaseMessage[] | {
         messages: (BaseMessage | ChatCompletionMessage)[];
     };

package/dist/trajectory/llm.js CHANGED Viewed

@@ -9,16 +9,20 @@ Your task is to grade the accuracy of an AI agent's internal trajectory.
   - Makes logical sense between steps
   - Shows clear progression
   - Is relatively efficient, though it does not need to be perfectly efficient
-  - Is semantically equivalent to the provided reference trajectory, if present
+  - Is semantically equivalent to the provided reference trajectory
 </Rubric>
-Grade the following trajectory:
+Based on the following reference trajectory:
+<reference_trajectory>
+{reference_outputs}
+</reference_trajectory>
+Grade this actual trajectory:
 <trajectory>
 {outputs}
 </trajectory>
-{inputs}
-{reference_outputs}
 `;
 export const TRAJECTORY_ACCURACY_PROMPT = `You are an expert data labeler.
 Your task is to grade the accuracy of an AI agent's internal trajectory.
@@ -39,27 +43,16 @@ Grade the following trajectory:
 <trajectory>
 {outputs}
-</trajectory>
-{inputs}
-`;
+</trajectory>`;
 function _formatInputs(params) {
-    const { inputs, outputs, referenceOutputs } = params;
+    const { outputs, referenceOutputs } = params;
     const normalizedOutputs = _normalizeToOpenAIMessagesList(outputs);
     const normalizedReferenceOutputs = _normalizeToOpenAIMessagesList(referenceOutputs ?? []);
     const formattedReferenceOutputs = normalizedReferenceOutputs
-        ? `\nUse the following trajectory as an example reference when grading:\n<reference_trajectory>\n${_chatCompletionMessagesToString(normalizedReferenceOutputs)}\n</reference_trajectory>\n`
+        ? _chatCompletionMessagesToString(normalizedReferenceOutputs)
         : "";
-    const formattedInputs = inputs
-        ? `\nThe agent generated the trajectory from the following input:\n<input>\n${JSON.stringify(inputs)}\n</input>\n`
-        : "";
-    const formattedOutputs = typeof outputs === "object" && !Array.isArray(outputs)
-        ? outputs
-        : _chatCompletionMessagesToString(normalizedOutputs);
-    return [
-        formattedOutputs,
-        formattedReferenceOutputs,
-        formattedInputs,
-    ];
+    const formattedOutputs = _chatCompletionMessagesToString(normalizedOutputs);
+    return [formattedOutputs, formattedReferenceOutputs];
 }
 /**
  * Creates an evaluator that uses an LLM to judge agent trajectories.
@@ -94,18 +87,14 @@ export const createTrajectoryLLMAsJudge = ({ prompt = TRAJECTORY_ACCURACY_PROMPT
         fewShotExamples,
     });
     const wrappedEvaluator = async ({ inputs, outputs, referenceOutputs, ...extra }) => {
-        const [formattedOutputs, formattedReferenceOutputs, formattedInputs] = prompt === TRAJECTORY_ACCURACY_PROMPT ||
-            prompt === TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE
-            ? _formatInputs({ inputs, outputs, referenceOutputs })
-            : [
-                inputs ? JSON.stringify(inputs) : "",
-                _chatCompletionMessagesToString(_normalizeToOpenAIMessagesList(outputs)),
-                _chatCompletionMessagesToString(_normalizeToOpenAIMessagesList(referenceOutputs)),
-            ];
+        const [formattedOutputs, formattedReferenceOutputs] = _formatInputs({
+            outputs,
+            referenceOutputs,
+        });
         return _runEvaluator(`llm_as_${feedbackKey}_judge`, scorer, feedbackKey, {
+            inputs,
             outputs: formattedOutputs,
             referenceOutputs: formattedReferenceOutputs,
-            inputs: formattedInputs,
             ...extra,
         });
     };

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "agentevals",
-  "version": "0.0.2",
+  "version": "0.0.3",
   "packageManager": "yarn@3.5.1",
   "type": "module",
   "scripts": {