agentevals 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. package/LICENSE +19 -0
  2. package/README.md +1 -0
  3. package/dist/evaluators/exact.cjs +23 -0
  4. package/dist/evaluators/exact.d.ts +10 -0
  5. package/dist/evaluators/exact.js +19 -0
  6. package/dist/evaluators/llm.cjs +284 -0
  7. package/dist/evaluators/llm.d.ts +73 -0
  8. package/dist/evaluators/llm.js +279 -0
  9. package/dist/evaluators/prompts/conciseness.cjs +42 -0
  10. package/dist/evaluators/prompts/conciseness.d.ts +1 -0
  11. package/dist/evaluators/prompts/conciseness.js +39 -0
  12. package/dist/evaluators/prompts/correctness.cjs +46 -0
  13. package/dist/evaluators/prompts/correctness.d.ts +1 -0
  14. package/dist/evaluators/prompts/correctness.js +43 -0
  15. package/dist/evaluators/prompts/hallucination.cjs +46 -0
  16. package/dist/evaluators/prompts/hallucination.d.ts +1 -0
  17. package/dist/evaluators/prompts/hallucination.js +43 -0
  18. package/dist/evaluators/string/embedding_similarity.cjs +49 -0
  19. package/dist/evaluators/string/embedding_similarity.d.ts +18 -0
  20. package/dist/evaluators/string/embedding_similarity.js +45 -0
  21. package/dist/evaluators/string/levenshtein.cjs +57 -0
  22. package/dist/evaluators/string/levenshtein.d.ts +11 -0
  23. package/dist/evaluators/string/levenshtein.js +53 -0
  24. package/dist/evaluators/trajectory/llm.cjs +86 -0
  25. package/dist/evaluators/trajectory/llm.d.ts +49 -0
  26. package/dist/evaluators/trajectory/llm.js +82 -0
  27. package/dist/evaluators/trajectory/strict.cjs +58 -0
  28. package/dist/evaluators/trajectory/strict.d.ts +10 -0
  29. package/dist/evaluators/trajectory/strict.js +54 -0
  30. package/dist/evaluators/trajectory/subset.cjs +32 -0
  31. package/dist/evaluators/trajectory/subset.d.ts +23 -0
  32. package/dist/evaluators/trajectory/subset.js +28 -0
  33. package/dist/evaluators/trajectory/superset.cjs +32 -0
  34. package/dist/evaluators/trajectory/superset.d.ts +23 -0
  35. package/dist/evaluators/trajectory/superset.js +28 -0
  36. package/dist/evaluators/trajectory/unordered.cjs +33 -0
  37. package/dist/evaluators/trajectory/unordered.d.ts +23 -0
  38. package/dist/evaluators/trajectory/unordered.js +29 -0
  39. package/dist/evaluators/trajectory/utils.cjs +68 -0
  40. package/dist/evaluators/trajectory/utils.d.ts +3 -0
  41. package/dist/evaluators/trajectory/utils.js +63 -0
  42. package/dist/evaluators/types.cjs +2 -0
  43. package/dist/evaluators/types.d.ts +44 -0
  44. package/dist/evaluators/types.js +1 -0
  45. package/dist/evaluators/utils.cjs +85 -0
  46. package/dist/evaluators/utils.d.ts +13 -0
  47. package/dist/evaluators/utils.js +78 -0
  48. package/dist/index.cjs +43 -0
  49. package/dist/index.d.ts +13 -0
  50. package/dist/index.js +13 -0
  51. package/index.cjs +1 -0
  52. package/index.d.cts +1 -0
  53. package/index.d.ts +1 -0
  54. package/index.js +1 -0
  55. package/package.json +60 -0
@@ -0,0 +1,279 @@
1
+ import { Runnable } from "@langchain/core/runnables";
2
+ import { ChatPromptTemplate } from "@langchain/core/prompts";
3
+ import { initChatModel } from "langchain/chat_models/universal";
4
+ import { traceable } from "langsmith/traceable";
5
+ import { _runEvaluator, _normalizeToOpenAIMessagesList } from "./utils.js";
6
// Type guard: true when `prompt` is a LangChain Runnable (e.g. a prompt
// template object) rather than a plain string template or a message-building
// function. Delegates to LangChain's own static check.
function _isRunnableInterface(prompt) {
    const candidate = prompt;
    return Runnable.isRunnable(candidate);
}
9
// Duck-typed check for a LangChain chat model: any object exposing a
// `_modelType()` method that reports "base_chat_model". Returns false for
// null/undefined, primitives, and objects without that marker method.
function _isBaseChatModel(x) {
    if (x == null || typeof x !== "object") {
        return false;
    }
    const candidate = x;
    if (typeof candidate._modelType !== "function") {
        return false;
    }
    return candidate._modelType() === "base_chat_model";
}
16
/**
 * Appends serialized few-shot examples to the content of the LAST user
 * message in `messages`. Mutates `messages` in place and returns it.
 * Each example is rendered as an XML-ish <example> block containing the
 * JSON-serialized inputs/outputs plus optional reasoning and score.
 * Throws if the prompt contains no user message to attach the examples to.
 */
function appendFewShotExamples({ messages, fewShotExamples, }) {
    // Scan backwards for the last user message; examples belong at the end
    // of the conversation context.
    let targetIdx = -1;
    for (let i = messages.length - 1; i >= 0; i -= 1) {
        if (messages[i].role === "user") {
            targetIdx = i;
            break;
        }
    }
    if (targetIdx === -1) {
        throw new Error("Appending few-shot examples requires a user message in the provided prompt");
    }
    const renderedExamples = fewShotExamples.map((example) => {
        const parts = [
            `<example>\n<input>${JSON.stringify(example.inputs)}</input>\n<output>${JSON.stringify(example.outputs)}</output>`,
        ];
        if (example.reasoning) {
            parts.push(`<reasoning>${example.reasoning}</reasoning>`);
        }
        // `!== undefined` (not truthiness) so a legitimate score of 0 is kept.
        if (example.score !== undefined) {
            parts.push(`<score>${example.score}</score>`);
        }
        parts.push("</example>");
        return parts.join("\n");
    });
    messages[targetIdx].content += "\n\n" + renderedExamples.join("\n");
    return messages;
}
43
/**
 * Builds the JSON schema the judge model must answer with, plus the
 * human-readable description of the score field.
 *
 * Score shape is selected by priority: explicit `choices` (numeric enum),
 * then `continuous` (0.0–1.0 number), otherwise a boolean pass/fail.
 * A caller-supplied `schema` is returned untouched (same reference); only
 * the default schema gets `score`/`reasoning` properties attached.
 * Returns a `[jsonSchema, description]` pair.
 */
function constructOutputSchema({ schema, continuous, choices, useReasoning, }) {
    let description;
    let scoreSchema;
    if (choices) {
        description =
            "A number that represents the degree to which the criteria in the prompt are met.";
        scoreSchema = {
            type: "number",
            description,
            enum: choices,
        };
    }
    else if (continuous) {
        description =
            "A number that represents the degree to which the criteria in the prompt are met, from 0.0 to 1.0. 1.0 means the criteria are met perfectly. 0.0 means none of the criteria are met.";
        scoreSchema = {
            type: "number",
            description,
        };
    }
    else {
        description =
            "A score that is true if criteria in the prompt are met, and false otherwise.";
        scoreSchema = {
            type: "boolean",
            description,
        };
    }
    // Custom schemas pass through unmodified; the description is still
    // returned so callers can surface it to the model.
    if (schema) {
        return [schema, description];
    }
    const jsonSchema = {
        type: "object",
        additionalProperties: false,
        strict: true,
    };
    if (useReasoning) {
        jsonSchema.properties = {
            reasoning: {
                type: "string",
                description: "A human-readable explanation of the score. You MUST end the reasoning with a sentence that says: Thus, the score should be: SCORE_YOU_ASSIGN.",
            },
            score: scoreSchema,
        };
        jsonSchema.required = ["reasoning", "score"];
    }
    else {
        jsonSchema.properties = {
            score: scoreSchema,
        };
        jsonSchema.required = ["score"];
    }
    return [jsonSchema, description];
}
96
/**
 * Builds the async scoring closure used by `createLLMAsJudge`.
 *
 * The returned `getScore` formats the configured prompt with the run's
 * inputs/outputs/reference outputs, invokes the judge model with a
 * structured-output schema from `constructOutputSchema`, and returns either
 * a bare score, a `[score, reasoning]` pair, or — when a custom `schema`
 * was supplied — the raw parsed model response.
 *
 * NOTE(review): the inner `params` of `getScore` shadows the outer factory
 * `params`; later, the OpenAI-path request object shadows it again.
 */
export const _createLLMAsJudgeScorer = (params) => {
    const { prompt, system, schema, model, continuous, choices, fewShotExamples, } = params;
    // `judge` is deliberately `let`: when omitted, it is lazily created from
    // `model` on first call and memoized in this closure for later calls.
    let judge = params.judge;
    const useReasoning = params.useReasoning ?? true;
    const getScore = async (params) => {
        let { inputs, outputs, referenceOutputs, ...rest } = params;
        // A separate system message only makes sense for string templates;
        // runnable/function prompts control their own message list.
        if (system && typeof prompt !== "string") {
            throw new Error("`system` is only supported when `prompt` is a string template");
        }
        // Non-string payloads are JSON-serialized before being interpolated
        // into the prompt template.
        let stringifiedInputs = inputs;
        let stringifiedOutputs = outputs;
        let stringifiedReferenceOutputs = referenceOutputs;
        if (inputs && typeof inputs !== "string") {
            stringifiedInputs = JSON.stringify(inputs);
        }
        if (outputs && typeof outputs !== "string") {
            stringifiedOutputs = JSON.stringify(outputs);
        }
        if (referenceOutputs && typeof referenceOutputs !== "string") {
            stringifiedReferenceOutputs = JSON.stringify(referenceOutputs);
        }
        // Any extra kwargs are forwarded as additional template variables.
        const stringifiedRest = Object.fromEntries(Object.entries(rest).map(([key, value]) => [
            key,
            typeof value === "string" ? value : JSON.stringify(value),
        ]));
        let messages = [];
        // Three prompt flavors: a LangChain Runnable, a string template, or a
        // function returning messages. Note the function flavor receives the
        // RAW (un-stringified) values.
        if (_isRunnableInterface(prompt)) {
            const formattedPrompt = await prompt.invoke({
                inputs: stringifiedInputs,
                outputs: stringifiedOutputs,
                reference_outputs: stringifiedReferenceOutputs,
                ...stringifiedRest,
            });
            messages = formattedPrompt.messages;
        }
        else if (typeof prompt === "string") {
            const template = ChatPromptTemplate.fromTemplate(prompt);
            const formattedPrompt = await template.invoke({
                inputs: stringifiedInputs,
                outputs: stringifiedOutputs,
                reference_outputs: stringifiedReferenceOutputs,
                ...stringifiedRest,
            });
            messages = formattedPrompt.messages;
        }
        else {
            messages = await prompt({
                inputs,
                outputs,
                reference_outputs: referenceOutputs,
                ...rest,
            });
        }
        if (system) {
            messages = [{ role: "system", content: system }, ...messages];
        }
        let normalizedMessages = _normalizeToOpenAIMessagesList(messages);
        if (fewShotExamples) {
            // Mutates and returns the normalized list with examples appended
            // to the last user message.
            normalizedMessages = appendFewShotExamples({
                messages: normalizedMessages,
                fewShotExamples,
            });
        }
        const [jsonSchema, description] = constructOutputSchema({
            schema,
            continuous,
            choices,
            useReasoning,
        });
        // Lazy judge creation; assignment memoizes into the outer closure.
        if (!judge) {
            judge = await initChatModel(model);
        }
        let response;
        if (_isBaseChatModel(judge)) {
            // LangChain path: structured output handles schema enforcement.
            response = await judge
                .withStructuredOutput({
                title: "score",
                description,
                ...jsonSchema,
            })
                .invoke(normalizedMessages);
            if (schema === undefined) {
                if (useReasoning) {
                    return [response.score, response.reasoning];
                }
                return response.score;
            }
            else {
                // Custom schema: caller interprets the raw structured output.
                return response;
            }
        }
        else {
            // OpenAI-client path: `judge` is assumed to expose
            // `chat.completions.create` — TODO confirm against callers.
            if (!model) {
                throw new Error("`model` is required for non-LangChain clients");
            }
            const params = {
                messages: normalizedMessages,
                model,
                response_format: {
                    type: "json_schema",
                    json_schema: {
                        name: "score",
                        strict: true,
                        schema: jsonSchema,
                    },
                },
            };
            // Wrap the raw client call so it shows up as an LLM run in
            // LangSmith traces.
            const invokeLlm = traceable(judge.chat.completions.create.bind(judge.chat.completions), {
                metadata: {
                    ls_provider: "openai",
                    ls_model_name: model,
                    ls_model_type: "chat",
                },
                run_type: "llm",
                name: "OpenAI Chat Completion",
            });
            const response = await invokeLlm(params);
            const parsed = JSON.parse(response.choices[0].message.content);
            if (schema === undefined) {
                if (useReasoning) {
                    return [parsed.score, parsed.reasoning];
                }
                return parsed.score;
            }
            return parsed;
        }
    };
    return getScore;
};
225
/**
 * Create an evaluator that uses an LLM to assess output quality based on specified criteria.
 *
 * @param params Configuration object with the following properties:
 * @param params.prompt The evaluation prompt - can be a string template, LangChain prompt template,
 * or function that returns a list of chat messages
 * @param params.feedbackKey Key used to store the evaluation result, defaults to "score"
 * @param params.judge The LLM used for evaluation. Can be an OpenAI client or a LangChain model.
 * If using OpenAI client, must specify "model" parameter.
 * If omitted, "model" will be used to instantiate a LangChain model instance.
 * @param params.model Model identifier to use. Defaults to "openai:o3-mini".
 * If "judge" is an OpenAI client, this should be a model name directly.
 * If "judge" is omitted, must be a valid LangChain model identifier.
 * @param params.system Optional system message to prepend to the prompt
 * @param params.continuous If true, score will be a float between 0 and 1.
 * If false, score will be boolean. Defaults to false.
 * @param params.choices Optional list of specific float values the score must be chosen from
 * @param params.useReasoning If true, includes explanation for the score in the output.
 * Defaults to true.
 * @param params.fewShotExamples Optional list of example evaluations to append to the prompt
 *
 * @returns A function that takes inputs, outputs, reference_outputs, and other kwargs,
 * formats them into a prompt, invokes the judge, and returns an evaluation result
 *
 * @example
 * ```typescript
 * import { createLLMAsJudge } from "openevals";
 *
 * const evaluator = createLLMAsJudge({
 *   prompt: "Rate the quality of this response from 0 to 1: {outputs}",
 *   continuous: true,
 * });
 * const result = await evaluator({
 *   inputs: { question: "What color is the sky?" },
 *   outputs: { response: "Blue" },
 * });
 * ```
 */
export const createLLMAsJudge = ({ prompt, feedbackKey = "score", model = "openai:o3-mini", system, judge, continuous = false, choices, useReasoning = true, fewShotExamples, }) => {
    const scorer = _createLLMAsJudgeScorer({
        prompt,
        judge,
        model,
        system,
        continuous,
        choices,
        useReasoning,
        fewShotExamples,
    });
    const _wrappedEvaluator = async (inputs) => {
        // Bug fix: the original ternary had its branches swapped — a custom
        // feedbackKey produced the generic "llm_as_judge" run name while the
        // default "score" produced "llm_as_score_judge". A custom key must
        // yield `llm_as_<key>_judge`; the default yields "llm_as_judge".
        const runName = feedbackKey === "score" ? "llm_as_judge" : `llm_as_${feedbackKey}_judge`;
        return _runEvaluator(runName, scorer, feedbackKey, inputs);
    };
    return _wrappedEvaluator;
};
@@ -0,0 +1,42 @@
1
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.CONCISENESS_PROMPT = void 0;
// Prompt for an LLM-as-judge conciseness evaluator.
// {inputs} and {outputs} are single-brace prompt-template variables filled in
// by the llm evaluator's ChatPromptTemplate — NOT JS template interpolation.
exports.CONCISENESS_PROMPT = `You are an expert data labeler evaluating model outputs for conciseness. Your task is to assign a score based on the following rubric:

<Rubric>
 A perfectly concise answer:
 - Contains only the exact information requested.
 - Uses the minimum number of words necessary to convey the complete answer.
 - Omits pleasantries, hedging language, and unnecessary context.
 - Excludes meta-commentary about the answer or the model's capabilities.
 - Avoids redundant information or restatements.
 - Does not include explanations unless explicitly requested.

 When scoring, you should deduct points for:
 - Introductory phrases like "I believe," "I think," or "The answer is."
 - Hedging language like "probably," "likely," or "as far as I know."
 - Unnecessary context or background information.
 - Explanations when not requested.
 - Follow-up questions or offers for more information.
 - Redundant information or restatements.
 - Polite phrases like "hope this helps" or "let me know if you need anything else."
</Rubric>

<Instructions>
 - Carefully read the input and output.
 - Check for any unnecessary elements, particularly those mentioned in the <Rubric> above.
 - The score should reflect how close the response comes to containing only the essential information requested based on the rubric above.
</Instructions>

<Reminder>
 The goal is to reward responses that provide complete answers with absolutely no extraneous information.
</Reminder>

<input>
{inputs}
</input>

<output>
{outputs}
</output>
`;
@@ -0,0 +1 @@
1
/** Prompt template for the conciseness LLM-as-judge evaluator; `{inputs}`/`{outputs}` are prompt-template variables. */
export declare const CONCISENESS_PROMPT = "You are an expert data labeler evaluating model outputs for conciseness. Your task is to assign a score based on the following rubric:\n\n<Rubric>\n A perfectly concise answer:\n - Contains only the exact information requested.\n - Uses the minimum number of words necessary to convey the complete answer.\n - Omits pleasantries, hedging language, and unnecessary context.\n - Excludes meta-commentary about the answer or the model's capabilities.\n - Avoids redundant information or restatements.\n - Does not include explanations unless explicitly requested.\n\n When scoring, you should deduct points for:\n - Introductory phrases like \"I believe,\" \"I think,\" or \"The answer is.\"\n - Hedging language like \"probably,\" \"likely,\" or \"as far as I know.\"\n - Unnecessary context or background information.\n - Explanations when not requested.\n - Follow-up questions or offers for more information.\n - Redundant information or restatements.\n - Polite phrases like \"hope this helps\" or \"let me know if you need anything else.\"\n</Rubric>\n\n<Instructions>\n - Carefully read the input and output.\n - Check for any unnecessary elements, particularly those mentioned in the <Rubric> above.\n - The score should reflect how close the response comes to containing only the essential information requested based on the rubric above.\n</Instructions>\n\n<Reminder>\n The goal is to reward responses that provide complete answers with absolutely no extraneous information.\n</Reminder>\n\n<input>\n{inputs}\n</input>\n\n<output>\n{outputs}\n</output>\n";
@@ -0,0 +1,39 @@
1
// Prompt for an LLM-as-judge conciseness evaluator.
// {inputs} and {outputs} are single-brace prompt-template variables filled in
// by the llm evaluator's ChatPromptTemplate — NOT JS template interpolation.
export const CONCISENESS_PROMPT = `You are an expert data labeler evaluating model outputs for conciseness. Your task is to assign a score based on the following rubric:

<Rubric>
 A perfectly concise answer:
 - Contains only the exact information requested.
 - Uses the minimum number of words necessary to convey the complete answer.
 - Omits pleasantries, hedging language, and unnecessary context.
 - Excludes meta-commentary about the answer or the model's capabilities.
 - Avoids redundant information or restatements.
 - Does not include explanations unless explicitly requested.

 When scoring, you should deduct points for:
 - Introductory phrases like "I believe," "I think," or "The answer is."
 - Hedging language like "probably," "likely," or "as far as I know."
 - Unnecessary context or background information.
 - Explanations when not requested.
 - Follow-up questions or offers for more information.
 - Redundant information or restatements.
 - Polite phrases like "hope this helps" or "let me know if you need anything else."
</Rubric>

<Instructions>
 - Carefully read the input and output.
 - Check for any unnecessary elements, particularly those mentioned in the <Rubric> above.
 - The score should reflect how close the response comes to containing only the essential information requested based on the rubric above.
</Instructions>

<Reminder>
 The goal is to reward responses that provide complete answers with absolutely no extraneous information.
</Reminder>

<input>
{inputs}
</input>

<output>
{outputs}
</output>
`;
@@ -0,0 +1,46 @@
1
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.CORRECTNESS_PROMPT = void 0;
// Prompt for an LLM-as-judge correctness evaluator.
// {inputs}, {outputs} and {reference_outputs} are single-brace
// prompt-template variables filled in by the llm evaluator's
// ChatPromptTemplate — NOT JS template interpolation.
exports.CORRECTNESS_PROMPT = `You are an expert data labeler evaluating model outputs for correctness. Your task is to assign a score based on the following rubric:

<Rubric>
 A correct answer:
 - Provides accurate and complete information
 - Contains no factual errors
 - Addresses all parts of the question
 - Is logically consistent
 - Uses precise and accurate terminology

 When scoring, you should penalize:
 - Factual errors or inaccuracies
 - Incomplete or partial answers
 - Misleading or ambiguous statements
 - Incorrect terminology
 - Logical inconsistencies
 - Missing key information
</Rubric>

<Instructions>
 - Carefully read the input and output
 - Check for factual accuracy and completeness
 - Focus on correctness of information rather than style or verbosity
</Instructions>

<Reminder>
 The goal is to evaluate factual correctness and completeness of the response.
</Reminder>

<input>
{inputs}
</input>

<output>
{outputs}
</output>

If available, you may use the reference outputs below to help you evaluate the correctness of the response:

<reference_outputs>
{reference_outputs}
</reference_outputs>
`;
@@ -0,0 +1 @@
1
/** Prompt template for the correctness LLM-as-judge evaluator; `{inputs}`/`{outputs}`/`{reference_outputs}` are prompt-template variables. */
export declare const CORRECTNESS_PROMPT = "You are an expert data labeler evaluating model outputs for correctness. Your task is to assign a score based on the following rubric:\n\n<Rubric>\n A correct answer:\n - Provides accurate and complete information\n - Contains no factual errors\n - Addresses all parts of the question\n - Is logically consistent\n - Uses precise and accurate terminology\n\n When scoring, you should penalize:\n - Factual errors or inaccuracies\n - Incomplete or partial answers\n - Misleading or ambiguous statements\n - Incorrect terminology\n - Logical inconsistencies\n - Missing key information\n</Rubric>\n\n<Instructions>\n - Carefully read the input and output\n - Check for factual accuracy and completeness\n - Focus on correctness of information rather than style or verbosity\n</Instructions>\n\n<Reminder>\n The goal is to evaluate factual correctness and completeness of the response.\n</Reminder>\n\n<input>\n{inputs}\n</input>\n\n<output>\n{outputs}\n</output>\n\nIf available, you may use the reference outputs below to help you evaluate the correctness of the response:\n\n<reference_outputs>\n{reference_outputs}\n</reference_outputs>\n";
@@ -0,0 +1,43 @@
1
// Prompt for an LLM-as-judge correctness evaluator.
// {inputs}, {outputs} and {reference_outputs} are single-brace
// prompt-template variables filled in by the llm evaluator's
// ChatPromptTemplate — NOT JS template interpolation.
export const CORRECTNESS_PROMPT = `You are an expert data labeler evaluating model outputs for correctness. Your task is to assign a score based on the following rubric:

<Rubric>
 A correct answer:
 - Provides accurate and complete information
 - Contains no factual errors
 - Addresses all parts of the question
 - Is logically consistent
 - Uses precise and accurate terminology

 When scoring, you should penalize:
 - Factual errors or inaccuracies
 - Incomplete or partial answers
 - Misleading or ambiguous statements
 - Incorrect terminology
 - Logical inconsistencies
 - Missing key information
</Rubric>

<Instructions>
 - Carefully read the input and output
 - Check for factual accuracy and completeness
 - Focus on correctness of information rather than style or verbosity
</Instructions>

<Reminder>
 The goal is to evaluate factual correctness and completeness of the response.
</Reminder>

<input>
{inputs}
</input>

<output>
{outputs}
</output>

If available, you may use the reference outputs below to help you evaluate the correctness of the response:

<reference_outputs>
{reference_outputs}
</reference_outputs>
`;
@@ -0,0 +1,46 @@
1
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.HALLUCINATION_PROMPT = void 0;
// Prompt for an LLM-as-judge hallucination evaluator.
// {context}, {inputs}, {outputs} and {reference_outputs} are single-brace
// prompt-template variables filled in by the llm evaluator's
// ChatPromptTemplate — NOT JS template interpolation.
exports.HALLUCINATION_PROMPT = `You are an expert data labeler evaluating model outputs for hallucinations. Your task is to assign a score based on the following rubric:

<Rubric>
 A response without hallucinations:
 - Contains only verifiable facts that are directly supported by the input context
 - Makes no unsupported claims or assumptions
 - Does not add speculative or imagined details
 - Maintains perfect accuracy in dates, numbers, and specific details
 - Appropriately indicates uncertainty when information is incomplete
</Rubric>

<Instructions>
 - Read the input context thoroughly
 - Identify all claims made in the output
 - Cross-reference each claim with the input context
 - Note any unsupported or contradictory information
 - Consider the severity and quantity of hallucinations
</Instructions>

<Reminder>
 Focus solely on factual accuracy and support from the input context. Do not consider style, grammar, or presentation in scoring. A shorter, factual response should score higher than a longer response with unsupported claims.
</Reminder>

Use the following context to help you evaluate for hallucinations in the output:

<context>
{context}
</context>

<input>
{inputs}
</input>

<output>
{outputs}
</output>

If available, you may also use the reference outputs below to help you identify hallucinations in the response:

<reference_outputs>
{reference_outputs}
</reference_outputs>
`;
@@ -0,0 +1 @@
1
/** Prompt template for the hallucination LLM-as-judge evaluator; `{context}`/`{inputs}`/`{outputs}`/`{reference_outputs}` are prompt-template variables. */
export declare const HALLUCINATION_PROMPT = "You are an expert data labeler evaluating model outputs for hallucinations. Your task is to assign a score based on the following rubric:\n\n<Rubric>\n A response without hallucinations:\n - Contains only verifiable facts that are directly supported by the input context\n - Makes no unsupported claims or assumptions\n - Does not add speculative or imagined details\n - Maintains perfect accuracy in dates, numbers, and specific details\n - Appropriately indicates uncertainty when information is incomplete\n</Rubric>\n\n<Instructions>\n - Read the input context thoroughly\n - Identify all claims made in the output\n - Cross-reference each claim with the input context\n - Note any unsupported or contradictory information\n - Consider the severity and quantity of hallucinations\n</Instructions>\n\n<Reminder>\n Focus solely on factual accuracy and support from the input context. Do not consider style, grammar, or presentation in scoring. A shorter, factual response should score higher than a longer response with unsupported claims.\n</Reminder>\n\nUse the following context to help you evaluate for hallucinations in the output:\n\n<context>\n{context}\n</context>\n\n<input>\n{inputs}\n</input>\n\n<output>\n{outputs}\n</output>\n\nIf available, you may also use the reference outputs below to help you identify hallucinations in the response:\n\n<reference_outputs>\n{reference_outputs}\n</reference_outputs>\n";
@@ -0,0 +1,43 @@
1
// Prompt for an LLM-as-judge hallucination evaluator.
// {context}, {inputs}, {outputs} and {reference_outputs} are single-brace
// prompt-template variables filled in by the llm evaluator's
// ChatPromptTemplate — NOT JS template interpolation.
export const HALLUCINATION_PROMPT = `You are an expert data labeler evaluating model outputs for hallucinations. Your task is to assign a score based on the following rubric:

<Rubric>
 A response without hallucinations:
 - Contains only verifiable facts that are directly supported by the input context
 - Makes no unsupported claims or assumptions
 - Does not add speculative or imagined details
 - Maintains perfect accuracy in dates, numbers, and specific details
 - Appropriately indicates uncertainty when information is incomplete
</Rubric>

<Instructions>
 - Read the input context thoroughly
 - Identify all claims made in the output
 - Cross-reference each claim with the input context
 - Note any unsupported or contradictory information
 - Consider the severity and quantity of hallucinations
</Instructions>

<Reminder>
 Focus solely on factual accuracy and support from the input context. Do not consider style, grammar, or presentation in scoring. A shorter, factual response should score higher than a longer response with unsupported claims.
</Reminder>

Use the following context to help you evaluate for hallucinations in the output:

<context>
{context}
</context>

<input>
{inputs}
</input>

<output>
{outputs}
</output>

If available, you may also use the reference outputs below to help you identify hallucinations in the response:

<reference_outputs>
{reference_outputs}
</reference_outputs>
`;
@@ -0,0 +1,49 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.createEmbeddingSimilarityEvaluator = void 0;
4
+ const utils_js_1 = require("../utils.cjs");
5
// Computes a similarity score between two embedding vectors using either
// cosine similarity or a raw dot product, rounded to two decimal places.
const handleEmbeddingOutputs = (algorithm, receivedEmbedding, expectedEmbedding) => {
    const dot = (v1, v2) => {
        let total = 0;
        for (let i = 0; i < v1.length; i += 1) {
            total += v1[i] * v2[i];
        }
        return total;
    };
    const magnitude = (v) => {
        let sumOfSquares = 0;
        for (const component of v) {
            sumOfSquares += component * component;
        }
        return Math.sqrt(sumOfSquares);
    };
    let similarity;
    if (algorithm === "cosine") {
        similarity = dot(receivedEmbedding, expectedEmbedding) /
            (magnitude(receivedEmbedding) * magnitude(expectedEmbedding));
    }
    else {
        similarity = dot(receivedEmbedding, expectedEmbedding);
    }
    // Round to two decimals for a stable, human-readable score.
    return Number(similarity.toFixed(2));
};
21
/**
 * Creates an evaluator that compares the actual output and reference output for similarity by text embedding distance.
 * @param {Object} options - The configuration options
 * @param {Embeddings} options.embeddings - The embeddings model to use for similarity comparison
 * @param {('cosine'|'dot_product')} [options.algorithm='cosine'] - The algorithm to use for embedding similarity
 * @returns An evaluator that returns a score representing the embedding similarity
 */
const createEmbeddingSimilarityEvaluator = ({ embeddings, algorithm = "cosine", }) => {
    // Fail fast at construction time on unsupported algorithm names.
    if (algorithm !== "cosine" && algorithm !== "dot_product") {
        throw new Error(`Unsupported algorithm: ${algorithm}. Only 'cosine' and 'dot_product' are supported.`);
    }
    // Non-string payloads are serialized so they can be embedded as text.
    const asText = (value) => (typeof value === "string" ? value : JSON.stringify(value));
    return async (params) => {
        const { outputs, referenceOutputs } = params;
        if (outputs == null || referenceOutputs == null) {
            throw new Error("Embedding similarity requires both outputs and referenceOutputs");
        }
        const outputString = asText(outputs);
        const referenceOutputString = asText(referenceOutputs);
        const getScore = async () => {
            const receivedEmbedding = await embeddings.embedQuery(outputString);
            const expectedEmbedding = await embeddings.embedQuery(referenceOutputString);
            return handleEmbeddingOutputs(algorithm, receivedEmbedding, expectedEmbedding);
        };
        return (0, utils_js_1._runEvaluator)("embedding_similarity", getScore, "embedding_similarity");
    };
};
exports.createEmbeddingSimilarityEvaluator = createEmbeddingSimilarityEvaluator;
@@ -0,0 +1,18 @@
1
import { EvaluatorResult } from "../types.js";
import { Embeddings } from "@langchain/core/embeddings";
// Options accepted by the embedding-similarity evaluator factory.
interface EmbeddingSimilarityOptions {
    // Embeddings model used to embed both the output and the reference output.
    embeddings: Embeddings;
    // Similarity algorithm; defaults to "cosine" in the implementation.
    algorithm?: "cosine" | "dot_product";
}
/**
 * Creates an evaluator that compares the actual output and reference output for similarity by text embedding distance.
 * @param {Object} options - The configuration options
 * @param {Embeddings} options.embeddings - The embeddings model to use for similarity comparison
 * @param {('cosine'|'dot_product')} [options.algorithm='cosine'] - The algorithm to use for embedding similarity
 * @returns An evaluator that returns a score representing the embedding similarity
 */
export declare const createEmbeddingSimilarityEvaluator: ({ embeddings, algorithm, }: EmbeddingSimilarityOptions) => (params: {
    outputs: unknown;
    referenceOutputs: unknown;
}) => Promise<EvaluatorResult>;
export {};
@@ -0,0 +1,45 @@
1
+ import { _runEvaluator } from "../utils.js";
2
// Computes a similarity score between two embedding vectors using either
// cosine similarity or a raw dot product, rounded to two decimal places.
const handleEmbeddingOutputs = (algorithm, receivedEmbedding, expectedEmbedding) => {
    const dot = (v1, v2) => {
        let total = 0;
        for (let i = 0; i < v1.length; i += 1) {
            total += v1[i] * v2[i];
        }
        return total;
    };
    const magnitude = (v) => {
        let sumOfSquares = 0;
        for (const component of v) {
            sumOfSquares += component * component;
        }
        return Math.sqrt(sumOfSquares);
    };
    let similarity;
    if (algorithm === "cosine") {
        similarity = dot(receivedEmbedding, expectedEmbedding) /
            (magnitude(receivedEmbedding) * magnitude(expectedEmbedding));
    }
    else {
        similarity = dot(receivedEmbedding, expectedEmbedding);
    }
    // Round to two decimals for a stable, human-readable score.
    return Number(similarity.toFixed(2));
};
18
/**
 * Creates an evaluator that compares the actual output and reference output for similarity by text embedding distance.
 * @param {Object} options - The configuration options
 * @param {Embeddings} options.embeddings - The embeddings model to use for similarity comparison
 * @param {('cosine'|'dot_product')} [options.algorithm='cosine'] - The algorithm to use for embedding similarity
 * @returns An evaluator that returns a score representing the embedding similarity
 */
export const createEmbeddingSimilarityEvaluator = ({ embeddings, algorithm = "cosine", }) => {
    // Fail fast at construction time on unsupported algorithm names.
    if (algorithm !== "cosine" && algorithm !== "dot_product") {
        throw new Error(`Unsupported algorithm: ${algorithm}. Only 'cosine' and 'dot_product' are supported.`);
    }
    // Non-string payloads are serialized so they can be embedded as text.
    const asText = (value) => (typeof value === "string" ? value : JSON.stringify(value));
    return async (params) => {
        const { outputs, referenceOutputs } = params;
        if (outputs == null || referenceOutputs == null) {
            throw new Error("Embedding similarity requires both outputs and referenceOutputs");
        }
        const outputString = asText(outputs);
        const referenceOutputString = asText(referenceOutputs);
        const getScore = async () => {
            const receivedEmbedding = await embeddings.embedQuery(outputString);
            const expectedEmbedding = await embeddings.embedQuery(referenceOutputString);
            return handleEmbeddingOutputs(algorithm, receivedEmbedding, expectedEmbedding);
        };
        return _runEvaluator("embedding_similarity", getScore, "embedding_similarity");
    };
};