agentevals 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. package/LICENSE +19 -0
  2. package/README.md +1 -0
  3. package/dist/evaluators/exact.cjs +23 -0
  4. package/dist/evaluators/exact.d.ts +10 -0
  5. package/dist/evaluators/exact.js +19 -0
  6. package/dist/evaluators/llm.cjs +284 -0
  7. package/dist/evaluators/llm.d.ts +73 -0
  8. package/dist/evaluators/llm.js +279 -0
  9. package/dist/evaluators/prompts/conciseness.cjs +42 -0
  10. package/dist/evaluators/prompts/conciseness.d.ts +1 -0
  11. package/dist/evaluators/prompts/conciseness.js +39 -0
  12. package/dist/evaluators/prompts/correctness.cjs +46 -0
  13. package/dist/evaluators/prompts/correctness.d.ts +1 -0
  14. package/dist/evaluators/prompts/correctness.js +43 -0
  15. package/dist/evaluators/prompts/hallucination.cjs +46 -0
  16. package/dist/evaluators/prompts/hallucination.d.ts +1 -0
  17. package/dist/evaluators/prompts/hallucination.js +43 -0
  18. package/dist/evaluators/string/embedding_similarity.cjs +49 -0
  19. package/dist/evaluators/string/embedding_similarity.d.ts +18 -0
  20. package/dist/evaluators/string/embedding_similarity.js +45 -0
  21. package/dist/evaluators/string/levenshtein.cjs +57 -0
  22. package/dist/evaluators/string/levenshtein.d.ts +11 -0
  23. package/dist/evaluators/string/levenshtein.js +53 -0
  24. package/dist/evaluators/trajectory/llm.cjs +86 -0
  25. package/dist/evaluators/trajectory/llm.d.ts +49 -0
  26. package/dist/evaluators/trajectory/llm.js +82 -0
  27. package/dist/evaluators/trajectory/strict.cjs +58 -0
  28. package/dist/evaluators/trajectory/strict.d.ts +10 -0
  29. package/dist/evaluators/trajectory/strict.js +54 -0
  30. package/dist/evaluators/trajectory/subset.cjs +32 -0
  31. package/dist/evaluators/trajectory/subset.d.ts +23 -0
  32. package/dist/evaluators/trajectory/subset.js +28 -0
  33. package/dist/evaluators/trajectory/superset.cjs +32 -0
  34. package/dist/evaluators/trajectory/superset.d.ts +23 -0
  35. package/dist/evaluators/trajectory/superset.js +28 -0
  36. package/dist/evaluators/trajectory/unordered.cjs +33 -0
  37. package/dist/evaluators/trajectory/unordered.d.ts +23 -0
  38. package/dist/evaluators/trajectory/unordered.js +29 -0
  39. package/dist/evaluators/trajectory/utils.cjs +68 -0
  40. package/dist/evaluators/trajectory/utils.d.ts +3 -0
  41. package/dist/evaluators/trajectory/utils.js +63 -0
  42. package/dist/evaluators/types.cjs +2 -0
  43. package/dist/evaluators/types.d.ts +44 -0
  44. package/dist/evaluators/types.js +1 -0
  45. package/dist/evaluators/utils.cjs +85 -0
  46. package/dist/evaluators/utils.d.ts +13 -0
  47. package/dist/evaluators/utils.js +78 -0
  48. package/dist/index.cjs +43 -0
  49. package/dist/index.d.ts +13 -0
  50. package/dist/index.js +13 -0
  51. package/index.cjs +1 -0
  52. package/index.d.cts +1 -0
  53. package/index.d.ts +1 -0
  54. package/index.js +1 -0
  55. package/package.json +60 -0
@@ -0,0 +1,279 @@
1
+ import { Runnable } from "@langchain/core/runnables";
2
+ import { ChatPromptTemplate } from "@langchain/core/prompts";
3
+ import { initChatModel } from "langchain/chat_models/universal";
4
+ import { traceable } from "langsmith/traceable";
5
+ import { _runEvaluator, _normalizeToOpenAIMessagesList } from "./utils.js";
6
// Type guard: true when `prompt` is a LangChain Runnable (e.g. a prompt
// template object) rather than a plain string template or a message-building
// function. Delegates to LangChain's own static check.
function _isRunnableInterface(prompt) {
    const candidate = prompt;
    return Runnable.isRunnable(candidate);
}
9
// Duck-typed check for a LangChain chat model: any object exposing a
// `_modelType()` method that reports "base_chat_model". Returns false for
// null/undefined, primitives, and objects without that marker method.
function _isBaseChatModel(x) {
    if (x == null || typeof x !== "object") {
        return false;
    }
    const candidate = x;
    if (typeof candidate._modelType !== "function") {
        return false;
    }
    return candidate._modelType() === "base_chat_model";
}
16
/**
 * Appends serialized few-shot examples to the content of the LAST user
 * message in `messages`. Mutates `messages` in place and returns it.
 * Each example is rendered as an XML-ish <example> block containing the
 * JSON-serialized inputs/outputs plus optional reasoning and score.
 * Throws if the prompt contains no user message to attach the examples to.
 */
function appendFewShotExamples({ messages, fewShotExamples, }) {
    // Scan backwards for the last user message; examples belong at the end
    // of the conversation context.
    let targetIdx = -1;
    for (let i = messages.length - 1; i >= 0; i -= 1) {
        if (messages[i].role === "user") {
            targetIdx = i;
            break;
        }
    }
    if (targetIdx === -1) {
        throw new Error("Appending few-shot examples requires a user message in the provided prompt");
    }
    const renderedExamples = fewShotExamples.map((example) => {
        const parts = [
            `<example>\n<input>${JSON.stringify(example.inputs)}</input>\n<output>${JSON.stringify(example.outputs)}</output>`,
        ];
        if (example.reasoning) {
            parts.push(`<reasoning>${example.reasoning}</reasoning>`);
        }
        // `!== undefined` (not truthiness) so a legitimate score of 0 is kept.
        if (example.score !== undefined) {
            parts.push(`<score>${example.score}</score>`);
        }
        parts.push("</example>");
        return parts.join("\n");
    });
    messages[targetIdx].content += "\n\n" + renderedExamples.join("\n");
    return messages;
}
43
/**
 * Builds the JSON schema the judge model must answer with, plus the
 * human-readable description of the score field.
 *
 * Score shape is selected by priority: explicit `choices` (numeric enum),
 * then `continuous` (0.0–1.0 number), otherwise a boolean pass/fail.
 * A caller-supplied `schema` is returned untouched (same reference); only
 * the default schema gets `score`/`reasoning` properties attached.
 * Returns a `[jsonSchema, description]` pair.
 */
function constructOutputSchema({ schema, continuous, choices, useReasoning, }) {
    let description;
    let scoreSchema;
    if (choices) {
        description =
            "A number that represents the degree to which the criteria in the prompt are met.";
        scoreSchema = {
            type: "number",
            description,
            enum: choices,
        };
    }
    else if (continuous) {
        description =
            "A number that represents the degree to which the criteria in the prompt are met, from 0.0 to 1.0. 1.0 means the criteria are met perfectly. 0.0 means none of the criteria are met.";
        scoreSchema = {
            type: "number",
            description,
        };
    }
    else {
        description =
            "A score that is true if criteria in the prompt are met, and false otherwise.";
        scoreSchema = {
            type: "boolean",
            description,
        };
    }
    // Custom schemas pass through unmodified; the description is still
    // returned so callers can surface it to the model.
    if (schema) {
        return [schema, description];
    }
    const jsonSchema = {
        type: "object",
        additionalProperties: false,
        strict: true,
    };
    if (useReasoning) {
        jsonSchema.properties = {
            reasoning: {
                type: "string",
                description: "A human-readable explanation of the score. You MUST end the reasoning with a sentence that says: Thus, the score should be: SCORE_YOU_ASSIGN.",
            },
            score: scoreSchema,
        };
        jsonSchema.required = ["reasoning", "score"];
    }
    else {
        jsonSchema.properties = {
            score: scoreSchema,
        };
        jsonSchema.required = ["score"];
    }
    return [jsonSchema, description];
}
96
/**
 * Builds the async scoring closure used by `createLLMAsJudge`.
 *
 * The returned `getScore` formats the configured prompt with the run's
 * inputs/outputs/reference outputs, invokes the judge model with a
 * structured-output schema from `constructOutputSchema`, and returns either
 * a bare score, a `[score, reasoning]` pair, or — when a custom `schema`
 * was supplied — the raw parsed model response.
 *
 * NOTE(review): the inner `params` of `getScore` shadows the outer factory
 * `params`; later, the OpenAI-path request object shadows it again.
 */
export const _createLLMAsJudgeScorer = (params) => {
    const { prompt, system, schema, model, continuous, choices, fewShotExamples, } = params;
    // `judge` is deliberately `let`: when omitted, it is lazily created from
    // `model` on first call and memoized in this closure for later calls.
    let judge = params.judge;
    const useReasoning = params.useReasoning ?? true;
    const getScore = async (params) => {
        let { inputs, outputs, referenceOutputs, ...rest } = params;
        // A separate system message only makes sense for string templates;
        // runnable/function prompts control their own message list.
        if (system && typeof prompt !== "string") {
            throw new Error("`system` is only supported when `prompt` is a string template");
        }
        // Non-string payloads are JSON-serialized before being interpolated
        // into the prompt template.
        let stringifiedInputs = inputs;
        let stringifiedOutputs = outputs;
        let stringifiedReferenceOutputs = referenceOutputs;
        if (inputs && typeof inputs !== "string") {
            stringifiedInputs = JSON.stringify(inputs);
        }
        if (outputs && typeof outputs !== "string") {
            stringifiedOutputs = JSON.stringify(outputs);
        }
        if (referenceOutputs && typeof referenceOutputs !== "string") {
            stringifiedReferenceOutputs = JSON.stringify(referenceOutputs);
        }
        // Any extra kwargs are forwarded as additional template variables.
        const stringifiedRest = Object.fromEntries(Object.entries(rest).map(([key, value]) => [
            key,
            typeof value === "string" ? value : JSON.stringify(value),
        ]));
        let messages = [];
        // Three prompt flavors: a LangChain Runnable, a string template, or a
        // function returning messages. Note the function flavor receives the
        // RAW (un-stringified) values.
        if (_isRunnableInterface(prompt)) {
            const formattedPrompt = await prompt.invoke({
                inputs: stringifiedInputs,
                outputs: stringifiedOutputs,
                reference_outputs: stringifiedReferenceOutputs,
                ...stringifiedRest,
            });
            messages = formattedPrompt.messages;
        }
        else if (typeof prompt === "string") {
            const template = ChatPromptTemplate.fromTemplate(prompt);
            const formattedPrompt = await template.invoke({
                inputs: stringifiedInputs,
                outputs: stringifiedOutputs,
                reference_outputs: stringifiedReferenceOutputs,
                ...stringifiedRest,
            });
            messages = formattedPrompt.messages;
        }
        else {
            messages = await prompt({
                inputs,
                outputs,
                reference_outputs: referenceOutputs,
                ...rest,
            });
        }
        if (system) {
            messages = [{ role: "system", content: system }, ...messages];
        }
        let normalizedMessages = _normalizeToOpenAIMessagesList(messages);
        if (fewShotExamples) {
            // Mutates and returns the normalized list with examples appended
            // to the last user message.
            normalizedMessages = appendFewShotExamples({
                messages: normalizedMessages,
                fewShotExamples,
            });
        }
        const [jsonSchema, description] = constructOutputSchema({
            schema,
            continuous,
            choices,
            useReasoning,
        });
        // Lazy judge creation; assignment memoizes into the outer closure.
        if (!judge) {
            judge = await initChatModel(model);
        }
        let response;
        if (_isBaseChatModel(judge)) {
            // LangChain path: structured output handles schema enforcement.
            response = await judge
                .withStructuredOutput({
                title: "score",
                description,
                ...jsonSchema,
            })
                .invoke(normalizedMessages);
            if (schema === undefined) {
                if (useReasoning) {
                    return [response.score, response.reasoning];
                }
                return response.score;
            }
            else {
                // Custom schema: caller interprets the raw structured output.
                return response;
            }
        }
        else {
            // OpenAI-client path: `judge` is assumed to expose
            // `chat.completions.create` — TODO confirm against callers.
            if (!model) {
                throw new Error("`model` is required for non-LangChain clients");
            }
            const params = {
                messages: normalizedMessages,
                model,
                response_format: {
                    type: "json_schema",
                    json_schema: {
                        name: "score",
                        strict: true,
                        schema: jsonSchema,
                    },
                },
            };
            // Wrap the raw client call so it shows up as an LLM run in
            // LangSmith traces.
            const invokeLlm = traceable(judge.chat.completions.create.bind(judge.chat.completions), {
                metadata: {
                    ls_provider: "openai",
                    ls_model_name: model,
                    ls_model_type: "chat",
                },
                run_type: "llm",
                name: "OpenAI Chat Completion",
            });
            const response = await invokeLlm(params);
            const parsed = JSON.parse(response.choices[0].message.content);
            if (schema === undefined) {
                if (useReasoning) {
                    return [parsed.score, parsed.reasoning];
                }
                return parsed.score;
            }
            return parsed;
        }
    };
    return getScore;
};
225
/**
 * Create an evaluator that uses an LLM to assess output quality based on specified criteria.
 *
 * @param params Configuration object with the following properties:
 * @param params.prompt The evaluation prompt - can be a string template, LangChain prompt template,
 * or function that returns a list of chat messages
 * @param params.feedbackKey Key used to store the evaluation result, defaults to "score"
 * @param params.judge The LLM used for evaluation. Can be an OpenAI client or a LangChain model.
 * If using OpenAI client, must specify "model" parameter.
 * If omitted, "model" will be used to instantiate a LangChain model instance.
 * @param params.model Model identifier to use. Defaults to "openai:o3-mini".
 * If "judge" is an OpenAI client, this should be a model name directly.
 * If "judge" is omitted, must be a valid LangChain model identifier.
 * @param params.system Optional system message to prepend to the prompt
 * @param params.continuous If true, score will be a float between 0 and 1.
 * If false, score will be boolean. Defaults to false.
 * @param params.choices Optional list of specific float values the score must be chosen from
 * @param params.useReasoning If true, includes explanation for the score in the output.
 * Defaults to true.
 * @param params.fewShotExamples Optional list of example evaluations to append to the prompt
 *
 * @returns A function that takes inputs, outputs, reference_outputs, and other kwargs,
 * formats them into a prompt, invokes the judge, and returns an evaluation result
 *
 * @example
 * ```typescript
 * import { createLLMAsJudge } from "openevals";
 *
 * const evaluator = createLLMAsJudge({
 *   prompt: "Rate the quality of this response from 0 to 1: {outputs}",
 *   continuous: true,
 * });
 * const result = await evaluator({
 *   inputs: { question: "What color is the sky?" },
 *   outputs: { response: "Blue" },
 * });
 * ```
 */
export const createLLMAsJudge = ({ prompt, feedbackKey = "score", model = "openai:o3-mini", system, judge, continuous = false, choices, useReasoning = true, fewShotExamples, }) => {
    const scorer = _createLLMAsJudgeScorer({
        prompt,
        judge,
        model,
        system,
        continuous,
        choices,
        useReasoning,
        fewShotExamples,
    });
    const _wrappedEvaluator = async (inputs) => {
        // Bug fix: the original ternary had its branches swapped — a custom
        // feedbackKey produced the generic "llm_as_judge" run name while the
        // default "score" produced "llm_as_score_judge". A custom key must
        // yield `llm_as_<key>_judge`; the default yields "llm_as_judge".
        const runName = feedbackKey === "score" ? "llm_as_judge" : `llm_as_${feedbackKey}_judge`;
        return _runEvaluator(runName, scorer, feedbackKey, inputs);
    };
    return _wrappedEvaluator;
};
@@ -0,0 +1,42 @@
1
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.CONCISENESS_PROMPT = void 0;
// Prompt for an LLM-as-judge conciseness evaluator.
// {inputs} and {outputs} are single-brace prompt-template variables filled in
// by the llm evaluator's ChatPromptTemplate — NOT JS template interpolation.
exports.CONCISENESS_PROMPT = `You are an expert data labeler evaluating model outputs for conciseness. Your task is to assign a score based on the following rubric:

<Rubric>
 A perfectly concise answer:
 - Contains only the exact information requested.
 - Uses the minimum number of words necessary to convey the complete answer.
 - Omits pleasantries, hedging language, and unnecessary context.
 - Excludes meta-commentary about the answer or the model's capabilities.
 - Avoids redundant information or restatements.
 - Does not include explanations unless explicitly requested.

 When scoring, you should deduct points for:
 - Introductory phrases like "I believe," "I think," or "The answer is."
 - Hedging language like "probably," "likely," or "as far as I know."
 - Unnecessary context or background information.
 - Explanations when not requested.
 - Follow-up questions or offers for more information.
 - Redundant information or restatements.
 - Polite phrases like "hope this helps" or "let me know if you need anything else."
</Rubric>

<Instructions>
 - Carefully read the input and output.
 - Check for any unnecessary elements, particularly those mentioned in the <Rubric> above.
 - The score should reflect how close the response comes to containing only the essential information requested based on the rubric above.
</Instructions>

<Reminder>
 The goal is to reward responses that provide complete answers with absolutely no extraneous information.
</Reminder>

<input>
{inputs}
</input>

<output>
{outputs}
</output>
`;
@@ -0,0 +1 @@
1
/** Prompt template for the conciseness LLM-as-judge evaluator; `{inputs}`/`{outputs}` are prompt-template variables. */
export declare const CONCISENESS_PROMPT = "You are an expert data labeler evaluating model outputs for conciseness. Your task is to assign a score based on the following rubric:\n\n<Rubric>\n A perfectly concise answer:\n - Contains only the exact information requested.\n - Uses the minimum number of words necessary to convey the complete answer.\n - Omits pleasantries, hedging language, and unnecessary context.\n - Excludes meta-commentary about the answer or the model's capabilities.\n - Avoids redundant information or restatements.\n - Does not include explanations unless explicitly requested.\n\n When scoring, you should deduct points for:\n - Introductory phrases like \"I believe,\" \"I think,\" or \"The answer is.\"\n - Hedging language like \"probably,\" \"likely,\" or \"as far as I know.\"\n - Unnecessary context or background information.\n - Explanations when not requested.\n - Follow-up questions or offers for more information.\n - Redundant information or restatements.\n - Polite phrases like \"hope this helps\" or \"let me know if you need anything else.\"\n</Rubric>\n\n<Instructions>\n - Carefully read the input and output.\n - Check for any unnecessary elements, particularly those mentioned in the <Rubric> above.\n - The score should reflect how close the response comes to containing only the essential information requested based on the rubric above.\n</Instructions>\n\n<Reminder>\n The goal is to reward responses that provide complete answers with absolutely no extraneous information.\n</Reminder>\n\n<input>\n{inputs}\n</input>\n\n<output>\n{outputs}\n</output>\n";
@@ -0,0 +1,39 @@
1
// Prompt for an LLM-as-judge conciseness evaluator.
// {inputs} and {outputs} are single-brace prompt-template variables filled in
// by the llm evaluator's ChatPromptTemplate — NOT JS template interpolation.
export const CONCISENESS_PROMPT = `You are an expert data labeler evaluating model outputs for conciseness. Your task is to assign a score based on the following rubric:

<Rubric>
 A perfectly concise answer:
 - Contains only the exact information requested.
 - Uses the minimum number of words necessary to convey the complete answer.
 - Omits pleasantries, hedging language, and unnecessary context.
 - Excludes meta-commentary about the answer or the model's capabilities.
 - Avoids redundant information or restatements.
 - Does not include explanations unless explicitly requested.

 When scoring, you should deduct points for:
 - Introductory phrases like "I believe," "I think," or "The answer is."
 - Hedging language like "probably," "likely," or "as far as I know."
 - Unnecessary context or background information.
 - Explanations when not requested.
 - Follow-up questions or offers for more information.
 - Redundant information or restatements.
 - Polite phrases like "hope this helps" or "let me know if you need anything else."
</Rubric>

<Instructions>
 - Carefully read the input and output.
 - Check for any unnecessary elements, particularly those mentioned in the <Rubric> above.
 - The score should reflect how close the response comes to containing only the essential information requested based on the rubric above.
</Instructions>

<Reminder>
 The goal is to reward responses that provide complete answers with absolutely no extraneous information.
</Reminder>

<input>
{inputs}
</input>

<output>
{outputs}
</output>
`;
@@ -0,0 +1,46 @@
1
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.CORRECTNESS_PROMPT = void 0;
// Prompt for an LLM-as-judge correctness evaluator.
// {inputs}, {outputs} and {reference_outputs} are single-brace
// prompt-template variables filled in by the llm evaluator's
// ChatPromptTemplate — NOT JS template interpolation.
exports.CORRECTNESS_PROMPT = `You are an expert data labeler evaluating model outputs for correctness. Your task is to assign a score based on the following rubric:

<Rubric>
 A correct answer:
 - Provides accurate and complete information
 - Contains no factual errors
 - Addresses all parts of the question
 - Is logically consistent
 - Uses precise and accurate terminology

 When scoring, you should penalize:
 - Factual errors or inaccuracies
 - Incomplete or partial answers
 - Misleading or ambiguous statements
 - Incorrect terminology
 - Logical inconsistencies
 - Missing key information
</Rubric>

<Instructions>
 - Carefully read the input and output
 - Check for factual accuracy and completeness
 - Focus on correctness of information rather than style or verbosity
</Instructions>

<Reminder>
 The goal is to evaluate factual correctness and completeness of the response.
</Reminder>

<input>
{inputs}
</input>

<output>
{outputs}
</output>

If available, you may use the reference outputs below to help you evaluate the correctness of the response:

<reference_outputs>
{reference_outputs}
</reference_outputs>
`;
@@ -0,0 +1 @@
1
/** Prompt template for the correctness LLM-as-judge evaluator; `{inputs}`/`{outputs}`/`{reference_outputs}` are prompt-template variables. */
export declare const CORRECTNESS_PROMPT = "You are an expert data labeler evaluating model outputs for correctness. Your task is to assign a score based on the following rubric:\n\n<Rubric>\n A correct answer:\n - Provides accurate and complete information\n - Contains no factual errors\n - Addresses all parts of the question\n - Is logically consistent\n - Uses precise and accurate terminology\n\n When scoring, you should penalize:\n - Factual errors or inaccuracies\n - Incomplete or partial answers\n - Misleading or ambiguous statements\n - Incorrect terminology\n - Logical inconsistencies\n - Missing key information\n</Rubric>\n\n<Instructions>\n - Carefully read the input and output\n - Check for factual accuracy and completeness\n - Focus on correctness of information rather than style or verbosity\n</Instructions>\n\n<Reminder>\n The goal is to evaluate factual correctness and completeness of the response.\n</Reminder>\n\n<input>\n{inputs}\n</input>\n\n<output>\n{outputs}\n</output>\n\nIf available, you may use the reference outputs below to help you evaluate the correctness of the response:\n\n<reference_outputs>\n{reference_outputs}\n</reference_outputs>\n";
@@ -0,0 +1,43 @@
1
// Prompt for an LLM-as-judge correctness evaluator.
// {inputs}, {outputs} and {reference_outputs} are single-brace
// prompt-template variables filled in by the llm evaluator's
// ChatPromptTemplate — NOT JS template interpolation.
export const CORRECTNESS_PROMPT = `You are an expert data labeler evaluating model outputs for correctness. Your task is to assign a score based on the following rubric:

<Rubric>
 A correct answer:
 - Provides accurate and complete information
 - Contains no factual errors
 - Addresses all parts of the question
 - Is logically consistent
 - Uses precise and accurate terminology

 When scoring, you should penalize:
 - Factual errors or inaccuracies
 - Incomplete or partial answers
 - Misleading or ambiguous statements
 - Incorrect terminology
 - Logical inconsistencies
 - Missing key information
</Rubric>

<Instructions>
 - Carefully read the input and output
 - Check for factual accuracy and completeness
 - Focus on correctness of information rather than style or verbosity
</Instructions>

<Reminder>
 The goal is to evaluate factual correctness and completeness of the response.
</Reminder>

<input>
{inputs}
</input>

<output>
{outputs}
</output>

If available, you may use the reference outputs below to help you evaluate the correctness of the response:

<reference_outputs>
{reference_outputs}
</reference_outputs>
`;
@@ -0,0 +1,46 @@
1
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.HALLUCINATION_PROMPT = void 0;
// Prompt for an LLM-as-judge hallucination evaluator.
// {context}, {inputs}, {outputs} and {reference_outputs} are single-brace
// prompt-template variables filled in by the llm evaluator's
// ChatPromptTemplate — NOT JS template interpolation.
exports.HALLUCINATION_PROMPT = `You are an expert data labeler evaluating model outputs for hallucinations. Your task is to assign a score based on the following rubric:

<Rubric>
 A response without hallucinations:
 - Contains only verifiable facts that are directly supported by the input context
 - Makes no unsupported claims or assumptions
 - Does not add speculative or imagined details
 - Maintains perfect accuracy in dates, numbers, and specific details
 - Appropriately indicates uncertainty when information is incomplete
</Rubric>

<Instructions>
 - Read the input context thoroughly
 - Identify all claims made in the output
 - Cross-reference each claim with the input context
 - Note any unsupported or contradictory information
 - Consider the severity and quantity of hallucinations
</Instructions>

<Reminder>
 Focus solely on factual accuracy and support from the input context. Do not consider style, grammar, or presentation in scoring. A shorter, factual response should score higher than a longer response with unsupported claims.
</Reminder>

Use the following context to help you evaluate for hallucinations in the output:

<context>
{context}
</context>

<input>
{inputs}
</input>

<output>
{outputs}
</output>

If available, you may also use the reference outputs below to help you identify hallucinations in the response:

<reference_outputs>
{reference_outputs}
</reference_outputs>
`;
@@ -0,0 +1 @@
1
/** Prompt template for the hallucination LLM-as-judge evaluator; `{context}`/`{inputs}`/`{outputs}`/`{reference_outputs}` are prompt-template variables. */
export declare const HALLUCINATION_PROMPT = "You are an expert data labeler evaluating model outputs for hallucinations. Your task is to assign a score based on the following rubric:\n\n<Rubric>\n A response without hallucinations:\n - Contains only verifiable facts that are directly supported by the input context\n - Makes no unsupported claims or assumptions\n - Does not add speculative or imagined details\n - Maintains perfect accuracy in dates, numbers, and specific details\n - Appropriately indicates uncertainty when information is incomplete\n</Rubric>\n\n<Instructions>\n - Read the input context thoroughly\n - Identify all claims made in the output\n - Cross-reference each claim with the input context\n - Note any unsupported or contradictory information\n - Consider the severity and quantity of hallucinations\n</Instructions>\n\n<Reminder>\n Focus solely on factual accuracy and support from the input context. Do not consider style, grammar, or presentation in scoring. A shorter, factual response should score higher than a longer response with unsupported claims.\n</Reminder>\n\nUse the following context to help you evaluate for hallucinations in the output:\n\n<context>\n{context}\n</context>\n\n<input>\n{inputs}\n</input>\n\n<output>\n{outputs}\n</output>\n\nIf available, you may also use the reference outputs below to help you identify hallucinations in the response:\n\n<reference_outputs>\n{reference_outputs}\n</reference_outputs>\n";
@@ -0,0 +1,43 @@
1
// Prompt for an LLM-as-judge hallucination evaluator.
// {context}, {inputs}, {outputs} and {reference_outputs} are single-brace
// prompt-template variables filled in by the llm evaluator's
// ChatPromptTemplate — NOT JS template interpolation.
export const HALLUCINATION_PROMPT = `You are an expert data labeler evaluating model outputs for hallucinations. Your task is to assign a score based on the following rubric:

<Rubric>
 A response without hallucinations:
 - Contains only verifiable facts that are directly supported by the input context
 - Makes no unsupported claims or assumptions
 - Does not add speculative or imagined details
 - Maintains perfect accuracy in dates, numbers, and specific details
 - Appropriately indicates uncertainty when information is incomplete
</Rubric>

<Instructions>
 - Read the input context thoroughly
 - Identify all claims made in the output
 - Cross-reference each claim with the input context
 - Note any unsupported or contradictory information
 - Consider the severity and quantity of hallucinations
</Instructions>

<Reminder>
 Focus solely on factual accuracy and support from the input context. Do not consider style, grammar, or presentation in scoring. A shorter, factual response should score higher than a longer response with unsupported claims.
</Reminder>

Use the following context to help you evaluate for hallucinations in the output:

<context>
{context}
</context>

<input>
{inputs}
</input>

<output>
{outputs}
</output>

If available, you may also use the reference outputs below to help you identify hallucinations in the response:

<reference_outputs>
{reference_outputs}
</reference_outputs>
`;
@@ -0,0 +1,49 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.createEmbeddingSimilarityEvaluator = void 0;
4
+ const utils_js_1 = require("../utils.cjs");
5
// Computes a similarity score between two embedding vectors using either
// cosine similarity or a raw dot product, rounded to two decimal places.
const handleEmbeddingOutputs = (algorithm, receivedEmbedding, expectedEmbedding) => {
    const dot = (v1, v2) => {
        let total = 0;
        for (let i = 0; i < v1.length; i += 1) {
            total += v1[i] * v2[i];
        }
        return total;
    };
    const magnitude = (v) => {
        let sumOfSquares = 0;
        for (const component of v) {
            sumOfSquares += component * component;
        }
        return Math.sqrt(sumOfSquares);
    };
    let similarity;
    if (algorithm === "cosine") {
        similarity = dot(receivedEmbedding, expectedEmbedding) /
            (magnitude(receivedEmbedding) * magnitude(expectedEmbedding));
    }
    else {
        similarity = dot(receivedEmbedding, expectedEmbedding);
    }
    // Round to two decimals for a stable, human-readable score.
    return Number(similarity.toFixed(2));
};
21
/**
 * Creates an evaluator that compares the actual output and reference output for similarity by text embedding distance.
 * @param {Object} options - The configuration options
 * @param {Embeddings} options.embeddings - The embeddings model to use for similarity comparison
 * @param {('cosine'|'dot_product')} [options.algorithm='cosine'] - The algorithm to use for embedding similarity
 * @returns An evaluator that returns a score representing the embedding similarity
 */
const createEmbeddingSimilarityEvaluator = ({ embeddings, algorithm = "cosine", }) => {
    // Fail fast at construction time on unsupported algorithm names.
    if (algorithm !== "cosine" && algorithm !== "dot_product") {
        throw new Error(`Unsupported algorithm: ${algorithm}. Only 'cosine' and 'dot_product' are supported.`);
    }
    // Non-string payloads are serialized so they can be embedded as text.
    const asText = (value) => (typeof value === "string" ? value : JSON.stringify(value));
    return async (params) => {
        const { outputs, referenceOutputs } = params;
        if (outputs == null || referenceOutputs == null) {
            throw new Error("Embedding similarity requires both outputs and referenceOutputs");
        }
        const outputString = asText(outputs);
        const referenceOutputString = asText(referenceOutputs);
        const getScore = async () => {
            const receivedEmbedding = await embeddings.embedQuery(outputString);
            const expectedEmbedding = await embeddings.embedQuery(referenceOutputString);
            return handleEmbeddingOutputs(algorithm, receivedEmbedding, expectedEmbedding);
        };
        return (0, utils_js_1._runEvaluator)("embedding_similarity", getScore, "embedding_similarity");
    };
};
exports.createEmbeddingSimilarityEvaluator = createEmbeddingSimilarityEvaluator;
@@ -0,0 +1,18 @@
1
import { EvaluatorResult } from "../types.js";
import { Embeddings } from "@langchain/core/embeddings";
// Options accepted by the embedding-similarity evaluator factory.
interface EmbeddingSimilarityOptions {
    // Embeddings model used to embed both the output and the reference output.
    embeddings: Embeddings;
    // Similarity algorithm; defaults to "cosine" in the implementation.
    algorithm?: "cosine" | "dot_product";
}
/**
 * Creates an evaluator that compares the actual output and reference output for similarity by text embedding distance.
 * @param {Object} options - The configuration options
 * @param {Embeddings} options.embeddings - The embeddings model to use for similarity comparison
 * @param {('cosine'|'dot_product')} [options.algorithm='cosine'] - The algorithm to use for embedding similarity
 * @returns An evaluator that returns a score representing the embedding similarity
 */
export declare const createEmbeddingSimilarityEvaluator: ({ embeddings, algorithm, }: EmbeddingSimilarityOptions) => (params: {
    outputs: unknown;
    referenceOutputs: unknown;
}) => Promise<EvaluatorResult>;
export {};
@@ -0,0 +1,45 @@
1
+ import { _runEvaluator } from "../utils.js";
2
// Computes a similarity score between two embedding vectors using either
// cosine similarity or a raw dot product, rounded to two decimal places.
const handleEmbeddingOutputs = (algorithm, receivedEmbedding, expectedEmbedding) => {
    const dot = (v1, v2) => {
        let total = 0;
        for (let i = 0; i < v1.length; i += 1) {
            total += v1[i] * v2[i];
        }
        return total;
    };
    const magnitude = (v) => {
        let sumOfSquares = 0;
        for (const component of v) {
            sumOfSquares += component * component;
        }
        return Math.sqrt(sumOfSquares);
    };
    let similarity;
    if (algorithm === "cosine") {
        similarity = dot(receivedEmbedding, expectedEmbedding) /
            (magnitude(receivedEmbedding) * magnitude(expectedEmbedding));
    }
    else {
        similarity = dot(receivedEmbedding, expectedEmbedding);
    }
    // Round to two decimals for a stable, human-readable score.
    return Number(similarity.toFixed(2));
};
18
/**
 * Creates an evaluator that compares the actual output and reference output for similarity by text embedding distance.
 * @param {Object} options - The configuration options
 * @param {Embeddings} options.embeddings - The embeddings model to use for similarity comparison
 * @param {('cosine'|'dot_product')} [options.algorithm='cosine'] - The algorithm to use for embedding similarity
 * @returns An evaluator that returns a score representing the embedding similarity
 */
export const createEmbeddingSimilarityEvaluator = ({ embeddings, algorithm = "cosine", }) => {
    // Fail fast at construction time on unsupported algorithm names.
    if (algorithm !== "cosine" && algorithm !== "dot_product") {
        throw new Error(`Unsupported algorithm: ${algorithm}. Only 'cosine' and 'dot_product' are supported.`);
    }
    // Non-string payloads are serialized so they can be embedded as text.
    const asText = (value) => (typeof value === "string" ? value : JSON.stringify(value));
    return async (params) => {
        const { outputs, referenceOutputs } = params;
        if (outputs == null || referenceOutputs == null) {
            throw new Error("Embedding similarity requires both outputs and referenceOutputs");
        }
        const outputString = asText(outputs);
        const referenceOutputString = asText(referenceOutputs);
        const getScore = async () => {
            const receivedEmbedding = await embeddings.embedQuery(outputString);
            const expectedEmbedding = await embeddings.embedQuery(referenceOutputString);
            return handleEmbeddingOutputs(algorithm, receivedEmbedding, expectedEmbedding);
        };
        return _runEvaluator("embedding_similarity", getScore, "embedding_similarity");
    };
};