agentevals 0.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +19 -0
- package/README.md +1 -0
- package/dist/evaluators/exact.cjs +23 -0
- package/dist/evaluators/exact.d.ts +10 -0
- package/dist/evaluators/exact.js +19 -0
- package/dist/evaluators/llm.cjs +284 -0
- package/dist/evaluators/llm.d.ts +73 -0
- package/dist/evaluators/llm.js +279 -0
- package/dist/evaluators/prompts/conciseness.cjs +42 -0
- package/dist/evaluators/prompts/conciseness.d.ts +1 -0
- package/dist/evaluators/prompts/conciseness.js +39 -0
- package/dist/evaluators/prompts/correctness.cjs +46 -0
- package/dist/evaluators/prompts/correctness.d.ts +1 -0
- package/dist/evaluators/prompts/correctness.js +43 -0
- package/dist/evaluators/prompts/hallucination.cjs +46 -0
- package/dist/evaluators/prompts/hallucination.d.ts +1 -0
- package/dist/evaluators/prompts/hallucination.js +43 -0
- package/dist/evaluators/string/embedding_similarity.cjs +49 -0
- package/dist/evaluators/string/embedding_similarity.d.ts +18 -0
- package/dist/evaluators/string/embedding_similarity.js +45 -0
- package/dist/evaluators/string/levenshtein.cjs +57 -0
- package/dist/evaluators/string/levenshtein.d.ts +11 -0
- package/dist/evaluators/string/levenshtein.js +53 -0
- package/dist/evaluators/trajectory/llm.cjs +86 -0
- package/dist/evaluators/trajectory/llm.d.ts +49 -0
- package/dist/evaluators/trajectory/llm.js +82 -0
- package/dist/evaluators/trajectory/strict.cjs +58 -0
- package/dist/evaluators/trajectory/strict.d.ts +10 -0
- package/dist/evaluators/trajectory/strict.js +54 -0
- package/dist/evaluators/trajectory/subset.cjs +32 -0
- package/dist/evaluators/trajectory/subset.d.ts +23 -0
- package/dist/evaluators/trajectory/subset.js +28 -0
- package/dist/evaluators/trajectory/superset.cjs +32 -0
- package/dist/evaluators/trajectory/superset.d.ts +23 -0
- package/dist/evaluators/trajectory/superset.js +28 -0
- package/dist/evaluators/trajectory/unordered.cjs +33 -0
- package/dist/evaluators/trajectory/unordered.d.ts +23 -0
- package/dist/evaluators/trajectory/unordered.js +29 -0
- package/dist/evaluators/trajectory/utils.cjs +68 -0
- package/dist/evaluators/trajectory/utils.d.ts +3 -0
- package/dist/evaluators/trajectory/utils.js +63 -0
- package/dist/evaluators/types.cjs +2 -0
- package/dist/evaluators/types.d.ts +44 -0
- package/dist/evaluators/types.js +1 -0
- package/dist/evaluators/utils.cjs +85 -0
- package/dist/evaluators/utils.d.ts +13 -0
- package/dist/evaluators/utils.js +78 -0
- package/dist/index.cjs +43 -0
- package/dist/index.d.ts +13 -0
- package/dist/index.js +13 -0
- package/index.cjs +1 -0
- package/index.d.cts +1 -0
- package/index.d.ts +1 -0
- package/index.js +1 -0
- package/package.json +60 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
Copyright (c) 2025 LangChain, Inc.
|
|
2
|
+
|
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
4
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
5
|
+
in the Software without restriction, including without limitation the rights
|
|
6
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
7
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
8
|
+
furnished to do so, subject to the following conditions:
|
|
9
|
+
|
|
10
|
+
The above copyright notice and this permission notice shall be included in all
|
|
11
|
+
copies or substantial portions of the Software.
|
|
12
|
+
|
|
13
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
14
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
15
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
16
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
17
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
18
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
19
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# js
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.exactMatch = void 0;
|
|
4
|
+
const utils_js_1 = require("./utils.cjs");
|
|
5
|
+
const _scorer = (params) => {
|
|
6
|
+
const { outputs, referenceOutputs } = params;
|
|
7
|
+
if (outputs === null || referenceOutputs === null) {
|
|
8
|
+
throw new Error("Exact match requires both outputs and referenceOutputs");
|
|
9
|
+
}
|
|
10
|
+
const outputsJson = JSON.stringify(outputs, null, 2);
|
|
11
|
+
const referenceOutputsJson = JSON.stringify(referenceOutputs, null, 2);
|
|
12
|
+
return outputsJson === referenceOutputsJson;
|
|
13
|
+
};
|
|
14
|
+
/**
|
|
15
|
+
* Performs exact matching between input and output values.
|
|
16
|
+
* @param outputs outputs to compare
|
|
17
|
+
* @param referenceOutputs Reference outputs to compare
|
|
18
|
+
* @returns
|
|
19
|
+
*/
|
|
20
|
+
const exactMatch = async (params) => {
|
|
21
|
+
return (0, utils_js_1._runEvaluator)("exact_match", _scorer, "exact_match", params);
|
|
22
|
+
};
|
|
23
|
+
exports.exactMatch = exactMatch;
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
 * Performs exact matching between output and reference output values by
 * comparing their JSON serializations.
 * @param params.outputs Outputs to compare
 * @param params.referenceOutputs Reference outputs to compare
 * @returns A promise resolving to an evaluation result whose boolean score is
 *   true only when both serializations are identical.
 */
export declare const exactMatch: (params: {
    outputs: unknown;
    referenceOutputs: unknown;
}) => Promise<import("langsmith/vitest").SimpleEvaluationResult>;
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import { _runEvaluator } from "./utils.js";
/**
 * Scorer: compares the JSON serializations of `outputs` and `referenceOutputs`.
 * Throws when either value is missing, since an "exact match" is meaningless
 * without both sides present.
 */
const _scorer = (params) => {
    const { outputs, referenceOutputs } = params;
    // `== null` deliberately catches both null and undefined. The previous
    // `=== null` check let `undefined` through, and JSON.stringify(undefined)
    // returns undefined — two missing values would then compare equal and
    // silently report a match instead of raising an error.
    if (outputs == null || referenceOutputs == null) {
        throw new Error("Exact match requires both outputs and referenceOutputs");
    }
    // Serialize both sides identically so deep-equal structures (with matching
    // key order) produce identical strings.
    const outputsJson = JSON.stringify(outputs, null, 2);
    const referenceOutputsJson = JSON.stringify(referenceOutputs, null, 2);
    return outputsJson === referenceOutputsJson;
};
/**
 * Performs exact matching between output and reference output values.
 * @param params.outputs Outputs to compare
 * @param params.referenceOutputs Reference outputs to compare
 * @returns An evaluation result with a boolean "exact_match" score.
 */
export const exactMatch = async (params) => {
    return _runEvaluator("exact_match", _scorer, "exact_match", params);
};
|
|
@@ -0,0 +1,284 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.createLLMAsJudge = exports._createLLMAsJudgeScorer = void 0;
|
|
4
|
+
const runnables_1 = require("@langchain/core/runnables");
|
|
5
|
+
const prompts_1 = require("@langchain/core/prompts");
|
|
6
|
+
const universal_1 = require("langchain/chat_models/universal");
|
|
7
|
+
const traceable_1 = require("langsmith/traceable");
|
|
8
|
+
const utils_js_1 = require("./utils.cjs");
|
|
9
|
+
// Type guard: defers to LangChain's own runtime check for Runnable instances.
function _isRunnableInterface(candidate) {
    const { Runnable } = runnables_1;
    return Runnable.isRunnable(candidate);
}
|
|
12
|
+
/**
 * Duck-typed check for a LangChain BaseChatModel: a non-null object whose
 * `_modelType()` method reports "base_chat_model".
 */
function _isBaseChatModel(x) {
    if (x == null || typeof x !== "object") {
        return false;
    }
    const candidate = x;
    if (typeof candidate._modelType !== "function") {
        return false;
    }
    return candidate._modelType() === "base_chat_model";
}
|
|
19
|
+
/**
 * Appends serialized few-shot examples to the last user message in `messages`.
 * Mutates that message's content in place and returns the same array.
 * Throws when there is no user message to attach the examples to.
 * NOTE(review): assumes the target message's `content` is a string — confirm
 * against `_normalizeToOpenAIMessagesList` output.
 */
function appendFewShotExamples({ messages, fewShotExamples, }) {
    // Backward scan for the last user message.
    let targetIdx = -1;
    for (let i = messages.length - 1; i >= 0; i -= 1) {
        if (messages[i].role === "user") {
            targetIdx = i;
            break;
        }
    }
    if (targetIdx === -1) {
        throw new Error("Appending few-shot examples requires a user message in the provided prompt");
    }
    // Render one example as an XML-ish block; reasoning and score lines are
    // only emitted when present on the example.
    const renderExample = (example) => {
        const lines = [
            "<example>",
            `<input>${JSON.stringify(example.inputs)}</input>`,
            `<output>${JSON.stringify(example.outputs)}</output>`,
        ];
        if (example.reasoning) {
            lines.push(`<reasoning>${example.reasoning}</reasoning>`);
        }
        if (example.score !== undefined) {
            lines.push(`<score>${example.score}</score>`);
        }
        lines.push("</example>");
        return lines.join("\n");
    };
    messages[targetIdx].content +=
        "\n\n" + fewShotExamples.map(renderExample).join("\n");
    return messages;
}
|
|
46
|
+
/**
 * Builds the JSON schema the judge model must emit, plus a human-readable
 * description of the score field.
 *
 * A caller-supplied `schema` is returned untouched (only the description is
 * computed); otherwise the default schema carries a `score` field — enum when
 * `choices` is set, 0.0–1.0 number when `continuous`, boolean otherwise — and,
 * when `useReasoning` is on, a required `reasoning` field as well.
 */
function constructOutputSchema({ schema, continuous, choices, useReasoning, }) {
    // Start from the caller's schema, or a strict, closed object schema.
    const jsonSchema = schema ?? {
        type: "object",
        additionalProperties: false,
        strict: true,
    };
    let scoreDescription;
    let scoreField;
    if (choices) {
        // Restrict the score to the caller-provided set of values.
        scoreDescription =
            "A number that represents the degree to which the criteria in the prompt are met.";
        scoreField = {
            type: "number",
            description: scoreDescription,
            enum: choices,
        };
    }
    else if (continuous) {
        scoreDescription =
            "A number that represents the degree to which the criteria in the prompt are met, from 0.0 to 1.0. 1.0 means the criteria are met perfectly. 0.0 means none of the criteria are met.";
        scoreField = {
            type: "number",
            description: scoreDescription,
        };
    }
    else {
        scoreDescription =
            "A score that is true if criteria in the prompt are met, and false otherwise.";
        scoreField = {
            type: "boolean",
            description: scoreDescription,
        };
    }
    // Only populate properties on the default schema; a custom schema is the
    // caller's responsibility.
    if (!schema) {
        jsonSchema.properties = useReasoning
            ? {
                reasoning: {
                    type: "string",
                    description: "A human-readable explanation of the score. You MUST end the reasoning with a sentence that says: Thus, the score should be: SCORE_YOU_ASSIGN.",
                },
                score: scoreField,
            }
            : { score: scoreField };
        jsonSchema.required = useReasoning ? ["reasoning", "score"] : ["score"];
    }
    return [jsonSchema, scoreDescription];
}
|
|
99
|
+
/**
 * Internal factory that builds the raw scoring function used by
 * `createLLMAsJudge`. Captures the prompt/judge configuration once and
 * returns an async scorer that formats a prompt, invokes the judge, and
 * extracts the score (and optionally reasoning).
 */
const _createLLMAsJudgeScorer = (params) => {
    const { prompt, system, schema, model, continuous, choices, fewShotExamples, } = params;
    // `judge` is mutable: lazily replaced by initChatModel(model) on first call
    // when no judge was supplied.
    let judge = params.judge;
    const useReasoning = params.useReasoning ?? true;
    // NOTE: this inner `params` shadows the factory's `params` above.
    const getScore = async (params) => {
        let { inputs, outputs, referenceOutputs, ...rest } = params;
        if (system && typeof prompt !== "string") {
            throw new Error("`system` is only supported when `prompt` is a string template");
        }
        // Stringify non-string values so they can be substituted into a string
        // or runnable prompt template. Falsy values (null, undefined, "") pass
        // through unchanged.
        let stringifiedInputs = inputs;
        let stringifiedOutputs = outputs;
        let stringifiedReferenceOutputs = referenceOutputs;
        if (inputs && typeof inputs !== "string") {
            stringifiedInputs = JSON.stringify(inputs);
        }
        if (outputs && typeof outputs !== "string") {
            stringifiedOutputs = JSON.stringify(outputs);
        }
        if (referenceOutputs && typeof referenceOutputs !== "string") {
            stringifiedReferenceOutputs = JSON.stringify(referenceOutputs);
        }
        // Any extra kwargs are stringified the same way and forwarded to the
        // prompt as additional template variables.
        const stringifiedRest = Object.fromEntries(Object.entries(rest).map(([key, value]) => [
            key,
            typeof value === "string" ? value : JSON.stringify(value),
        ]));
        let messages = [];
        // Three prompt forms: a LangChain Runnable, a string template, or a
        // plain (possibly async) function returning chat messages.
        if (_isRunnableInterface(prompt)) {
            const formattedPrompt = await prompt.invoke({
                inputs: stringifiedInputs,
                outputs: stringifiedOutputs,
                reference_outputs: stringifiedReferenceOutputs,
                ...stringifiedRest,
            });
            messages = formattedPrompt.messages;
        }
        else if (typeof prompt === "string") {
            const template = prompts_1.ChatPromptTemplate.fromTemplate(prompt);
            const formattedPrompt = await template.invoke({
                inputs: stringifiedInputs,
                outputs: stringifiedOutputs,
                reference_outputs: stringifiedReferenceOutputs,
                ...stringifiedRest,
            });
            messages = formattedPrompt.messages;
        }
        else {
            // Function prompts receive the raw (non-stringified) values.
            messages = await prompt({
                inputs,
                outputs,
                reference_outputs: referenceOutputs,
                ...rest,
            });
        }
        // Prepend the system message (only valid for string prompts, checked
        // above).
        if (system) {
            messages = [{ role: "system", content: system }, ...messages];
        }
        let normalizedMessages = (0, utils_js_1._normalizeToOpenAIMessagesList)(messages);
        if (fewShotExamples) {
            normalizedMessages = appendFewShotExamples({
                messages: normalizedMessages,
                fewShotExamples,
            });
        }
        const [jsonSchema, description] = constructOutputSchema({
            schema,
            continuous,
            choices,
            useReasoning,
        });
        // Lazily instantiate a LangChain model when no judge was provided.
        if (!judge) {
            judge = await (0, universal_1.initChatModel)(model);
        }
        let response;
        if (_isBaseChatModel(judge)) {
            // LangChain path: structured output enforces the schema directly.
            response = await judge
                .withStructuredOutput({
                title: "score",
                description,
                ...jsonSchema,
            })
                .invoke(normalizedMessages);
            // With the default schema, unpack score (+ reasoning); with a custom
            // schema, return the raw structured response.
            if (schema === undefined) {
                if (useReasoning) {
                    return [response.score, response.reasoning];
                }
                return response.score;
            }
            else {
                return response;
            }
        }
        else {
            // OpenAI-client path: the judge is assumed to expose
            // `chat.completions.create`; a model name is mandatory here.
            if (!model) {
                throw new Error("`model` is required for non-LangChain clients");
            }
            // NOTE: shadows the outer `params` again — this is the completion
            // request payload.
            const params = {
                messages: normalizedMessages,
                model,
                response_format: {
                    type: "json_schema",
                    json_schema: {
                        name: "score",
                        strict: true,
                        schema: jsonSchema,
                    },
                },
            };
            // Wrap the call in `traceable` so it is logged as an LLM run in
            // LangSmith.
            const invokeLlm = (0, traceable_1.traceable)(judge.chat.completions.create.bind(judge.chat.completions), {
                metadata: {
                    ls_provider: "openai",
                    ls_model_name: model,
                    ls_model_type: "chat",
                },
                run_type: "llm",
                name: "OpenAI Chat Completion",
            });
            const response = await invokeLlm(params);
            // The json_schema response_format constrains the content to valid
            // JSON matching the schema.
            const parsed = JSON.parse(response.choices[0].message.content);
            if (schema === undefined) {
                if (useReasoning) {
                    return [parsed.score, parsed.reasoning];
                }
                return parsed.score;
            }
            return parsed;
        }
    };
    return getScore;
};
exports._createLLMAsJudgeScorer = _createLLMAsJudgeScorer;
|
|
229
|
+
/**
 * Create an evaluator that uses an LLM to assess output quality based on specified criteria.
 *
 * @param params Configuration object with the following properties:
 * @param params.prompt The evaluation prompt - can be a string template, LangChain prompt template,
 *                     or function that returns a list of chat messages
 * @param params.feedbackKey Key used to store the evaluation result, defaults to "score"
 * @param params.judge The LLM used for evaluation. Can be an OpenAI client or a LangChain model.
 *                    If using OpenAI client, must specify "model" parameter.
 *                    If omitted, "model" will be used to instantiate a LangChain model instance.
 * @param params.model Model identifier to use. Defaults to "openai:o3-mini".
 *                    If "judge" is an OpenAI client, this should be a model name directly.
 *                    If "judge" is omitted, must be a valid LangChain model identifier.
 * @param params.system Optional system message to prepend to the prompt
 * @param params.continuous If true, score will be a float between 0 and 1.
 *                         If false, score will be boolean. Defaults to false.
 * @param params.choices Optional list of specific float values the score must be chosen from
 * @param params.useReasoning If true, includes explanation for the score in the output.
 *                           Defaults to true.
 * @param params.fewShotExamples Optional list of example evaluations to append to the prompt
 *
 * @returns A function that takes inputs, outputs, reference_outputs, and other kwargs,
 *          formats them into a prompt, invokes the judge, and returns an evaluation result
 *
 * @example
 * ```typescript
 * import { createLLMAsJudge } from "openevals";
 *
 * const evaluator = createLLMAsJudge({
 *   prompt: "Rate the quality of this response from 0 to 1: {outputs}",
 *   continuous: true,
 * });
 * const result = await evaluator({
 *   inputs: { question: "What color is the sky?" },
 *   outputs: { response: "Blue" },
 * });
 * ```
 */
const createLLMAsJudge = ({ prompt, feedbackKey = "score", model = "openai:o3-mini", system, judge, continuous = false, choices, useReasoning = true, fewShotExamples, }) => {
    const scorer = (0, exports._createLLMAsJudgeScorer)({
        prompt,
        judge,
        model,
        system,
        continuous,
        choices,
        useReasoning,
        fewShotExamples,
    });
    const _wrappedEvaluator = async (inputs) => {
        // Fix: the condition was inverted (`!==`), which gave custom feedback
        // keys the generic "llm_as_judge" run name and the default "score" key
        // the name "llm_as_score_judge". The default key should map to the
        // generic name; custom keys get a key-specific name.
        const runName = feedbackKey === "score" ? "llm_as_judge" : `llm_as_${feedbackKey}_judge`;
        return (0, utils_js_1._runEvaluator)(runName, scorer, feedbackKey, inputs);
    };
    return _wrappedEvaluator;
};
exports.createLLMAsJudge = createLLMAsJudge;
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
import { RunnableInterface } from "@langchain/core/runnables";
import { BaseChatModel } from "@langchain/core/language_models/chat_models";
import { ChatCompletionMessage, FewShotExample, ModelClient, SingleResultScorerReturnType } from "./types.js";
/**
 * Internal factory that builds the raw scoring function used by
 * `createLLMAsJudge`. Exposed for advanced composition; prefer
 * `createLLMAsJudge` for normal use.
 */
export declare const _createLLMAsJudgeScorer: (params: {
    prompt: string | RunnableInterface<any, any, import("@langchain/core/runnables").RunnableConfig<Record<string, any>>> | ((...args: unknown[]) => ChatCompletionMessage[] | Promise<ChatCompletionMessage[]>);
    system?: string | undefined;
    schema?: Record<string, unknown> | undefined;
    judge?: ModelClient | BaseChatModel<import("@langchain/core/language_models/chat_models").BaseChatModelCallOptions, import("@langchain/core/messages").AIMessageChunk> | undefined;
    model?: string | undefined;
    continuous?: boolean | undefined;
    choices?: number[] | undefined;
    useReasoning?: boolean | undefined;
    fewShotExamples?: FewShotExample[] | undefined;
}) => (params: {
    [key: string]: unknown;
    inputs: unknown;
    outputs: unknown;
    referenceOutputs?: unknown;
}) => Promise<SingleResultScorerReturnType>;
/**
 * Create an evaluator that uses an LLM to assess output quality based on specified criteria.
 *
 * @param params Configuration object with the following properties:
 * @param params.prompt The evaluation prompt - can be a string template, LangChain prompt template,
 *                     or function that returns a list of chat messages
 * @param params.feedbackKey Key used to store the evaluation result, defaults to "score"
 * @param params.judge The LLM used for evaluation. Can be an OpenAI client or a LangChain model.
 *                    If using OpenAI client, must specify "model" parameter.
 *                    If omitted, "model" will be used to instantiate a LangChain model instance.
 * @param params.model Model identifier to use. Defaults to "openai:o3-mini".
 *                    If "judge" is an OpenAI client, this should be a model name directly.
 *                    If "judge" is omitted, must be a valid LangChain model identifier.
 * @param params.system Optional system message to prepend to the prompt
 * @param params.continuous If true, score will be a float between 0 and 1.
 *                         If false, score will be boolean. Defaults to false.
 * @param params.choices Optional list of specific float values the score must be chosen from
 * @param params.useReasoning If true, includes explanation for the score in the output.
 *                           Defaults to true.
 * @param params.fewShotExamples Optional list of example evaluations to append to the prompt
 *
 * @returns A function that takes inputs, outputs, reference_outputs, and other kwargs,
 *          formats them into a prompt, invokes the judge, and returns an evaluation result
 *
 * @example
 * ```typescript
 * import { createLLMAsJudge } from "openevals";
 *
 * const evaluator = createLLMAsJudge({
 *   prompt: "Rate the quality of this response from 0 to 1: {outputs}",
 *   continuous: true,
 * });
 * const result = await evaluator({
 *   inputs: { question: "What color is the sky?" },
 *   outputs: { response: "Blue" },
 * });
 * ```
 */
export declare const createLLMAsJudge: ({ prompt, feedbackKey, model, system, judge, continuous, choices, useReasoning, fewShotExamples, }: {
    prompt: string | RunnableInterface<any, any, import("@langchain/core/runnables").RunnableConfig<Record<string, any>>> | ((...args: unknown[]) => ChatCompletionMessage[] | Promise<ChatCompletionMessage[]>);
    feedbackKey?: string | undefined;
    model?: string | undefined;
    system?: string | undefined;
    judge?: ModelClient | BaseChatModel<import("@langchain/core/language_models/chat_models").BaseChatModelCallOptions, import("@langchain/core/messages").AIMessageChunk> | undefined;
    continuous?: boolean | undefined;
    choices?: number[] | undefined;
    useReasoning?: boolean | undefined;
    fewShotExamples?: FewShotExample[] | undefined;
}) => (inputs: {
    [key: string]: unknown;
    inputs: unknown;
    outputs: unknown;
    referenceOutputs?: unknown;
}) => Promise<import("langsmith/vitest").SimpleEvaluationResult>;
|