agentevals 0.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +19 -0
- package/README.md +1 -0
- package/dist/evaluators/exact.cjs +23 -0
- package/dist/evaluators/exact.d.ts +10 -0
- package/dist/evaluators/exact.js +19 -0
- package/dist/evaluators/llm.cjs +284 -0
- package/dist/evaluators/llm.d.ts +73 -0
- package/dist/evaluators/llm.js +279 -0
- package/dist/evaluators/prompts/conciseness.cjs +42 -0
- package/dist/evaluators/prompts/conciseness.d.ts +1 -0
- package/dist/evaluators/prompts/conciseness.js +39 -0
- package/dist/evaluators/prompts/correctness.cjs +46 -0
- package/dist/evaluators/prompts/correctness.d.ts +1 -0
- package/dist/evaluators/prompts/correctness.js +43 -0
- package/dist/evaluators/prompts/hallucination.cjs +46 -0
- package/dist/evaluators/prompts/hallucination.d.ts +1 -0
- package/dist/evaluators/prompts/hallucination.js +43 -0
- package/dist/evaluators/string/embedding_similarity.cjs +49 -0
- package/dist/evaluators/string/embedding_similarity.d.ts +18 -0
- package/dist/evaluators/string/embedding_similarity.js +45 -0
- package/dist/evaluators/string/levenshtein.cjs +57 -0
- package/dist/evaluators/string/levenshtein.d.ts +11 -0
- package/dist/evaluators/string/levenshtein.js +53 -0
- package/dist/evaluators/trajectory/llm.cjs +86 -0
- package/dist/evaluators/trajectory/llm.d.ts +49 -0
- package/dist/evaluators/trajectory/llm.js +82 -0
- package/dist/evaluators/trajectory/strict.cjs +58 -0
- package/dist/evaluators/trajectory/strict.d.ts +10 -0
- package/dist/evaluators/trajectory/strict.js +54 -0
- package/dist/evaluators/trajectory/subset.cjs +32 -0
- package/dist/evaluators/trajectory/subset.d.ts +23 -0
- package/dist/evaluators/trajectory/subset.js +28 -0
- package/dist/evaluators/trajectory/superset.cjs +32 -0
- package/dist/evaluators/trajectory/superset.d.ts +23 -0
- package/dist/evaluators/trajectory/superset.js +28 -0
- package/dist/evaluators/trajectory/unordered.cjs +33 -0
- package/dist/evaluators/trajectory/unordered.d.ts +23 -0
- package/dist/evaluators/trajectory/unordered.js +29 -0
- package/dist/evaluators/trajectory/utils.cjs +68 -0
- package/dist/evaluators/trajectory/utils.d.ts +3 -0
- package/dist/evaluators/trajectory/utils.js +63 -0
- package/dist/evaluators/types.cjs +2 -0
- package/dist/evaluators/types.d.ts +44 -0
- package/dist/evaluators/types.js +1 -0
- package/dist/evaluators/utils.cjs +85 -0
- package/dist/evaluators/utils.d.ts +13 -0
- package/dist/evaluators/utils.js +78 -0
- package/dist/index.cjs +43 -0
- package/dist/index.d.ts +13 -0
- package/dist/index.js +13 -0
- package/index.cjs +1 -0
- package/index.d.cts +1 -0
- package/index.d.ts +1 -0
- package/index.js +1 -0
- package/package.json +60 -0
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.trajectoryUnorderedMatch = void 0;
|
|
4
|
+
const utils_js_1 = require("../utils.cjs");
|
|
5
|
+
const utils_js_2 = require("./utils.cjs");
|
|
6
|
+
/**
 * Evaluate whether an agent trajectory calls the same tools, the same number of
 * times, as a reference trajectory, regardless of the order of the calls.
 *
 * The comparison is by tool name only: the match holds when each trajectory is a
 * superset of the other according to `_isTrajectorySuperset`.
 *
 * @param params - The parameters for trajectory unordered match evaluation
 * @param params.outputs - Actual trajectory the agent followed.
 *    May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing
 *    a "messages" key with one of the above.
 * @param params.referenceOutputs - Ideal reference trajectory the agent should have followed.
 *    May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing
 *    a "messages" key with one of the above.
 * @returns The result of `_runEvaluator` for the "trajectory_unordered_match" key; its score is
 *    true when both trajectories use the same tools the same number of times, false otherwise
 * @throws Error (as a rejected promise) when `outputs` or `referenceOutputs` is null or undefined
 */
async function trajectoryUnorderedMatch(params) {
    const { outputs, referenceOutputs } = params;
    const getScore = async () => {
        // Validate before normalizing: the normalizer applies the `in` operator to
        // non-array input, which throws an unrelated TypeError for null/undefined
        // and would make this guard unreachable.
        if (outputs == null || referenceOutputs == null) {
            throw new Error("Trajectory unordered match requires both outputs and reference_outputs");
        }
        const outputsList = (0, utils_js_1._normalizeToOpenAIMessagesList)(outputs);
        const referenceOutputsList = (0, utils_js_1._normalizeToOpenAIMessagesList)(referenceOutputs);
        // Superset in both directions means the per-tool call counts are equal.
        return ((0, utils_js_2._isTrajectorySuperset)(outputsList, referenceOutputsList) &&
            (0, utils_js_2._isTrajectorySuperset)(referenceOutputsList, outputsList));
    };
    return (0, utils_js_1._runEvaluator)("trajectory_unordered_match", getScore, "trajectory_unordered_match", params);
}
|
|
33
|
+
exports.trajectoryUnorderedMatch = trajectoryUnorderedMatch;
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
import { BaseMessage } from "@langchain/core/messages";
|
|
2
|
+
import { ChatCompletionMessage, EvaluatorResult } from "../types.js";
|
|
3
|
+
/**
|
|
4
|
+
* Evaluate whether an agent trajectory calls the same tools, the same number of times, as a reference trajectory, regardless of order.
|
|
5
|
+
* This accounts for some differences in an LLM's reasoning process in a case-by-case basis.
|
|
6
|
+
*
|
|
7
|
+
* @param params - The parameters for trajectory unordered match evaluation
|
|
8
|
+
* @param params.outputs - Actual trajectory the agent followed.
|
|
9
|
+
* May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing
|
|
10
|
+
* a "messages" key with one of the above.
|
|
11
|
+
* @param params.referenceOutputs - Ideal reference trajectory the agent should have followed.
|
|
12
|
+
* May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing
|
|
13
|
+
* a "messages" key with one of the above.
|
|
14
|
+
* @returns EvaluatorResult containing a score of true if trajectory (including called tools) matches, false otherwise
|
|
15
|
+
*/
|
|
16
|
+
export declare function trajectoryUnorderedMatch(params: {
|
|
17
|
+
outputs: ChatCompletionMessage[] | BaseMessage[] | {
|
|
18
|
+
messages: (BaseMessage | ChatCompletionMessage)[];
|
|
19
|
+
};
|
|
20
|
+
referenceOutputs: ChatCompletionMessage[] | BaseMessage[] | {
|
|
21
|
+
messages: (BaseMessage | ChatCompletionMessage)[];
|
|
22
|
+
};
|
|
23
|
+
}): Promise<EvaluatorResult>;
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
import { _normalizeToOpenAIMessagesList, _runEvaluator } from "../utils.js";
|
|
2
|
+
import { _isTrajectorySuperset } from "./utils.js";
|
|
3
|
+
/**
 * Evaluate whether an agent trajectory calls the same tools, the same number of
 * times, as a reference trajectory, regardless of the order of the calls.
 *
 * The comparison is by tool name only: the match holds when each trajectory is a
 * superset of the other according to `_isTrajectorySuperset`.
 *
 * @param params - The parameters for trajectory unordered match evaluation
 * @param params.outputs - Actual trajectory the agent followed.
 *    May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing
 *    a "messages" key with one of the above.
 * @param params.referenceOutputs - Ideal reference trajectory the agent should have followed.
 *    May be a list of OpenAI messages, a list of LangChain messages, or a dictionary containing
 *    a "messages" key with one of the above.
 * @returns The result of `_runEvaluator` for the "trajectory_unordered_match" key; its score is
 *    true when both trajectories use the same tools the same number of times, false otherwise
 * @throws Error (as a rejected promise) when `outputs` or `referenceOutputs` is null or undefined
 */
export async function trajectoryUnorderedMatch(params) {
    const { outputs, referenceOutputs } = params;
    const getScore = async () => {
        // Validate before normalizing: the normalizer applies the `in` operator to
        // non-array input, which throws an unrelated TypeError for null/undefined
        // and would make this guard unreachable.
        if (outputs == null || referenceOutputs == null) {
            throw new Error("Trajectory unordered match requires both outputs and reference_outputs");
        }
        const outputsList = _normalizeToOpenAIMessagesList(outputs);
        const referenceOutputsList = _normalizeToOpenAIMessagesList(referenceOutputs);
        // Superset in both directions means the per-tool call counts are equal.
        return (_isTrajectorySuperset(outputsList, referenceOutputsList) &&
            _isTrajectorySuperset(referenceOutputsList, outputsList));
    };
    return _runEvaluator("trajectory_unordered_match", getScore, "trajectory_unordered_match", params);
}
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports._chatCompletionMessagesToString = exports._isTrajectorySuperset = void 0;
|
|
4
|
+
/**
 * Reduce a tool call to a flat `{ name, args }` record.
 *
 * A call whose `function` property is a non-null object is flattened from that
 * nested object (`function.name` / `function.arguments`); any other call is
 * returned as-is.
 *
 * @param toolCall - Tool call object in either shape.
 * @returns A new `{ name, args }` object, or the original `toolCall`.
 */
function _normalizeToolCall(toolCall) {
    const nested = "function" in toolCall ? toolCall.function : undefined;
    if (nested == null || typeof nested !== "object") {
        return toolCall;
    }
    return { name: nested.name, args: nested.arguments };
}
|
|
15
|
+
/**
 * Collect every tool call found on the given messages, in message order,
 * normalizing each one with `_normalizeToolCall`.
 *
 * @param messages - Messages to scan; those without a truthy `tool_calls` are skipped.
 * @returns A flat array of normalized tool calls.
 */
function _extractToolCalls(messages) {
    const collected = [];
    for (const message of messages) {
        if (!message.tool_calls) {
            continue;
        }
        for (const normalized of message.tool_calls.map(_normalizeToolCall)) {
            collected.push(normalized);
        }
    }
    return collected;
}
|
|
24
|
+
/**
 * Check whether `outputs` calls every tool at least as many times as
 * `referenceOutputs` does. Tool calls are compared by name only.
 *
 * @param outputs - Messages of the trajectory being checked.
 * @param referenceOutputs - Messages of the reference trajectory.
 * @returns true when no tool name is called fewer times in `outputs` than in
 *    `referenceOutputs`, false otherwise.
 */
function _isTrajectorySuperset(outputs, referenceOutputs) {
    // Count how many times each tool name is called across a message list.
    const tallyByName = (messages) => {
        const tally = new Map();
        for (const call of _extractToolCalls(messages)) {
            tally.set(call.name, (tally.get(call.name) ?? 0) + 1);
        }
        return tally;
    };
    const actualCounts = tallyByName(outputs);
    const requiredCounts = tallyByName(referenceOutputs);
    // Only names present in the reference can fall short; a name that appears
    // solely in the outputs is compared against zero and always passes.
    for (const [name, required] of requiredCounts) {
        if ((actualCounts.get(name) ?? 0) < required) {
            return false;
        }
    }
    return true;
}
|
|
46
|
+
exports._isTrajectorySuperset = _isTrajectorySuperset;
|
|
47
|
+
/**
 * Render a list of chat messages as an XML-like transcript string.
 *
 * Each message becomes `<role>\n...\n</role>` and messages are joined by a blank
 * line. Tool calls are appended to the message content as `<tool_call>` elements,
 * and a message carrying a `tool_call_id` has its content wrapped in a
 * `<tool_result>` element.
 *
 * Tool calls may use either the nested shape (`{ function: { name, arguments } }`)
 * or the flat shape (`{ name, args }`) that `_normalizeToolCall` also accepts.
 * Object-valued arguments are JSON-encoded.
 *
 * @param messages - Messages to render.
 * @returns The rendered transcript; an empty string for an empty list.
 */
function _chatCompletionMessagesToString(messages) {
    // Interpolating an object would yield "[object Object]", so JSON-encode it.
    function formatArguments(args) {
        if (args == null) {
            return "";
        }
        return typeof args === "object" ? JSON.stringify(args) : `${args}`;
    }
    function formatToolCall(call) {
        const nested = call.function != null && typeof call.function === "object"
            ? call.function
            : undefined;
        // Prefer the nested fields, then fall back to the flat ones.
        const name = nested?.name ?? call.name ?? "";
        const args = nested?.arguments ?? call.args;
        return `<tool_call>\n<name>${name}</name>\n<arguments>${formatArguments(args)}</arguments>\n</tool_call>`;
    }
    function formatMessage(message) {
        let content = message.content ?? "";
        // Handle tool/function calls
        if (message.tool_calls) {
            const toolCallsStr = message.tool_calls
                .map((call) => formatToolCall(call))
                .join("\n");
            content = content ? `${content}\n${toolCallsStr}` : toolCallsStr;
        }
        // Handle tool call results
        if (message.tool_call_id) {
            content = `<tool_result>\n<id>${message.tool_call_id}</id>\n<content>${content}</content>\n</tool_result>`;
        }
        return `<${message.role ?? ""}>\n${content}\n</${message.role ?? ""}>`;
    }
    return messages.map(formatMessage).join("\n\n");
}
|
|
68
|
+
exports._chatCompletionMessagesToString = _chatCompletionMessagesToString;
|
|
@@ -0,0 +1,3 @@
|
|
|
1
|
+
import { ChatCompletionMessage } from "../types.js";
|
|
2
|
+
export declare function _isTrajectorySuperset(outputs: ChatCompletionMessage[], referenceOutputs: ChatCompletionMessage[]): boolean;
|
|
3
|
+
export declare function _chatCompletionMessagesToString(messages: ChatCompletionMessage[]): string;
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
/**
 * Reduce a tool call to a flat `{ name, args }` record.
 *
 * A call whose `function` property is a non-null object is flattened from that
 * nested object (`function.name` / `function.arguments`); any other call is
 * returned as-is.
 *
 * @param toolCall - Tool call object in either shape.
 * @returns A new `{ name, args }` object, or the original `toolCall`.
 */
function _normalizeToolCall(toolCall) {
    const nested = "function" in toolCall ? toolCall.function : undefined;
    if (nested == null || typeof nested !== "object") {
        return toolCall;
    }
    return { name: nested.name, args: nested.arguments };
}
|
|
12
|
+
/**
 * Collect every tool call found on the given messages, in message order,
 * normalizing each one with `_normalizeToolCall`.
 *
 * @param messages - Messages to scan; those without a truthy `tool_calls` are skipped.
 * @returns A flat array of normalized tool calls.
 */
function _extractToolCalls(messages) {
    const collected = [];
    for (const message of messages) {
        if (!message.tool_calls) {
            continue;
        }
        for (const normalized of message.tool_calls.map(_normalizeToolCall)) {
            collected.push(normalized);
        }
    }
    return collected;
}
|
|
21
|
+
/**
 * Check whether `outputs` calls every tool at least as many times as
 * `referenceOutputs` does. Tool calls are compared by name only.
 *
 * @param outputs - Messages of the trajectory being checked.
 * @param referenceOutputs - Messages of the reference trajectory.
 * @returns true when no tool name is called fewer times in `outputs` than in
 *    `referenceOutputs`, false otherwise.
 */
export function _isTrajectorySuperset(outputs, referenceOutputs) {
    // Count how many times each tool name is called across a message list.
    const tallyByName = (messages) => {
        const tally = new Map();
        for (const call of _extractToolCalls(messages)) {
            tally.set(call.name, (tally.get(call.name) ?? 0) + 1);
        }
        return tally;
    };
    const actualCounts = tallyByName(outputs);
    const requiredCounts = tallyByName(referenceOutputs);
    // Only names present in the reference can fall short; a name that appears
    // solely in the outputs is compared against zero and always passes.
    for (const [name, required] of requiredCounts) {
        if ((actualCounts.get(name) ?? 0) < required) {
            return false;
        }
    }
    return true;
}
|
|
43
|
+
/**
 * Render a list of chat messages as an XML-like transcript string.
 *
 * Each message becomes `<role>\n...\n</role>` and messages are joined by a blank
 * line. Tool calls are appended to the message content as `<tool_call>` elements,
 * and a message carrying a `tool_call_id` has its content wrapped in a
 * `<tool_result>` element.
 *
 * Tool calls may use either the nested shape (`{ function: { name, arguments } }`)
 * or the flat shape (`{ name, args }`) that `_normalizeToolCall` also accepts.
 * Object-valued arguments are JSON-encoded.
 *
 * @param messages - Messages to render.
 * @returns The rendered transcript; an empty string for an empty list.
 */
export function _chatCompletionMessagesToString(messages) {
    // Interpolating an object would yield "[object Object]", so JSON-encode it.
    function formatArguments(args) {
        if (args == null) {
            return "";
        }
        return typeof args === "object" ? JSON.stringify(args) : `${args}`;
    }
    function formatToolCall(call) {
        const nested = call.function != null && typeof call.function === "object"
            ? call.function
            : undefined;
        // Prefer the nested fields, then fall back to the flat ones.
        const name = nested?.name ?? call.name ?? "";
        const args = nested?.arguments ?? call.args;
        return `<tool_call>\n<name>${name}</name>\n<arguments>${formatArguments(args)}</arguments>\n</tool_call>`;
    }
    function formatMessage(message) {
        let content = message.content ?? "";
        // Handle tool/function calls
        if (message.tool_calls) {
            const toolCallsStr = message.tool_calls
                .map((call) => formatToolCall(call))
                .join("\n");
            content = content ? `${content}\n${toolCallsStr}` : toolCallsStr;
        }
        // Handle tool call results
        if (message.tool_call_id) {
            content = `<tool_result>\n<id>${message.tool_call_id}</id>\n<content>${content}</content>\n</tool_result>`;
        }
        return `<${message.role ?? ""}>\n${content}\n</${message.role ?? ""}>`;
    }
    return messages.map(formatMessage).join("\n\n");
}
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
export type EvaluatorResult = {
|
|
2
|
+
key: string;
|
|
3
|
+
score: number | boolean;
|
|
4
|
+
comment?: string;
|
|
5
|
+
};
|
|
6
|
+
export type SimpleEvaluator = (params: {
|
|
7
|
+
inputs?: unknown;
|
|
8
|
+
outputs: unknown;
|
|
9
|
+
reference_outputs?: unknown;
|
|
10
|
+
[key: string]: unknown;
|
|
11
|
+
}) => Promise<EvaluatorResult | EvaluatorResult[]> | EvaluatorResult | EvaluatorResult[];
|
|
12
|
+
export type SingleResultScorerReturnType = boolean | number | [boolean | number, string];
|
|
13
|
+
export type MultiResultScorerReturnType = {
|
|
14
|
+
[key: string]: boolean | number | {
|
|
15
|
+
score: boolean | number;
|
|
16
|
+
reasoning?: string;
|
|
17
|
+
};
|
|
18
|
+
};
|
|
19
|
+
export type ChatCompletionMessage = {
|
|
20
|
+
content: any;
|
|
21
|
+
role: string;
|
|
22
|
+
tool_calls?: Record<string, any>[];
|
|
23
|
+
tool_call_id?: string;
|
|
24
|
+
};
|
|
25
|
+
export type ChatCompletion = {
|
|
26
|
+
choices: {
|
|
27
|
+
message: ChatCompletionMessage;
|
|
28
|
+
}[];
|
|
29
|
+
};
|
|
30
|
+
export type FewShotExample = {
|
|
31
|
+
inputs: unknown;
|
|
32
|
+
outputs: unknown;
|
|
33
|
+
score: number | boolean;
|
|
34
|
+
reasoning?: string;
|
|
35
|
+
};
|
|
36
|
+
export interface ChatCompletionsClient {
|
|
37
|
+
create(params: Record<string, any>): Promise<ChatCompletion>;
|
|
38
|
+
}
|
|
39
|
+
export interface ModelChatClient {
|
|
40
|
+
completions: ChatCompletionsClient;
|
|
41
|
+
}
|
|
42
|
+
export interface ModelClient {
|
|
43
|
+
chat: ModelChatClient;
|
|
44
|
+
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports._runEvaluator = exports.processScore = exports._normalizeToOpenAIMessagesList = exports._convertToOpenAIMessage = void 0;
|
|
4
|
+
const messages_1 = require("@langchain/core/messages");
|
|
5
|
+
const openai_1 = require("@langchain/openai");
|
|
6
|
+
const jestlike_1 = require("langsmith/utils/jestlike");
|
|
7
|
+
/**
 * Convert a single message to the OpenAI message format.
 *
 * A value recognized by `isBaseMessage` is run through
 * `_convertMessagesToOpenAIParams`; anything else is returned unchanged.
 *
 * @param message - A LangChain message or an already OpenAI-style message.
 * @returns The converted message, or `message` itself.
 */
const _convertToOpenAIMessage = (message) => (0, messages_1.isBaseMessage)(message)
    ? (0, openai_1._convertMessagesToOpenAIParams)([message])[0]
    : message;
|
|
15
|
+
exports._convertToOpenAIMessage = _convertToOpenAIMessage;
|
|
16
|
+
/**
 * Normalize a trajectory input into a list of OpenAI-format messages.
 *
 * Accepts either an array of messages or an object whose `messages` property is
 * such an array; each element is passed through `_convertToOpenAIMessage`.
 *
 * @param messages - Array of messages, or an object with a "messages" array.
 * @returns A new array with every message converted.
 * @throws Error when `messages` is neither an array nor an object, or is an
 *    object without a "messages" array.
 */
const _normalizeToOpenAIMessagesList = (messages) => {
    let messagesList;
    if (Array.isArray(messages)) {
        messagesList = messages;
    }
    else if (messages !== null &&
        (typeof messages === "object" || typeof messages === "function")) {
        if ("messages" in messages && Array.isArray(messages.messages)) {
            messagesList = messages.messages;
        }
        else {
            throw new Error(`If passing messages as an object, it must contain a "messages" key`);
        }
    }
    else {
        // The `in` operator throws a TypeError on null, undefined and primitives,
        // so reject those explicitly with a descriptive error instead.
        throw new Error(`Expected messages to be an array or an object containing a "messages" key`);
    }
    return messagesList.map(exports._convertToOpenAIMessage);
};
|
|
31
|
+
exports._normalizeToOpenAIMessagesList = _normalizeToOpenAIMessagesList;
|
|
32
|
+
/**
 * Split one scorer value into a `[score, reasoning]` tuple.
 *
 * Non-object values are returned as a one-element tuple. Objects must have a
 * `score` property; `reasoning` is kept only when it is a string.
 *
 * @param _ - Feedback key (unused).
 * @param value - A raw score, or an object with `score` and optional `reasoning`.
 * @returns `[value]` for non-objects, otherwise `[score, reasoning | undefined]`.
 * @throws Error when `value` is null or an object without a "score" key.
 */
const processScore = (_, value) => {
    if (typeof value !== "object") {
        return [value];
    }
    if (value == null || !("score" in value)) {
        throw new Error(`Expected a dictionary with a "score" key, but got "${JSON.stringify(value, null, 2)}"`);
    }
    const score = value.score;
    const hasTextReasoning = "reasoning" in value && typeof value.reasoning === "string";
    return [score, hasTextReasoning ? value.reasoning : undefined];
};
|
|
48
|
+
exports.processScore = processScore;
|
|
49
|
+
/**
 * Run a scorer and shape its return value into evaluator result objects.
 *
 * The scorer may return a bare score, a `[score, reasoning]` array, or an object
 * mapping feedback keys to scores. When `isInTestContext()` is true the scorer is
 * invoked through `wrapEvaluator` under the name `runName`; otherwise it is called
 * directly.
 *
 * @param runName - Name passed to `wrapEvaluator`.
 * @param scorer - Function producing the score(s); may be async.
 * @param feedbackKey - Key used for bare and array scores.
 * @param extra - Arguments forwarded to the scorer (defaults to `{}`).
 * @returns A single `{ key, score, comment }` object when exactly one result was
 *    produced (including a keyed object with a single entry), otherwise the array
 *    of results.
 */
const _runEvaluator = async (runName, scorer, feedbackKey, extra) => {
    const runScorer = async (params) => {
        const raw = await scorer(params);
        let results;
        if (Array.isArray(raw)) {
            const [score, comment] = raw;
            results = [{ key: feedbackKey, score, comment }];
        }
        else if (typeof raw === "object") {
            results = Object.entries(raw).map(([key, value]) => {
                const [score, comment] = (0, exports.processScore)(key, value);
                return { key, score, comment };
            });
        }
        else {
            results = [{ key: feedbackKey, score: raw, comment: undefined }];
        }
        return results.length === 1 ? results[0] : results;
    };
    const scorerArgs = extra ?? {};
    if ((0, jestlike_1.isInTestContext)()) {
        return await (0, jestlike_1.wrapEvaluator)(runScorer)(scorerArgs, { name: runName });
    }
    return await runScorer(scorerArgs);
};
|
|
85
|
+
exports._runEvaluator = _runEvaluator;
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import { BaseMessage } from "@langchain/core/messages";
|
|
2
|
+
import { SimpleEvaluationResult } from "langsmith/utils/jestlike";
|
|
3
|
+
import { ChatCompletionMessage, MultiResultScorerReturnType, SingleResultScorerReturnType } from "./types.js";
|
|
4
|
+
export declare const _convertToOpenAIMessage: (message: BaseMessage | ChatCompletionMessage) => ChatCompletionMessage;
|
|
5
|
+
export declare const _normalizeToOpenAIMessagesList: (messages: (ChatCompletionMessage | BaseMessage)[] | {
|
|
6
|
+
messages: (BaseMessage | ChatCompletionMessage)[];
|
|
7
|
+
}) => ChatCompletionMessage[];
|
|
8
|
+
export declare const processScore: (_: string, value: boolean | number | {
|
|
9
|
+
score: boolean | number;
|
|
10
|
+
reasoning?: string;
|
|
11
|
+
}) => readonly [number | boolean, string | undefined] | readonly [number | boolean];
|
|
12
|
+
export type EvaluationResultType<O> = O extends MultiResultScorerReturnType ? SimpleEvaluationResult[] : SimpleEvaluationResult;
|
|
13
|
+
export declare const _runEvaluator: <T extends Record<string, unknown>, O extends SingleResultScorerReturnType | MultiResultScorerReturnType | Promise<SingleResultScorerReturnType | MultiResultScorerReturnType>>(runName: string, scorer: (params: T) => O, feedbackKey: string, extra?: T | undefined) => Promise<EvaluationResultType<O>>;
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
import { isBaseMessage } from "@langchain/core/messages";
|
|
2
|
+
import { _convertMessagesToOpenAIParams } from "@langchain/openai";
|
|
3
|
+
import { wrapEvaluator, isInTestContext, } from "langsmith/utils/jestlike";
|
|
4
|
+
/**
 * Convert a single message to the OpenAI message format.
 *
 * A value recognized by `isBaseMessage` is run through
 * `_convertMessagesToOpenAIParams`; anything else is returned unchanged.
 *
 * @param message - A LangChain message or an already OpenAI-style message.
 * @returns The converted message, or `message` itself.
 */
export const _convertToOpenAIMessage = (message) => isBaseMessage(message)
    ? _convertMessagesToOpenAIParams([message])[0]
    : message;
|
|
12
|
+
/**
 * Normalize a trajectory input into a list of OpenAI-format messages.
 *
 * Accepts either an array of messages or an object whose `messages` property is
 * such an array; each element is passed through `_convertToOpenAIMessage`.
 *
 * @param messages - Array of messages, or an object with a "messages" array.
 * @returns A new array with every message converted.
 * @throws Error when `messages` is neither an array nor an object, or is an
 *    object without a "messages" array.
 */
export const _normalizeToOpenAIMessagesList = (messages) => {
    let messagesList;
    if (Array.isArray(messages)) {
        messagesList = messages;
    }
    else if (messages !== null &&
        (typeof messages === "object" || typeof messages === "function")) {
        if ("messages" in messages && Array.isArray(messages.messages)) {
            messagesList = messages.messages;
        }
        else {
            throw new Error(`If passing messages as an object, it must contain a "messages" key`);
        }
    }
    else {
        // The `in` operator throws a TypeError on null, undefined and primitives,
        // so reject those explicitly with a descriptive error instead.
        throw new Error(`Expected messages to be an array or an object containing a "messages" key`);
    }
    return messagesList.map(_convertToOpenAIMessage);
};
|
|
27
|
+
/**
 * Split one scorer value into a `[score, reasoning]` tuple.
 *
 * Non-object values are returned as a one-element tuple. Objects must have a
 * `score` property; `reasoning` is kept only when it is a string.
 *
 * @param _ - Feedback key (unused).
 * @param value - A raw score, or an object with `score` and optional `reasoning`.
 * @returns `[value]` for non-objects, otherwise `[score, reasoning | undefined]`.
 * @throws Error when `value` is null or an object without a "score" key.
 */
export const processScore = (_, value) => {
    if (typeof value !== "object") {
        return [value];
    }
    if (value == null || !("score" in value)) {
        throw new Error(`Expected a dictionary with a "score" key, but got "${JSON.stringify(value, null, 2)}"`);
    }
    const score = value.score;
    const hasTextReasoning = "reasoning" in value && typeof value.reasoning === "string";
    return [score, hasTextReasoning ? value.reasoning : undefined];
};
|
|
43
|
+
/**
 * Run a scorer and shape its return value into evaluator result objects.
 *
 * The scorer may return a bare score, a `[score, reasoning]` array, or an object
 * mapping feedback keys to scores. When `isInTestContext()` is true the scorer is
 * invoked through `wrapEvaluator` under the name `runName`; otherwise it is called
 * directly.
 *
 * @param runName - Name passed to `wrapEvaluator`.
 * @param scorer - Function producing the score(s); may be async.
 * @param feedbackKey - Key used for bare and array scores.
 * @param extra - Arguments forwarded to the scorer (defaults to `{}`).
 * @returns A single `{ key, score, comment }` object when exactly one result was
 *    produced (including a keyed object with a single entry), otherwise the array
 *    of results.
 */
export const _runEvaluator = async (runName, scorer, feedbackKey, extra) => {
    const runScorer = async (params) => {
        const raw = await scorer(params);
        let results;
        if (Array.isArray(raw)) {
            const [score, comment] = raw;
            results = [{ key: feedbackKey, score, comment }];
        }
        else if (typeof raw === "object") {
            results = Object.entries(raw).map(([key, value]) => {
                const [score, comment] = processScore(key, value);
                return { key, score, comment };
            });
        }
        else {
            results = [{ key: feedbackKey, score: raw, comment: undefined }];
        }
        return results.length === 1 ? results[0] : results;
    };
    const scorerArgs = extra ?? {};
    if (isInTestContext()) {
        return await wrapEvaluator(runScorer)(scorerArgs, { name: runName });
    }
    return await runScorer(scorerArgs);
};
|
package/dist/index.cjs
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// Compiler-emitted helper: expose property `key` of `source` on `target` under
// `alias` (defaulting to `key`). Reuses an existing `this.__createBinding` when
// one is present.
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(target, source, key, alias) {
    if (alias === undefined) alias = key;
    var descriptor = Object.getOwnPropertyDescriptor(source, key);
    // Fall back to a forwarding getter when there is no descriptor, when an
    // accessor comes from a non-ES-module source, or when a data property is
    // writable or configurable.
    var needsGetter = !descriptor ||
        ("get" in descriptor ? !source.__esModule : descriptor.writable || descriptor.configurable);
    if (needsGetter) {
        descriptor = { enumerable: true, get: function() { return source[key]; } };
    }
    Object.defineProperty(target, alias, descriptor);
}) : (function(target, source, key, alias) {
    if (alias === undefined) alias = key;
    target[alias] = source[key];
}));
|
|
13
|
+
// Compiler-emitted helper: re-export every enumerable property of `m` onto
// `exports`, skipping "default" and names `exports` already owns.
var __exportStar = (this && this.__exportStar) || function(m, exports) {
    for (var name in m) {
        if (name === "default") continue;
        if (Object.prototype.hasOwnProperty.call(exports, name)) continue;
        __createBinding(exports, m, name);
    }
};
|
|
16
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
17
|
+
exports.CONCISENESS_PROMPT = exports.CORRECTNESS_PROMPT = exports.HALLUCINATION_PROMPT = exports.DEFAULT_PROMPT = exports.createTrajectoryLLMAsJudge = exports.trajectoryUnorderedMatch = exports.trajectorySuperset = exports.trajectorySubset = exports.trajectoryStrictMatch = exports.createLLMAsJudge = exports.levenshteinDistance = exports.createEmbeddingSimilarityEvaluator = exports.exactMatch = void 0;
|
|
18
|
+
var exact_js_1 = require("./evaluators/exact.cjs");
|
|
19
|
+
Object.defineProperty(exports, "exactMatch", { enumerable: true, get: function () { return exact_js_1.exactMatch; } });
|
|
20
|
+
var embedding_similarity_js_1 = require("./evaluators/string/embedding_similarity.cjs");
|
|
21
|
+
Object.defineProperty(exports, "createEmbeddingSimilarityEvaluator", { enumerable: true, get: function () { return embedding_similarity_js_1.createEmbeddingSimilarityEvaluator; } });
|
|
22
|
+
var levenshtein_js_1 = require("./evaluators/string/levenshtein.cjs");
|
|
23
|
+
Object.defineProperty(exports, "levenshteinDistance", { enumerable: true, get: function () { return levenshtein_js_1.levenshteinDistance; } });
|
|
24
|
+
var llm_js_1 = require("./evaluators/llm.cjs");
|
|
25
|
+
Object.defineProperty(exports, "createLLMAsJudge", { enumerable: true, get: function () { return llm_js_1.createLLMAsJudge; } });
|
|
26
|
+
var strict_js_1 = require("./evaluators/trajectory/strict.cjs");
|
|
27
|
+
Object.defineProperty(exports, "trajectoryStrictMatch", { enumerable: true, get: function () { return strict_js_1.trajectoryStrictMatch; } });
|
|
28
|
+
var subset_js_1 = require("./evaluators/trajectory/subset.cjs");
|
|
29
|
+
Object.defineProperty(exports, "trajectorySubset", { enumerable: true, get: function () { return subset_js_1.trajectorySubset; } });
|
|
30
|
+
var superset_js_1 = require("./evaluators/trajectory/superset.cjs");
|
|
31
|
+
Object.defineProperty(exports, "trajectorySuperset", { enumerable: true, get: function () { return superset_js_1.trajectorySuperset; } });
|
|
32
|
+
var unordered_js_1 = require("./evaluators/trajectory/unordered.cjs");
|
|
33
|
+
Object.defineProperty(exports, "trajectoryUnorderedMatch", { enumerable: true, get: function () { return unordered_js_1.trajectoryUnorderedMatch; } });
|
|
34
|
+
var llm_js_2 = require("./evaluators/trajectory/llm.cjs");
|
|
35
|
+
Object.defineProperty(exports, "createTrajectoryLLMAsJudge", { enumerable: true, get: function () { return llm_js_2.createTrajectoryLLMAsJudge; } });
|
|
36
|
+
Object.defineProperty(exports, "DEFAULT_PROMPT", { enumerable: true, get: function () { return llm_js_2.DEFAULT_PROMPT; } });
|
|
37
|
+
var hallucination_js_1 = require("./evaluators/prompts/hallucination.cjs");
|
|
38
|
+
Object.defineProperty(exports, "HALLUCINATION_PROMPT", { enumerable: true, get: function () { return hallucination_js_1.HALLUCINATION_PROMPT; } });
|
|
39
|
+
var correctness_js_1 = require("./evaluators/prompts/correctness.cjs");
|
|
40
|
+
Object.defineProperty(exports, "CORRECTNESS_PROMPT", { enumerable: true, get: function () { return correctness_js_1.CORRECTNESS_PROMPT; } });
|
|
41
|
+
var conciseness_js_1 = require("./evaluators/prompts/conciseness.cjs");
|
|
42
|
+
Object.defineProperty(exports, "CONCISENESS_PROMPT", { enumerable: true, get: function () { return conciseness_js_1.CONCISENESS_PROMPT; } });
|
|
43
|
+
__exportStar(require("./evaluators/types.cjs"), exports);
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
export { exactMatch } from "./evaluators/exact.js";
|
|
2
|
+
export { createEmbeddingSimilarityEvaluator } from "./evaluators/string/embedding_similarity.js";
|
|
3
|
+
export { levenshteinDistance } from "./evaluators/string/levenshtein.js";
|
|
4
|
+
export { createLLMAsJudge } from "./evaluators/llm.js";
|
|
5
|
+
export { trajectoryStrictMatch } from "./evaluators/trajectory/strict.js";
|
|
6
|
+
export { trajectorySubset } from "./evaluators/trajectory/subset.js";
|
|
7
|
+
export { trajectorySuperset } from "./evaluators/trajectory/superset.js";
|
|
8
|
+
export { trajectoryUnorderedMatch } from "./evaluators/trajectory/unordered.js";
|
|
9
|
+
export { createTrajectoryLLMAsJudge, DEFAULT_PROMPT, } from "./evaluators/trajectory/llm.js";
|
|
10
|
+
export { HALLUCINATION_PROMPT } from "./evaluators/prompts/hallucination.js";
|
|
11
|
+
export { CORRECTNESS_PROMPT } from "./evaluators/prompts/correctness.js";
|
|
12
|
+
export { CONCISENESS_PROMPT } from "./evaluators/prompts/conciseness.js";
|
|
13
|
+
export * from "./evaluators/types.js";
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
export { exactMatch } from "./evaluators/exact.js";
|
|
2
|
+
export { createEmbeddingSimilarityEvaluator } from "./evaluators/string/embedding_similarity.js";
|
|
3
|
+
export { levenshteinDistance } from "./evaluators/string/levenshtein.js";
|
|
4
|
+
export { createLLMAsJudge } from "./evaluators/llm.js";
|
|
5
|
+
export { trajectoryStrictMatch } from "./evaluators/trajectory/strict.js";
|
|
6
|
+
export { trajectorySubset } from "./evaluators/trajectory/subset.js";
|
|
7
|
+
export { trajectorySuperset } from "./evaluators/trajectory/superset.js";
|
|
8
|
+
export { trajectoryUnorderedMatch } from "./evaluators/trajectory/unordered.js";
|
|
9
|
+
export { createTrajectoryLLMAsJudge, DEFAULT_PROMPT, } from "./evaluators/trajectory/llm.js";
|
|
10
|
+
export { HALLUCINATION_PROMPT } from "./evaluators/prompts/hallucination.js";
|
|
11
|
+
export { CORRECTNESS_PROMPT } from "./evaluators/prompts/correctness.js";
|
|
12
|
+
export { CONCISENESS_PROMPT } from "./evaluators/prompts/conciseness.js";
|
|
13
|
+
export * from "./evaluators/types.js";
|
package/index.cjs
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
module.exports = require('./dist/index.cjs');
|
package/index.d.cts
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export * from './dist/index.js'
|
package/index.d.ts
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export * from './dist/index.js'
|
package/index.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export * from './dist/index.js'
|
package/package.json
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "agentevals",
|
|
3
|
+
"version": "0.0.0",
|
|
4
|
+
"packageManager": "yarn@4.3.1",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"scripts": {
|
|
7
|
+
"build": "yarn lc_build --create-entrypoints --pre --tree-shaking",
|
|
8
|
+
"lint:eslint": "NODE_OPTIONS=--max-old-space-size=4096 eslint --cache --ext .ts,.js src/",
|
|
9
|
+
"lint:dpdm": "dpdm --exit-code circular:1 --no-warning --no-tree src/*.ts src/**/*.ts",
|
|
10
|
+
"lint": "yarn lint:eslint && yarn lint:dpdm",
|
|
11
|
+
"lint:fix": "yarn lint:eslint --fix && yarn lint:dpdm",
|
|
12
|
+
"format": "prettier --config .prettierrc --write \"src\"",
|
|
13
|
+
"format:check": "prettier --config .prettierrc --check \"src\"",
|
|
14
|
+
"test": "vitest run"
|
|
15
|
+
},
|
|
16
|
+
"dependencies": {
|
|
17
|
+
"@langchain/openai": "^0.4.4",
|
|
18
|
+
"langchain": "^0.3.18",
|
|
19
|
+
"langsmith": "^0.3.10-rc.1"
|
|
20
|
+
},
|
|
21
|
+
"peerDependencies": {
|
|
22
|
+
"@langchain/core": "^0.3.40"
|
|
23
|
+
},
|
|
24
|
+
"devDependencies": {
|
|
25
|
+
"@langchain/core": "^0.3.40",
|
|
26
|
+
"@langchain/scripts": "0.1.3",
|
|
27
|
+
"@tsconfig/recommended": "^1.0.8",
|
|
28
|
+
"dotenv": "^16.4.7",
|
|
29
|
+
"eslint": "^8.33.0",
|
|
30
|
+
"eslint-config-airbnb-base": "^15.0.0",
|
|
31
|
+
"eslint-config-prettier": "^8.6.0",
|
|
32
|
+
"eslint-plugin-import": "^2.27.5",
|
|
33
|
+
"eslint-plugin-jest": "^27.6.0",
|
|
34
|
+
"eslint-plugin-no-instanceof": "^1.0.1",
|
|
35
|
+
"eslint-plugin-prettier": "^4.2.1",
|
|
36
|
+
"openai": "^4.85.1",
|
|
37
|
+
"prettier": "^3.5.1",
|
|
38
|
+
"typescript": "~5.1.6",
|
|
39
|
+
"vitest": "^3.0.5"
|
|
40
|
+
},
|
|
41
|
+
"files": [
|
|
42
|
+
"dist/",
|
|
43
|
+
"index.cjs",
|
|
44
|
+
"index.js",
|
|
45
|
+
"index.d.ts",
|
|
46
|
+
"index.d.cts"
|
|
47
|
+
],
|
|
48
|
+
"exports": {
|
|
49
|
+
".": {
|
|
50
|
+
"types": {
|
|
51
|
+
"import": "./index.d.ts",
|
|
52
|
+
"require": "./index.d.cts",
|
|
53
|
+
"default": "./index.d.ts"
|
|
54
|
+
},
|
|
55
|
+
"import": "./index.js",
|
|
56
|
+
"require": "./index.cjs"
|
|
57
|
+
},
|
|
58
|
+
"./package.json": "./package.json"
|
|
59
|
+
}
|
|
60
|
+
}
|