@dvina/agents 0.4.0 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-LEEZCLZM.mjs → chunk-NHWEEBN2.mjs} +65 -11
- package/dist/chunk-NHWEEBN2.mjs.map +1 -0
- package/dist/eval/index.d.mts +31 -23
- package/dist/eval/index.d.ts +31 -23
- package/dist/eval/index.js +180 -56
- package/dist/eval/index.js.map +1 -1
- package/dist/eval/index.mjs +139 -68
- package/dist/eval/index.mjs.map +1 -1
- package/dist/index.d.mts +4 -125
- package/dist/index.d.ts +4 -125
- package/dist/index.js +74 -42
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +7 -30
- package/dist/index.mjs.map +1 -1
- package/dist/model-resolver-U0J9x1a6.d.mts +158 -0
- package/dist/model-resolver-U0J9x1a6.d.ts +158 -0
- package/package.json +1 -1
- package/dist/chunk-LEEZCLZM.mjs.map +0 -1
- package/dist/model-resolver-BRAaBV9n.d.mts +0 -15
- package/dist/model-resolver-BRAaBV9n.d.ts +0 -15
|
@@ -71,18 +71,21 @@ var LangchainModelResolver = class {
|
|
|
71
71
|
tags
|
|
72
72
|
});
|
|
73
73
|
}
|
|
74
|
-
resolveAzure(
|
|
75
|
-
const
|
|
76
|
-
if (!
|
|
77
|
-
throw new Error(`
|
|
74
|
+
resolveAzure(resourceName, modelName, tags) {
|
|
75
|
+
const resource = this.config.azure?.[resourceName];
|
|
76
|
+
if (!resource) {
|
|
77
|
+
throw new Error(`Resource "${resourceName}" for provider "azure" is missing`);
|
|
78
|
+
}
|
|
79
|
+
const modelEntry = resource.models.find((m) => m.model === modelName);
|
|
80
|
+
if (!modelEntry) {
|
|
81
|
+
throw new Error(`Model "${modelName}" not found in Azure resource "${resourceName}"`);
|
|
78
82
|
}
|
|
79
83
|
return new AzureChatOpenAI({
|
|
80
|
-
model:
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
azureOpenAIApiVersion: providerConfig.apiVersion,
|
|
84
|
+
model: modelEntry.model,
|
|
85
|
+
azureOpenAIApiKey: resource.apiKey,
|
|
86
|
+
azureOpenAIApiInstanceName: this.extractInstanceName(resource.endpoint),
|
|
87
|
+
azureOpenAIApiDeploymentName: modelEntry.deploymentName,
|
|
88
|
+
azureOpenAIApiVersion: modelEntry.apiVersion,
|
|
86
89
|
tags
|
|
87
90
|
});
|
|
88
91
|
}
|
|
@@ -96,10 +99,61 @@ var LangchainModelResolver = class {
|
|
|
96
99
|
}
|
|
97
100
|
};
|
|
98
101
|
|
|
102
|
+
// src/runtime/langchain/utils.ts
|
|
103
|
+
import { AIMessage, HumanMessage, ToolMessage } from "langchain";
|
|
104
|
+
function convertToLangchainMessages(messages) {
|
|
105
|
+
const result = [];
|
|
106
|
+
let tcIdx = 0;
|
|
107
|
+
let pendingToolCallIds = [];
|
|
108
|
+
for (const msg of messages) {
|
|
109
|
+
if (msg.role === "human") {
|
|
110
|
+
result.push(
|
|
111
|
+
new HumanMessage({
|
|
112
|
+
content: msg.content.map((c) => {
|
|
113
|
+
if (c.type === "image") {
|
|
114
|
+
return { type: "image_url", image_url: { url: c.url } };
|
|
115
|
+
}
|
|
116
|
+
return c;
|
|
117
|
+
})
|
|
118
|
+
})
|
|
119
|
+
);
|
|
120
|
+
} else if (msg.role === "ai") {
|
|
121
|
+
if (msg.toolCalls && msg.toolCalls.length > 0) {
|
|
122
|
+
pendingToolCallIds = msg.toolCalls.map(() => `tc_${++tcIdx}`);
|
|
123
|
+
result.push(
|
|
124
|
+
new AIMessage({
|
|
125
|
+
content: msg.content,
|
|
126
|
+
tool_calls: msg.toolCalls.map((tc, i) => ({
|
|
127
|
+
id: pendingToolCallIds[i],
|
|
128
|
+
name: tc.name,
|
|
129
|
+
args: tc.input ? JSON.parse(tc.input) : {}
|
|
130
|
+
}))
|
|
131
|
+
})
|
|
132
|
+
);
|
|
133
|
+
} else {
|
|
134
|
+
result.push(new AIMessage(msg.content));
|
|
135
|
+
}
|
|
136
|
+
} else if (msg.role === "tool") {
|
|
137
|
+
const toolCallId = pendingToolCallIds.shift();
|
|
138
|
+
if (!toolCallId)
|
|
139
|
+
throw new Error(`ToolMessage for "${msg.name}" without a preceding AiMessage with toolCalls`);
|
|
140
|
+
result.push(
|
|
141
|
+
new ToolMessage({
|
|
142
|
+
content: msg.output,
|
|
143
|
+
tool_call_id: toolCallId,
|
|
144
|
+
name: msg.name
|
|
145
|
+
})
|
|
146
|
+
);
|
|
147
|
+
}
|
|
148
|
+
}
|
|
149
|
+
return result;
|
|
150
|
+
}
|
|
151
|
+
|
|
99
152
|
export {
|
|
100
153
|
__require,
|
|
101
154
|
__commonJS,
|
|
102
155
|
__toESM,
|
|
156
|
+
convertToLangchainMessages,
|
|
103
157
|
LangchainModelResolver
|
|
104
158
|
};
|
|
105
|
-
//# sourceMappingURL=chunk-
|
|
159
|
+
//# sourceMappingURL=chunk-NHWEEBN2.mjs.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/runtime/langchain/model-resolver.ts","../src/runtime/langchain/utils.ts"],"sourcesContent":["import { BaseLanguageModel } from '@langchain/core/language_models/base';\nimport { AzureChatOpenAI, ChatOpenAI } from '@langchain/openai';\n\nexport type LangchainOpenAIConfig = {\n\tapiKey: string;\n};\n\nexport type LangchainAzureResourceConfig = {\n\tapiKey: string;\n\tendpoint: string;\n\tmodels: {\n\t\tmodel: string;\n\t\tapiVersion: string;\n\t\tdeploymentName: string;\n\t}[];\n};\n\nexport type ResourceName = string;\n\nexport type LangchainModelConfig = {\n\topenai?: Record<string, LangchainOpenAIConfig>;\n\tazure?: Record<ResourceName, LangchainAzureResourceConfig>;\n};\n\nexport class LangchainModelResolver {\n\tconstructor(private config: LangchainModelConfig) {}\n\n\tresolve(modelString: string, tags?: string[]): BaseLanguageModel {\n\t\tconst parts = modelString.split(':');\n\n\t\tif (parts.length === 2) {\n\t\t\tconst [provider, modelName] = parts;\n\t\t\treturn this.resolveByProvider(provider, 'default', modelName, tags);\n\t\t}\n\n\t\tif (parts.length === 3) {\n\t\t\tconst [provider, configName, modelName] = parts;\n\t\t\treturn this.resolveByProvider(provider, configName, modelName, tags);\n\t\t}\n\n\t\tthrow new Error(\n\t\t\t'Model string must follow format \"provider:modelName\" (uses \"default\" config) or \"provider:configName:modelName\"',\n\t\t);\n\t}\n\n\tprivate resolveByProvider(\n\t\tprovider: string,\n\t\tconfigName: string,\n\t\tmodelName: string,\n\t\ttags?: string[],\n\t): BaseLanguageModel {\n\t\tswitch (provider) {\n\t\t\tcase 'openai':\n\t\t\t\treturn this.resolveOpenAI(configName, modelName, tags);\n\t\t\tcase 'azure':\n\t\t\t\treturn this.resolveAzure(configName, modelName, tags);\n\t\t\tdefault:\n\t\t\t\tthrow new Error(`Unsupported model provider: ${provider}`);\n\t\t}\n\t}\n\n\tprivate resolveOpenAI(configName: string, modelName: string, tags?: string[]): ChatOpenAI {\n\t\tconst providerConfig = this.config.openai?.[configName];\n\t\tif (!providerConfig) {\n\t\t\tthrow new Error(`Configuration \"${configName}\" for provider \"openai\" is missing`);\n\t\t}\n\n\t\treturn new ChatOpenAI({\n\t\t\tapiKey: providerConfig.apiKey,\n\t\t\tmodelName: modelName,\n\t\t\ttags: tags,\n\t\t});\n\t}\n\n\tprivate resolveAzure(resourceName: string, modelName: string, tags?: string[]): AzureChatOpenAI {\n\t\tconst resource = this.config.azure?.[resourceName];\n\t\tif (!resource) {\n\t\t\tthrow new Error(`Resource \"${resourceName}\" for provider \"azure\" is missing`);\n\t\t}\n\n\t\tconst modelEntry = resource.models.find((m) => m.model === modelName);\n\t\tif (!modelEntry) {\n\t\t\tthrow new Error(`Model \"${modelName}\" not found in Azure resource \"${resourceName}\"`);\n\t\t}\n\n\t\treturn new AzureChatOpenAI({\n\t\t\tmodel: modelEntry.model,\n\t\t\tazureOpenAIApiKey: resource.apiKey,\n\t\t\tazureOpenAIApiInstanceName: this.extractInstanceName(resource.endpoint),\n\t\t\tazureOpenAIApiDeploymentName: modelEntry.deploymentName,\n\t\t\tazureOpenAIApiVersion: modelEntry.apiVersion,\n\t\t\ttags: tags,\n\t\t});\n\t}\n\n\tprivate extractInstanceName(endpoint: string): string {\n\t\ttry {\n\t\t\tconst url = new URL(endpoint);\n\t\t\treturn url.hostname.split('.')[0];\n\t\t} catch (e) {\n\t\t\treturn endpoint;\n\t\t}\n\t}\n}\n","import { Message } from '@core/agent.interface';\nimport { AIMessage, BaseMessage, HumanMessage, ToolMessage } from 'langchain';\n\nexport function convertToLangchainMessages(messages: Message[]): BaseMessage[] {\n\tconst result: BaseMessage[] = [];\n\tlet tcIdx = 0;\n\tlet pendingToolCallIds: string[] = [];\n\n\tfor (const msg of messages) {\n\t\tif (msg.role === 'human') {\n\t\t\tresult.push(\n\t\t\t\tnew HumanMessage({\n\t\t\t\t\tcontent: msg.content.map((c) => {\n\t\t\t\t\t\tif (c.type === 'image') {\n\t\t\t\t\t\t\treturn { type: 'image_url', image_url: { url: c.url } };\n\t\t\t\t\t\t}\n\t\t\t\t\t\treturn c;\n\t\t\t\t\t}) as any,\n\t\t\t\t}),\n\t\t\t);\n\t\t} else if (msg.role === 'ai') {\n\t\t\tif (msg.toolCalls && msg.toolCalls.length > 0) {\n\t\t\t\tpendingToolCallIds = msg.toolCalls.map(() => `tc_${++tcIdx}`);\n\t\t\t\tresult.push(\n\t\t\t\t\tnew AIMessage({\n\t\t\t\t\t\tcontent: msg.content,\n\t\t\t\t\t\ttool_calls: msg.toolCalls.map((tc, i) => ({\n\t\t\t\t\t\t\tid: pendingToolCallIds[i],\n\t\t\t\t\t\t\tname: tc.name,\n\t\t\t\t\t\t\targs: tc.input ? JSON.parse(tc.input) : {},\n\t\t\t\t\t\t})),\n\t\t\t\t\t}),\n\t\t\t\t);\n\t\t\t} else {\n\t\t\t\tresult.push(new AIMessage(msg.content));\n\t\t\t}\n\t\t} else if (msg.role === 'tool') {\n\t\t\tconst toolCallId = pendingToolCallIds.shift();\n\t\t\tif (!toolCallId)\n\t\t\t\tthrow new Error(`ToolMessage for \"${msg.name}\" without a preceding AiMessage with toolCalls`);\n\t\t\tresult.push(\n\t\t\t\tnew ToolMessage({\n\t\t\t\t\tcontent: msg.output,\n\t\t\t\t\ttool_call_id: toolCallId,\n\t\t\t\t\tname: msg.name,\n\t\t\t\t}),\n\t\t\t);\n\t\t}\n\t}\n\n\treturn result;\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AACA,SAAS,iBAAiB,kBAAkB;AAuBrC,IAAM,yBAAN,MAA6B;AAAA,EACnC,YAAoB,QAA8B;AAA9B;AAAA,EAA+B;AAAA,EAEnD,QAAQ,aAAqB,MAAoC;AAChE,UAAM,QAAQ,YAAY,MAAM,GAAG;AAEnC,QAAI,MAAM,WAAW,GAAG;AACvB,YAAM,CAAC,UAAU,SAAS,IAAI;AAC9B,aAAO,KAAK,kBAAkB,UAAU,WAAW,WAAW,IAAI;AAAA,IACnE;AAEA,QAAI,MAAM,WAAW,GAAG;AACvB,YAAM,CAAC,UAAU,YAAY,SAAS,IAAI;AAC1C,aAAO,KAAK,kBAAkB,UAAU,YAAY,WAAW,IAAI;AAAA,IACpE;AAEA,UAAM,IAAI;AAAA,MACT;AAAA,IACD;AAAA,EACD;AAAA,EAEQ,kBACP,UACA,YACA,WACA,MACoB;AACpB,YAAQ,UAAU;AAAA,MACjB,KAAK;AACJ,eAAO,KAAK,cAAc,YAAY,WAAW,IAAI;AAAA,MACtD,KAAK;AACJ,eAAO,KAAK,aAAa,YAAY,WAAW,IAAI;AAAA,MACrD;AACC,cAAM,IAAI,MAAM,+BAA+B,QAAQ,EAAE;AAAA,IAC3D;AAAA,EACD;AAAA,EAEQ,cAAc,YAAoB,WAAmB,MAA6B;AACzF,UAAM,iBAAiB,KAAK,OAAO,SAAS,UAAU;AACtD,QAAI,CAAC,gBAAgB;AACpB,YAAM,IAAI,MAAM,kBAAkB,UAAU,oCAAoC;AAAA,IACjF;AAEA,WAAO,IAAI,WAAW;AAAA,MACrB,QAAQ,eAAe;AAAA,MACvB;AAAA,MACA;AAAA,IACD,CAAC;AAAA,EACF;AAAA,EAEQ,aAAa,cAAsB,WAAmB,MAAkC;AAC/F,UAAM,WAAW,KAAK,OAAO,QAAQ,YAAY;AACjD,QAAI,CAAC,UAAU;AACd,YAAM,IAAI,MAAM,aAAa,YAAY,mCAAmC;AAAA,IAC7E;AAEA,UAAM,aAAa,SAAS,OAAO,KAAK,CAAC,MAAM,EAAE,UAAU,SAAS;AACpE,QAAI,CAAC,YAAY;AAChB,YAAM,IAAI,MAAM,UAAU,SAAS,kCAAkC,YAAY,GAAG;AAAA,IACrF;AAEA,WAAO,IAAI,gBAAgB;AAAA,MAC1B,OAAO,WAAW;AAAA,MAClB,mBAAmB,SAAS;AAAA,MAC5B,4BAA4B,KAAK,oBAAoB,SAAS,QAAQ;AAAA,MACtE,8BAA8B,WAAW;AAAA,MACzC,uBAAuB,WAAW;AAAA,MAClC;AAAA,IACD,CAAC;AAAA,EACF;AAAA,EAEQ,oBAAoB,UAA0B;AACrD,QAAI;AACH,YAAM,MAAM,IAAI,IAAI,QAAQ;AAC5B,aAAO,IAAI,SAAS,MAAM,GAAG,EAAE,CAAC;AAAA,IACjC,SAAS,GAAG;AACX,aAAO;AAAA,IACR;AAAA,EACD;AACD;;;ACtGA,SAAS,WAAwB,cAAc,mBAAmB;AAE3D,SAAS,2BAA2B,UAAoC;AAC9E,QAAM,SAAwB,CAAC;AAC/B,MAAI,QAAQ;AACZ,MAAI,qBAA+B,CAAC;AAEpC,aAAW,OAAO,UAAU;AAC3B,QAAI,IAAI,SAAS,SAAS;AACzB,aAAO;AAAA,QACN,IAAI,aAAa;AAAA,UAChB,SAAS,IAAI,QAAQ,IAAI,CAAC,MAAM;AAC/B,gBAAI,EAAE,SAAS,SAAS;AACvB,qBAAO,EAAE,MAAM,aAAa,WAAW,EAAE,KAAK,EAAE,IAAI,EAAE;AAAA,YACvD;AACA,mBAAO;AAAA,UACR,CAAC;AAAA,QACF,CAAC;AAAA,MACF;AAAA,IACD,WAAW,IAAI,SAAS,MAAM;AAC7B,UAAI,IAAI,aAAa,IAAI,UAAU,SAAS,GAAG;AAC9C,6BAAqB,IAAI,UAAU,IAAI,MAAM,MAAM,EAAE,KAAK,EAAE;AAC5D,eAAO;AAAA,UACN,IAAI,UAAU;AAAA,YACb,SAAS,IAAI;AAAA,YACb,YAAY,IAAI,UAAU,IAAI,CAAC,IAAI,OAAO;AAAA,cACzC,IAAI,mBAAmB,CAAC;AAAA,cACxB,MAAM,GAAG;AAAA,cACT,MAAM,GAAG,QAAQ,KAAK,MAAM,GAAG,KAAK,IAAI,CAAC;AAAA,YAC1C,EAAE;AAAA,UACH,CAAC;AAAA,QACF;AAAA,MACD,OAAO;AACN,eAAO,KAAK,IAAI,UAAU,IAAI,OAAO,CAAC;AAAA,MACvC;AAAA,IACD,WAAW,IAAI,SAAS,QAAQ;AAC/B,YAAM,aAAa,mBAAmB,MAAM;AAC5C,UAAI,CAAC;AACJ,cAAM,IAAI,MAAM,oBAAoB,IAAI,IAAI,gDAAgD;AAC7F,aAAO;AAAA,QACN,IAAI,YAAY;AAAA,UACf,SAAS,IAAI;AAAA,UACb,cAAc;AAAA,UACd,MAAM,IAAI;AAAA,QACX,CAAC;AAAA,MACF;AAAA,IACD;AAAA,EACD;AAEA,SAAO;AACR;","names":[]}
|
package/dist/eval/index.d.mts
CHANGED
|
@@ -1,20 +1,28 @@
|
|
|
1
|
-
import { L as LangchainModelConfig } from '../model-resolver-
|
|
1
|
+
import { T as ToolDefinition, A as Agent, L as LangchainModelConfig, M as Message, d as AiMessage, a as ToolSpec, H as HumanMessage, f as ToolMessage } from '../model-resolver-U0J9x1a6.mjs';
|
|
2
|
+
import * as zod from 'zod';
|
|
3
|
+
import { z } from 'zod';
|
|
2
4
|
import { BaseMessage } from '@langchain/core/messages';
|
|
3
5
|
|
|
6
|
+
/** Factory that creates a fresh Agent per test case. Receives extra suite-level tools as ToolDefinition[]. */
|
|
7
|
+
type CreateTargetFn = (extraTools: ToolDefinition[]) => Agent | Promise<Agent>;
|
|
4
8
|
interface EvalConfig {
|
|
9
|
+
/** Required for model-based target and LLM evaluators (respondsInLanguage, llmJudge). */
|
|
5
10
|
modelConfig: LangchainModelConfig;
|
|
6
|
-
model
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
11
|
+
/** Required for model-based target. Also used as fallback for evaluatorModel. */
|
|
12
|
+
model?: string;
|
|
13
|
+
/** Model for evaluators needing LLM calls (language detection, LLM-as-judge). */
|
|
14
|
+
evaluatorModel: string;
|
|
15
|
+
/** System prompt for model-based target. Ignored when createTarget is used. Can be overridden per-suite or per-case. */
|
|
10
16
|
systemPrompt?: string;
|
|
17
|
+
/** Factory that creates a fresh Agent per test case. When set, this is the default target. */
|
|
18
|
+
createTarget?: CreateTargetFn;
|
|
11
19
|
}
|
|
12
20
|
declare function configureEvals(config: EvalConfig): void;
|
|
13
21
|
|
|
14
22
|
interface MockToolDef {
|
|
15
23
|
name: string;
|
|
16
24
|
description: string;
|
|
17
|
-
schema: Record<string, unknown>;
|
|
25
|
+
schema: z.ZodObject<any> | Record<string, unknown>;
|
|
18
26
|
/**
|
|
19
27
|
* Canned response the mock tool returns.
|
|
20
28
|
* Can be a static string, or a function that receives input and returns a response.
|
|
@@ -60,23 +68,13 @@ declare function contains(strings: string[]): Expectation;
|
|
|
60
68
|
/** Assert the response does not contain any of the given strings. */
|
|
61
69
|
declare function notContains(strings: string[]): Expectation;
|
|
62
70
|
|
|
63
|
-
declare function human(content: string):
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
};
|
|
67
|
-
declare function ai(content: string, toolCalls?: string[]): {
|
|
68
|
-
toolCalls?: string[] | undefined;
|
|
69
|
-
role: "ai";
|
|
70
|
-
content: string;
|
|
71
|
-
};
|
|
72
|
-
declare function toolResult(content: string): {
|
|
73
|
-
role: "tool";
|
|
74
|
-
content: string;
|
|
75
|
-
};
|
|
76
|
-
type Message = ReturnType<typeof human> | ReturnType<typeof ai> | ReturnType<typeof toolResult>;
|
|
71
|
+
declare function human(content: string): HumanMessage;
|
|
72
|
+
declare function ai(content: string, toolCalls?: string[]): AiMessage;
|
|
73
|
+
declare function toolResult(name: string, output: string): ToolMessage;
|
|
77
74
|
interface ToolDef {
|
|
78
75
|
description: string;
|
|
79
|
-
|
|
76
|
+
/** A plain key→description record, or a ZodObject passed through from a ToolSpec. */
|
|
77
|
+
schema?: Record<string, string> | zod.ZodObject<any>;
|
|
80
78
|
/** Auto-stringified if not a string or function. */
|
|
81
79
|
response: unknown | ((input: Record<string, unknown>, callCount: number) => string);
|
|
82
80
|
}
|
|
@@ -99,11 +97,21 @@ type TargetFn = (inputs: {
|
|
|
99
97
|
interface SuiteConfig {
|
|
100
98
|
/** Custom target function, or model string override. Auto-created from global config if omitted. */
|
|
101
99
|
target?: TargetFn | string;
|
|
100
|
+
/** Factory that creates a fresh Agent per test case. Overrides global createTarget. */
|
|
101
|
+
createTarget?: CreateTargetFn;
|
|
102
102
|
/** System prompt for all cases in this suite. Overrides the global prompt; can be overridden per-case. */
|
|
103
103
|
systemPrompt?: string;
|
|
104
|
-
tools
|
|
104
|
+
tools?: Record<string, ToolDef>;
|
|
105
105
|
cases: TestCase[];
|
|
106
106
|
}
|
|
107
|
+
/**
|
|
108
|
+
* Converts a `ToolSpec[]` (from a real tool provider) into the
|
|
109
|
+
* `Record<string, ToolDef>` that `defineSuite` expects.
|
|
110
|
+
*
|
|
111
|
+
* `responses` maps tool names to canned mock responses. Tools without an
|
|
112
|
+
* entry in `responses` default to `''`.
|
|
113
|
+
*/
|
|
114
|
+
declare function fromToolSpecs(specs: ToolSpec[], responses?: Record<string, ToolDef['response']>): Record<string, ToolDef>;
|
|
107
115
|
declare function defineSuite(name: string, config: SuiteConfig): void;
|
|
108
116
|
|
|
109
|
-
export { type EvalConfig, type Expectation, type SuiteConfig, type TestCase, type ToolDef, ai, configureEvals, contains, defineSuite, human, llmJudge, noTools, notContains, respondsInLanguage, toolResult, toolsCalled };
|
|
117
|
+
export { type CreateTargetFn, type EvalConfig, type Expectation, type SuiteConfig, type TestCase, type ToolDef, ai, configureEvals, contains, defineSuite, fromToolSpecs, human, llmJudge, noTools, notContains, respondsInLanguage, toolResult, toolsCalled };
|
package/dist/eval/index.d.ts
CHANGED
|
@@ -1,20 +1,28 @@
|
|
|
1
|
-
import { L as LangchainModelConfig } from '../model-resolver-
|
|
1
|
+
import { T as ToolDefinition, A as Agent, L as LangchainModelConfig, M as Message, d as AiMessage, a as ToolSpec, H as HumanMessage, f as ToolMessage } from '../model-resolver-U0J9x1a6.js';
|
|
2
|
+
import * as zod from 'zod';
|
|
3
|
+
import { z } from 'zod';
|
|
2
4
|
import { BaseMessage } from '@langchain/core/messages';
|
|
3
5
|
|
|
6
|
+
/** Factory that creates a fresh Agent per test case. Receives extra suite-level tools as ToolDefinition[]. */
|
|
7
|
+
type CreateTargetFn = (extraTools: ToolDefinition[]) => Agent | Promise<Agent>;
|
|
4
8
|
interface EvalConfig {
|
|
9
|
+
/** Required for model-based target and LLM evaluators (respondsInLanguage, llmJudge). */
|
|
5
10
|
modelConfig: LangchainModelConfig;
|
|
6
|
-
model
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
11
|
+
/** Required for model-based target. Also used as fallback for evaluatorModel. */
|
|
12
|
+
model?: string;
|
|
13
|
+
/** Model for evaluators needing LLM calls (language detection, LLM-as-judge). */
|
|
14
|
+
evaluatorModel: string;
|
|
15
|
+
/** System prompt for model-based target. Ignored when createTarget is used. Can be overridden per-suite or per-case. */
|
|
10
16
|
systemPrompt?: string;
|
|
17
|
+
/** Factory that creates a fresh Agent per test case. When set, this is the default target. */
|
|
18
|
+
createTarget?: CreateTargetFn;
|
|
11
19
|
}
|
|
12
20
|
declare function configureEvals(config: EvalConfig): void;
|
|
13
21
|
|
|
14
22
|
interface MockToolDef {
|
|
15
23
|
name: string;
|
|
16
24
|
description: string;
|
|
17
|
-
schema: Record<string, unknown>;
|
|
25
|
+
schema: z.ZodObject<any> | Record<string, unknown>;
|
|
18
26
|
/**
|
|
19
27
|
* Canned response the mock tool returns.
|
|
20
28
|
* Can be a static string, or a function that receives input and returns a response.
|
|
@@ -60,23 +68,13 @@ declare function contains(strings: string[]): Expectation;
|
|
|
60
68
|
/** Assert the response does not contain any of the given strings. */
|
|
61
69
|
declare function notContains(strings: string[]): Expectation;
|
|
62
70
|
|
|
63
|
-
declare function human(content: string):
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
};
|
|
67
|
-
declare function ai(content: string, toolCalls?: string[]): {
|
|
68
|
-
toolCalls?: string[] | undefined;
|
|
69
|
-
role: "ai";
|
|
70
|
-
content: string;
|
|
71
|
-
};
|
|
72
|
-
declare function toolResult(content: string): {
|
|
73
|
-
role: "tool";
|
|
74
|
-
content: string;
|
|
75
|
-
};
|
|
76
|
-
type Message = ReturnType<typeof human> | ReturnType<typeof ai> | ReturnType<typeof toolResult>;
|
|
71
|
+
declare function human(content: string): HumanMessage;
|
|
72
|
+
declare function ai(content: string, toolCalls?: string[]): AiMessage;
|
|
73
|
+
declare function toolResult(name: string, output: string): ToolMessage;
|
|
77
74
|
interface ToolDef {
|
|
78
75
|
description: string;
|
|
79
|
-
|
|
76
|
+
/** A plain key→description record, or a ZodObject passed through from a ToolSpec. */
|
|
77
|
+
schema?: Record<string, string> | zod.ZodObject<any>;
|
|
80
78
|
/** Auto-stringified if not a string or function. */
|
|
81
79
|
response: unknown | ((input: Record<string, unknown>, callCount: number) => string);
|
|
82
80
|
}
|
|
@@ -99,11 +97,21 @@ type TargetFn = (inputs: {
|
|
|
99
97
|
interface SuiteConfig {
|
|
100
98
|
/** Custom target function, or model string override. Auto-created from global config if omitted. */
|
|
101
99
|
target?: TargetFn | string;
|
|
100
|
+
/** Factory that creates a fresh Agent per test case. Overrides global createTarget. */
|
|
101
|
+
createTarget?: CreateTargetFn;
|
|
102
102
|
/** System prompt for all cases in this suite. Overrides the global prompt; can be overridden per-case. */
|
|
103
103
|
systemPrompt?: string;
|
|
104
|
-
tools
|
|
104
|
+
tools?: Record<string, ToolDef>;
|
|
105
105
|
cases: TestCase[];
|
|
106
106
|
}
|
|
107
|
+
/**
|
|
108
|
+
* Converts a `ToolSpec[]` (from a real tool provider) into the
|
|
109
|
+
* `Record<string, ToolDef>` that `defineSuite` expects.
|
|
110
|
+
*
|
|
111
|
+
* `responses` maps tool names to canned mock responses. Tools without an
|
|
112
|
+
* entry in `responses` default to `''`.
|
|
113
|
+
*/
|
|
114
|
+
declare function fromToolSpecs(specs: ToolSpec[], responses?: Record<string, ToolDef['response']>): Record<string, ToolDef>;
|
|
107
115
|
declare function defineSuite(name: string, config: SuiteConfig): void;
|
|
108
116
|
|
|
109
|
-
export { type EvalConfig, type Expectation, type SuiteConfig, type TestCase, type ToolDef, ai, configureEvals, contains, defineSuite, human, llmJudge, noTools, notContains, respondsInLanguage, toolResult, toolsCalled };
|
|
117
|
+
export { type CreateTargetFn, type EvalConfig, type Expectation, type SuiteConfig, type TestCase, type ToolDef, ai, configureEvals, contains, defineSuite, fromToolSpecs, human, llmJudge, noTools, notContains, respondsInLanguage, toolResult, toolsCalled };
|
package/dist/eval/index.js
CHANGED
|
@@ -34,6 +34,7 @@ __export(eval_exports, {
|
|
|
34
34
|
configureEvals: () => configureEvals,
|
|
35
35
|
contains: () => contains,
|
|
36
36
|
defineSuite: () => defineSuite,
|
|
37
|
+
fromToolSpecs: () => fromToolSpecs,
|
|
37
38
|
human: () => human,
|
|
38
39
|
llmJudge: () => llmJudge,
|
|
39
40
|
noTools: () => noTools,
|
|
@@ -105,18 +106,21 @@ var LangchainModelResolver = class {
|
|
|
105
106
|
tags
|
|
106
107
|
});
|
|
107
108
|
}
|
|
108
|
-
resolveAzure(
|
|
109
|
-
const
|
|
110
|
-
if (!
|
|
111
|
-
throw new Error(`
|
|
109
|
+
resolveAzure(resourceName, modelName, tags) {
|
|
110
|
+
const resource = this.config.azure?.[resourceName];
|
|
111
|
+
if (!resource) {
|
|
112
|
+
throw new Error(`Resource "${resourceName}" for provider "azure" is missing`);
|
|
113
|
+
}
|
|
114
|
+
const modelEntry = resource.models.find((m) => m.model === modelName);
|
|
115
|
+
if (!modelEntry) {
|
|
116
|
+
throw new Error(`Model "${modelName}" not found in Azure resource "${resourceName}"`);
|
|
112
117
|
}
|
|
113
118
|
return new import_openai.AzureChatOpenAI({
|
|
114
|
-
model:
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
azureOpenAIApiVersion: providerConfig.apiVersion,
|
|
119
|
+
model: modelEntry.model,
|
|
120
|
+
azureOpenAIApiKey: resource.apiKey,
|
|
121
|
+
azureOpenAIApiInstanceName: this.extractInstanceName(resource.endpoint),
|
|
122
|
+
azureOpenAIApiDeploymentName: modelEntry.deploymentName,
|
|
123
|
+
azureOpenAIApiVersion: modelEntry.apiVersion,
|
|
120
124
|
tags
|
|
121
125
|
});
|
|
122
126
|
}
|
|
@@ -130,51 +134,64 @@ var LangchainModelResolver = class {
|
|
|
130
134
|
}
|
|
131
135
|
};
|
|
132
136
|
|
|
133
|
-
// src/
|
|
134
|
-
var
|
|
135
|
-
function
|
|
137
|
+
// src/runtime/langchain/utils.ts
|
|
138
|
+
var import_langchain = require("langchain");
|
|
139
|
+
function convertToLangchainMessages(messages) {
|
|
136
140
|
const result = [];
|
|
137
141
|
let tcIdx = 0;
|
|
138
|
-
let
|
|
139
|
-
for (const msg of
|
|
142
|
+
let pendingToolCallIds = [];
|
|
143
|
+
for (const msg of messages) {
|
|
140
144
|
if (msg.role === "human") {
|
|
141
|
-
result.push(
|
|
145
|
+
result.push(
|
|
146
|
+
new import_langchain.HumanMessage({
|
|
147
|
+
content: msg.content.map((c) => {
|
|
148
|
+
if (c.type === "image") {
|
|
149
|
+
return { type: "image_url", image_url: { url: c.url } };
|
|
150
|
+
}
|
|
151
|
+
return c;
|
|
152
|
+
})
|
|
153
|
+
})
|
|
154
|
+
);
|
|
142
155
|
} else if (msg.role === "ai") {
|
|
143
156
|
if (msg.toolCalls && msg.toolCalls.length > 0) {
|
|
144
|
-
|
|
145
|
-
id: `hist_tc${++tcIdx}`,
|
|
146
|
-
name
|
|
147
|
-
}));
|
|
157
|
+
pendingToolCallIds = msg.toolCalls.map(() => `tc_${++tcIdx}`);
|
|
148
158
|
result.push(
|
|
149
|
-
new
|
|
159
|
+
new import_langchain.AIMessage({
|
|
150
160
|
content: msg.content,
|
|
151
|
-
tool_calls:
|
|
152
|
-
id:
|
|
161
|
+
tool_calls: msg.toolCalls.map((tc, i) => ({
|
|
162
|
+
id: pendingToolCallIds[i],
|
|
153
163
|
name: tc.name,
|
|
154
|
-
args: {}
|
|
164
|
+
args: tc.input ? JSON.parse(tc.input) : {}
|
|
155
165
|
}))
|
|
156
166
|
})
|
|
157
167
|
);
|
|
158
168
|
} else {
|
|
159
|
-
result.push(new
|
|
169
|
+
result.push(new import_langchain.AIMessage(msg.content));
|
|
160
170
|
}
|
|
161
171
|
} else if (msg.role === "tool") {
|
|
162
|
-
const
|
|
163
|
-
if (!
|
|
172
|
+
const toolCallId = pendingToolCallIds.shift();
|
|
173
|
+
if (!toolCallId)
|
|
174
|
+
throw new Error(`ToolMessage for "${msg.name}" without a preceding AiMessage with toolCalls`);
|
|
164
175
|
result.push(
|
|
165
|
-
new
|
|
166
|
-
content: msg.
|
|
167
|
-
tool_call_id:
|
|
168
|
-
name:
|
|
176
|
+
new import_langchain.ToolMessage({
|
|
177
|
+
content: msg.output,
|
|
178
|
+
tool_call_id: toolCallId,
|
|
179
|
+
name: msg.name
|
|
169
180
|
})
|
|
170
181
|
);
|
|
171
182
|
}
|
|
172
183
|
}
|
|
173
184
|
return result;
|
|
174
185
|
}
|
|
186
|
+
|
|
187
|
+
// src/eval/target.ts
|
|
188
|
+
var MAX_AGENT_LOOPS = 10;
|
|
175
189
|
function createEvalTarget(modelConfig, modelString) {
|
|
176
190
|
return async (inputs) => {
|
|
177
191
|
const config = modelConfig && modelString ? { modelConfig, model: modelString } : getEvalConfig();
|
|
192
|
+
if (!config.model) {
|
|
193
|
+
throw new Error("model is required for model-based target. Add it to your configureEvals() call.");
|
|
194
|
+
}
|
|
178
195
|
const resolver = new LangchainModelResolver(config.modelConfig);
|
|
179
196
|
const model = resolver.resolve(config.model);
|
|
180
197
|
const toolCallCounts = {};
|
|
@@ -191,7 +208,7 @@ function createEvalTarget(modelConfig, modelString) {
|
|
|
191
208
|
{
|
|
192
209
|
name: mockTool.name,
|
|
193
210
|
description: mockTool.description,
|
|
194
|
-
schema: import_zod.z.object(
|
|
211
|
+
schema: mockTool.schema instanceof import_zod.z.ZodObject ? mockTool.schema : import_zod.z.object(
|
|
195
212
|
Object.fromEntries(
|
|
196
213
|
Object.entries(mockTool.schema).map(([key, val]) => {
|
|
197
214
|
if (typeof val === "string") return [key, import_zod.z.string().describe(val)];
|
|
@@ -208,8 +225,7 @@ function createEvalTarget(modelConfig, modelString) {
|
|
|
208
225
|
if (inputs.systemPrompt) {
|
|
209
226
|
messages.push(new import_messages.SystemMessage(inputs.systemPrompt));
|
|
210
227
|
}
|
|
211
|
-
|
|
212
|
-
messages.push(...convertMessages(inputMessages));
|
|
228
|
+
messages.push(...convertToLangchainMessages(inputs.messages));
|
|
213
229
|
let loopCount = 0;
|
|
214
230
|
while (loopCount < MAX_AGENT_LOOPS) {
|
|
215
231
|
loopCount++;
|
|
@@ -244,16 +260,107 @@ function createEvalTarget(modelConfig, modelString) {
|
|
|
244
260
|
return { messages };
|
|
245
261
|
};
|
|
246
262
|
}
|
|
263
|
+
function agentResultToMessages(inputMessages, result) {
|
|
264
|
+
const messages = convertToLangchainMessages(inputMessages);
|
|
265
|
+
let pendingToolCalls = [];
|
|
266
|
+
for (const block of result.content) {
|
|
267
|
+
if (block.type === "tool_call") {
|
|
268
|
+
const tc = block;
|
|
269
|
+
pendingToolCalls.push({
|
|
270
|
+
id: tc.toolCallId,
|
|
271
|
+
name: tc.name,
|
|
272
|
+
args: tc.input ? JSON.parse(tc.input) : {},
|
|
273
|
+
output: tc.output
|
|
274
|
+
});
|
|
275
|
+
} else if (block.type === "text") {
|
|
276
|
+
if (pendingToolCalls.length > 0) {
|
|
277
|
+
messages.push(
|
|
278
|
+
new import_messages.AIMessage({
|
|
279
|
+
content: "",
|
|
280
|
+
tool_calls: pendingToolCalls.map((tc) => ({ id: tc.id, name: tc.name, args: tc.args }))
|
|
281
|
+
})
|
|
282
|
+
);
|
|
283
|
+
for (const tc of pendingToolCalls) {
|
|
284
|
+
messages.push(new import_messages.ToolMessage({ content: tc.output, tool_call_id: tc.id, name: tc.name }));
|
|
285
|
+
}
|
|
286
|
+
pendingToolCalls = [];
|
|
287
|
+
}
|
|
288
|
+
messages.push(new import_messages.AIMessage(block.output));
|
|
289
|
+
}
|
|
290
|
+
}
|
|
291
|
+
if (pendingToolCalls.length > 0) {
|
|
292
|
+
messages.push(
|
|
293
|
+
new import_messages.AIMessage({
|
|
294
|
+
content: "",
|
|
295
|
+
tool_calls: pendingToolCalls.map((tc) => ({ id: tc.id, name: tc.name, args: tc.args }))
|
|
296
|
+
})
|
|
297
|
+
);
|
|
298
|
+
for (const tc of pendingToolCalls) {
|
|
299
|
+
messages.push(new import_messages.ToolMessage({ content: tc.output, tool_call_id: tc.id, name: tc.name }));
|
|
300
|
+
}
|
|
301
|
+
}
|
|
302
|
+
return messages;
|
|
303
|
+
}
|
|
304
|
+
function toolDefsToDefinitions(defs) {
|
|
305
|
+
const callCounts = {};
|
|
306
|
+
return Object.entries(defs).map(([name, def]) => {
|
|
307
|
+
callCounts[name] = 0;
|
|
308
|
+
return {
|
|
309
|
+
name,
|
|
310
|
+
toolKit: "eval-mock",
|
|
311
|
+
description: def.description,
|
|
312
|
+
inputSchema: def.schema instanceof import_zod.z.ZodObject ? def.schema : import_zod.z.object(
|
|
313
|
+
Object.fromEntries(
|
|
314
|
+
Object.entries(def.schema ?? {}).map(([key, val]) => {
|
|
315
|
+
if (typeof val === "string") return [key, import_zod.z.string().describe(val)];
|
|
316
|
+
return [key, import_zod.z.any()];
|
|
317
|
+
})
|
|
318
|
+
)
|
|
319
|
+
),
|
|
320
|
+
exec: async (input) => {
|
|
321
|
+
callCounts[name]++;
|
|
322
|
+
if (typeof def.response === "function") {
|
|
323
|
+
return def.response(
|
|
324
|
+
input,
|
|
325
|
+
callCounts[name]
|
|
326
|
+
);
|
|
327
|
+
}
|
|
328
|
+
return typeof def.response === "string" ? def.response : JSON.stringify(def.response);
|
|
329
|
+
}
|
|
330
|
+
};
|
|
331
|
+
});
|
|
332
|
+
}
|
|
333
|
+
async function runAgentTarget(createTarget, evalMessages, extraToolDefs) {
|
|
334
|
+
const extraTools = Object.keys(extraToolDefs).length > 0 ? toolDefsToDefinitions(extraToolDefs) : [];
|
|
335
|
+
const agent = await createTarget(extraTools);
|
|
336
|
+
const result = await agent.run({
|
|
337
|
+
threadId: `eval_${Date.now()}_${Math.random().toString(36).slice(2)}`,
|
|
338
|
+
messages: evalMessages
|
|
339
|
+
});
|
|
340
|
+
return { messages: agentResultToMessages(evalMessages, result) };
|
|
341
|
+
}
|
|
247
342
|
|
|
248
343
|
// src/eval/suite.ts
|
|
249
344
|
function human(content) {
|
|
250
|
-
return { role: "human", content };
|
|
345
|
+
return { role: "human", content: [{ type: "text", text: content }] };
|
|
251
346
|
}
|
|
252
347
|
function ai(content, toolCalls) {
|
|
253
|
-
return { role: "ai", content, ...toolCalls ? { toolCalls } : {} };
|
|
348
|
+
return { role: "ai", content, ...toolCalls ? { toolCalls: toolCalls.map((name) => ({ name })) } : {} };
|
|
254
349
|
}
|
|
255
|
-
function toolResult(
|
|
256
|
-
return { role: "tool",
|
|
350
|
+
function toolResult(name, output) {
|
|
351
|
+
return { role: "tool", name, output };
|
|
352
|
+
}
|
|
353
|
+
function fromToolSpecs(specs, responses = {}) {
|
|
354
|
+
return Object.fromEntries(
|
|
355
|
+
specs.map((spec) => [
|
|
356
|
+
spec.name,
|
|
357
|
+
{
|
|
358
|
+
description: spec.description,
|
|
359
|
+
schema: spec.inputSchema,
|
|
360
|
+
response: responses[spec.name] ?? ""
|
|
361
|
+
}
|
|
362
|
+
])
|
|
363
|
+
);
|
|
257
364
|
}
|
|
258
365
|
function toMockTools(defs) {
|
|
259
366
|
return Object.entries(defs).map(([name, def]) => ({
|
|
@@ -266,51 +373,67 @@ function toMockTools(defs) {
|
|
|
266
373
|
function toSerializableTools(tools) {
|
|
267
374
|
return tools.map((t) => ({
|
|
268
375
|
...t,
|
|
376
|
+
schema: t.schema instanceof Object && "shape" in t.schema ? "<ZodObject>" : t.schema,
|
|
269
377
|
response: typeof t.response === "function" ? "<function>" : t.response
|
|
270
378
|
}));
|
|
271
379
|
}
|
|
272
380
|
function lastHumanContent(messages) {
|
|
273
381
|
for (let i = messages.length - 1; i >= 0; i--) {
|
|
274
|
-
|
|
382
|
+
const msg = messages[i];
|
|
383
|
+
if (msg.role === "human") {
|
|
384
|
+
const textBlock = msg.content.find((c) => c.type === "text");
|
|
385
|
+
return textBlock ? textBlock.text : "";
|
|
386
|
+
}
|
|
275
387
|
}
|
|
276
|
-
return
|
|
388
|
+
return "";
|
|
277
389
|
}
|
|
278
|
-
function
|
|
390
|
+
function resolveModelTarget(config) {
|
|
279
391
|
if (typeof config.target === "function") return config.target;
|
|
280
392
|
const evalConfig = getEvalConfig();
|
|
393
|
+
if (!evalConfig.model && typeof config.target !== "string") {
|
|
394
|
+
throw new Error("model is required for model-based target. Add it to your configureEvals() call.");
|
|
395
|
+
}
|
|
281
396
|
const model = typeof config.target === "string" ? config.target : evalConfig.model;
|
|
282
397
|
return createEvalTarget(evalConfig.modelConfig, model);
|
|
283
398
|
}
|
|
399
|
+
function resolveCreateTarget(config) {
|
|
400
|
+
return config.createTarget ?? getEvalConfig().createTarget;
|
|
401
|
+
}
|
|
284
402
|
function defineSuite(name, config) {
|
|
285
|
-
const
|
|
286
|
-
const
|
|
287
|
-
const globalPrompt = getEvalConfig().systemPrompt;
|
|
403
|
+
const suiteTools = config.tools ?? {};
|
|
404
|
+
const createTarget = config.target ? void 0 : resolveCreateTarget(config);
|
|
288
405
|
ls.describe(name, () => {
|
|
289
406
|
for (const tc of config.cases) {
|
|
290
407
|
const testName = tc.name ?? lastHumanContent(tc.messages);
|
|
291
|
-
const
|
|
408
|
+
const caseToolDefs = tc.tools ?? suiteTools;
|
|
409
|
+
const tools = toMockTools(caseToolDefs);
|
|
292
410
|
const ctx = { message: lastHumanContent(tc.messages) };
|
|
293
411
|
const resolved = tc.expect.map((exp) => exp(ctx));
|
|
294
412
|
const evaluators = resolved.map((r) => r.evaluator);
|
|
295
413
|
const referenceOutputs = Object.assign({}, ...resolved.map((r) => r.referenceOutputs));
|
|
296
|
-
const systemPrompt = tc.systemPrompt ?? config.systemPrompt ?? globalPrompt;
|
|
297
|
-
const targetInputs = {
|
|
298
|
-
messages: tc.messages,
|
|
299
|
-
tools,
|
|
300
|
-
...systemPrompt ? { systemPrompt } : {}
|
|
301
|
-
};
|
|
302
414
|
ls.test(
|
|
303
415
|
testName,
|
|
304
416
|
{
|
|
305
417
|
inputs: {
|
|
306
418
|
messages: tc.messages,
|
|
307
|
-
tools: toSerializableTools(tools)
|
|
308
|
-
...systemPrompt ? { systemPrompt } : {}
|
|
419
|
+
tools: toSerializableTools(tools)
|
|
309
420
|
},
|
|
310
421
|
referenceOutputs
|
|
311
422
|
},
|
|
312
423
|
async ({ referenceOutputs: refOut }) => {
|
|
313
|
-
|
|
424
|
+
let output;
|
|
425
|
+
if (createTarget) {
|
|
426
|
+
output = await runAgentTarget(createTarget, tc.messages, caseToolDefs);
|
|
427
|
+
} else {
|
|
428
|
+
const target = resolveModelTarget(config);
|
|
429
|
+
const globalPrompt = getEvalConfig().systemPrompt;
|
|
430
|
+
const systemPrompt = tc.systemPrompt ?? config.systemPrompt ?? globalPrompt;
|
|
431
|
+
output = await target({
|
|
432
|
+
messages: tc.messages,
|
|
433
|
+
tools,
|
|
434
|
+
...systemPrompt ? { systemPrompt } : {}
|
|
435
|
+
});
|
|
436
|
+
}
|
|
314
437
|
ls.logOutputs(output);
|
|
315
438
|
for (const evaluator of evaluators) {
|
|
316
439
|
await evaluator({ outputs: output, referenceOutputs: refOut ?? {} });
|
|
@@ -462,7 +585,7 @@ function toolsCalled(tools) {
|
|
|
462
585
|
function llmJudge() {
|
|
463
586
|
return () => {
|
|
464
587
|
const config = getEvalConfig();
|
|
465
|
-
const model = config.evaluatorModel
|
|
588
|
+
const model = config.evaluatorModel;
|
|
466
589
|
return {
|
|
467
590
|
evaluator: ls2.wrapEvaluator(
|
|
468
591
|
withTrajectoryGuard(
|
|
@@ -483,7 +606,7 @@ function noTools() {
|
|
|
483
606
|
function respondsInLanguage(code) {
|
|
484
607
|
return () => {
|
|
485
608
|
const config = getEvalConfig();
|
|
486
|
-
const model = config.evaluatorModel
|
|
609
|
+
const model = config.evaluatorModel;
|
|
487
610
|
return {
|
|
488
611
|
evaluator: ls2.wrapEvaluator(createLanguageEvaluator(config.modelConfig, model)),
|
|
489
612
|
referenceOutputs: { expectedLanguage: code }
|
|
@@ -508,6 +631,7 @@ function notContains(strings) {
|
|
|
508
631
|
configureEvals,
|
|
509
632
|
contains,
|
|
510
633
|
defineSuite,
|
|
634
|
+
fromToolSpecs,
|
|
511
635
|
human,
|
|
512
636
|
llmJudge,
|
|
513
637
|
noTools,
|