@dvina/agents 0.10.0 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/eval/index.d.mts +39 -9
- package/dist/eval/index.d.ts +39 -9
- package/dist/eval/index.js +151 -72
- package/dist/eval/index.js.map +1 -1
- package/dist/eval/index.mjs +150 -77
- package/dist/eval/index.mjs.map +1 -1
- package/dist/index.d.mts +5 -4
- package/dist/index.d.ts +5 -4
- package/dist/index.js +46 -1
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +51 -2
- package/dist/index.mjs.map +1 -1
- package/dist/{model-resolver-BZtVieXE.d.mts → model-resolver-DjKRXKtu.d.mts} +1 -1
- package/dist/{model-resolver-BZtVieXE.d.ts → model-resolver-DjKRXKtu.d.ts} +1 -1
- package/package.json +1 -1
package/dist/eval/index.d.mts
CHANGED
|
@@ -1,20 +1,22 @@
|
|
|
1
|
-
import { T as ToolDefinition, A as Agent, L as LangchainModelConfig, M as Message,
|
|
1
|
+
import { T as ToolDefinition, A as Agent, L as LangchainModelConfig, M as Message, e as AiMessage, b as ToolSpec, H as HumanMessage, g as ToolMessage } from '../model-resolver-DjKRXKtu.mjs';
|
|
2
2
|
import * as zod from 'zod';
|
|
3
3
|
import { z } from 'zod';
|
|
4
4
|
import { BaseMessage } from '@langchain/core/messages';
|
|
5
5
|
|
|
6
|
-
/** Factory that creates a fresh Agent per test case. Receives extra suite-level tools
|
|
7
|
-
type CreateTargetFn = (extraTools: ToolDefinition[]) => Agent | Promise<Agent>;
|
|
6
|
+
/** Factory that creates a fresh Agent per test case. Receives the model string and extra suite-level tools. */
|
|
7
|
+
type CreateTargetFn = (model: string, extraTools: ToolDefinition[]) => Agent | Promise<Agent>;
|
|
8
8
|
interface EvalConfig {
|
|
9
9
|
/** Required for model-based target and LLM evaluators (respondsInLanguage, llmJudge). */
|
|
10
10
|
modelConfig: LangchainModelConfig;
|
|
11
|
-
/**
|
|
12
|
-
|
|
11
|
+
/** Models to evaluate. Every registered suite is run once per model. */
|
|
12
|
+
models: string[];
|
|
13
13
|
/** Model for evaluators needing LLM calls (language detection, LLM-as-judge). */
|
|
14
14
|
evaluatorModel: string;
|
|
15
|
+
/** LangSmith experiment (dataset) name. All suites share this single experiment for easy comparison across runs. */
|
|
16
|
+
experimentName: string;
|
|
15
17
|
/** System prompt for model-based target. Ignored when createTarget is used. Can be overridden per-suite or per-case. */
|
|
16
18
|
systemPrompt?: string;
|
|
17
|
-
/** Factory that creates a fresh Agent per test case.
|
|
19
|
+
/** Factory that creates a fresh Agent per test case. Receives the current model string from the models array. */
|
|
18
20
|
createTarget?: CreateTargetFn;
|
|
19
21
|
/** Transforms test case messages before sending to target. Simulates production preprocessing (e.g., message enrichment). */
|
|
20
22
|
prepareMessages?: (messages: Message[]) => Message[] | Promise<Message[]>;
|
|
@@ -57,14 +59,26 @@ declare function toolsCalled(tools: string[]): Expectation;
|
|
|
57
59
|
* Uses the globally configured evaluator model.
|
|
58
60
|
*/
|
|
59
61
|
declare function llmJudge(): Expectation;
|
|
60
|
-
/**
|
|
61
|
-
|
|
62
|
+
/**
|
|
63
|
+
* Assert the agent made zero tool calls.
|
|
64
|
+
* Optionally allow specific tools via `except` — calls to those tools
|
|
65
|
+
* are permitted (but not required), while any other tool call fails.
|
|
66
|
+
*/
|
|
67
|
+
declare function noTools(options?: {
|
|
68
|
+
except: string[];
|
|
69
|
+
}): Expectation;
|
|
62
70
|
/**
|
|
63
71
|
* Assert the response is in the given language (ISO 639-1 code).
|
|
64
72
|
* Uses the globally configured evaluator model for language detection.
|
|
65
73
|
* @param code - ISO 639-1 language code (e.g. 'en', 'tr', 'de').
|
|
66
74
|
*/
|
|
67
75
|
declare function respondsInLanguage(code: string): Expectation;
|
|
76
|
+
/**
|
|
77
|
+
* Assert that at least one tool call was made.
|
|
78
|
+
* When `tools` is provided, at least one of those specific tools must
|
|
79
|
+
* appear in the trajectory. When omitted, any tool call satisfies it.
|
|
80
|
+
*/
|
|
81
|
+
declare function anyToolCalled(tools?: string[]): Expectation;
|
|
68
82
|
/** Assert the response contains all given strings. */
|
|
69
83
|
declare function contains(strings: string[]): Expectation;
|
|
70
84
|
/** Assert the response does not contain any of the given strings. */
|
|
@@ -118,6 +132,22 @@ interface SuiteConfig {
|
|
|
118
132
|
* entry in `responses` default to `''`.
|
|
119
133
|
*/
|
|
120
134
|
declare function fromToolSpecs(specs: ToolSpec[], responses?: Record<string, ToolDef['response']>): Record<string, ToolDef>;
|
|
135
|
+
/**
|
|
136
|
+
* Registers an eval suite. Does not create tests on its own — call
|
|
137
|
+
* `runEvals()` after all suites are registered to emit a single
|
|
138
|
+
* LangSmith experiment containing every test case.
|
|
139
|
+
*/
|
|
121
140
|
declare function defineSuite(name: string, config: SuiteConfig): void;
|
|
141
|
+
/**
|
|
142
|
+
* Emits all registered suites under a single `ls.describe` block so
|
|
143
|
+
* every test case lands in one LangSmith experiment / dataset.
|
|
144
|
+
*
|
|
145
|
+
* Call this once, after importing all suite files.
|
|
146
|
+
*
|
|
147
|
+
* Individual suites are grouped with native `describe` blocks for
|
|
148
|
+
* readability; test names are prefixed with the suite name
|
|
149
|
+
* (e.g. "discovery > should use search tool").
|
|
150
|
+
*/
|
|
151
|
+
declare function runEvals(): void;
|
|
122
152
|
|
|
123
|
-
export { type CreateTargetFn, type EvalConfig, type Expectation, type SuiteConfig, type TestCase, type ToolDef, ai, configureEvals, contains, defineSuite, fromToolSpecs, human, llmJudge, noTools, notContains, respondsInLanguage, toolResult, toolsCalled };
|
|
153
|
+
export { type CreateTargetFn, type EvalConfig, type Expectation, type SuiteConfig, type TestCase, type ToolDef, ai, anyToolCalled, configureEvals, contains, defineSuite, fromToolSpecs, human, llmJudge, noTools, notContains, respondsInLanguage, runEvals, toolResult, toolsCalled };
|
package/dist/eval/index.d.ts
CHANGED
|
@@ -1,20 +1,22 @@
|
|
|
1
|
-
import { T as ToolDefinition, A as Agent, L as LangchainModelConfig, M as Message,
|
|
1
|
+
import { T as ToolDefinition, A as Agent, L as LangchainModelConfig, M as Message, e as AiMessage, b as ToolSpec, H as HumanMessage, g as ToolMessage } from '../model-resolver-DjKRXKtu.js';
|
|
2
2
|
import * as zod from 'zod';
|
|
3
3
|
import { z } from 'zod';
|
|
4
4
|
import { BaseMessage } from '@langchain/core/messages';
|
|
5
5
|
|
|
6
|
-
/** Factory that creates a fresh Agent per test case. Receives extra suite-level tools
|
|
7
|
-
type CreateTargetFn = (extraTools: ToolDefinition[]) => Agent | Promise<Agent>;
|
|
6
|
+
/** Factory that creates a fresh Agent per test case. Receives the model string and extra suite-level tools. */
|
|
7
|
+
type CreateTargetFn = (model: string, extraTools: ToolDefinition[]) => Agent | Promise<Agent>;
|
|
8
8
|
interface EvalConfig {
|
|
9
9
|
/** Required for model-based target and LLM evaluators (respondsInLanguage, llmJudge). */
|
|
10
10
|
modelConfig: LangchainModelConfig;
|
|
11
|
-
/**
|
|
12
|
-
|
|
11
|
+
/** Models to evaluate. Every registered suite is run once per model. */
|
|
12
|
+
models: string[];
|
|
13
13
|
/** Model for evaluators needing LLM calls (language detection, LLM-as-judge). */
|
|
14
14
|
evaluatorModel: string;
|
|
15
|
+
/** LangSmith experiment (dataset) name. All suites share this single experiment for easy comparison across runs. */
|
|
16
|
+
experimentName: string;
|
|
15
17
|
/** System prompt for model-based target. Ignored when createTarget is used. Can be overridden per-suite or per-case. */
|
|
16
18
|
systemPrompt?: string;
|
|
17
|
-
/** Factory that creates a fresh Agent per test case.
|
|
19
|
+
/** Factory that creates a fresh Agent per test case. Receives the current model string from the models array. */
|
|
18
20
|
createTarget?: CreateTargetFn;
|
|
19
21
|
/** Transforms test case messages before sending to target. Simulates production preprocessing (e.g., message enrichment). */
|
|
20
22
|
prepareMessages?: (messages: Message[]) => Message[] | Promise<Message[]>;
|
|
@@ -57,14 +59,26 @@ declare function toolsCalled(tools: string[]): Expectation;
|
|
|
57
59
|
* Uses the globally configured evaluator model.
|
|
58
60
|
*/
|
|
59
61
|
declare function llmJudge(): Expectation;
|
|
60
|
-
/**
|
|
61
|
-
|
|
62
|
+
/**
|
|
63
|
+
* Assert the agent made zero tool calls.
|
|
64
|
+
* Optionally allow specific tools via `except` — calls to those tools
|
|
65
|
+
* are permitted (but not required), while any other tool call fails.
|
|
66
|
+
*/
|
|
67
|
+
declare function noTools(options?: {
|
|
68
|
+
except: string[];
|
|
69
|
+
}): Expectation;
|
|
62
70
|
/**
|
|
63
71
|
* Assert the response is in the given language (ISO 639-1 code).
|
|
64
72
|
* Uses the globally configured evaluator model for language detection.
|
|
65
73
|
* @param code - ISO 639-1 language code (e.g. 'en', 'tr', 'de').
|
|
66
74
|
*/
|
|
67
75
|
declare function respondsInLanguage(code: string): Expectation;
|
|
76
|
+
/**
|
|
77
|
+
* Assert that at least one tool call was made.
|
|
78
|
+
* When `tools` is provided, at least one of those specific tools must
|
|
79
|
+
* appear in the trajectory. When omitted, any tool call satisfies it.
|
|
80
|
+
*/
|
|
81
|
+
declare function anyToolCalled(tools?: string[]): Expectation;
|
|
68
82
|
/** Assert the response contains all given strings. */
|
|
69
83
|
declare function contains(strings: string[]): Expectation;
|
|
70
84
|
/** Assert the response does not contain any of the given strings. */
|
|
@@ -118,6 +132,22 @@ interface SuiteConfig {
|
|
|
118
132
|
* entry in `responses` default to `''`.
|
|
119
133
|
*/
|
|
120
134
|
declare function fromToolSpecs(specs: ToolSpec[], responses?: Record<string, ToolDef['response']>): Record<string, ToolDef>;
|
|
135
|
+
/**
|
|
136
|
+
* Registers an eval suite. Does not create tests on its own — call
|
|
137
|
+
* `runEvals()` after all suites are registered to emit a single
|
|
138
|
+
* LangSmith experiment containing every test case.
|
|
139
|
+
*/
|
|
121
140
|
declare function defineSuite(name: string, config: SuiteConfig): void;
|
|
141
|
+
/**
|
|
142
|
+
* Emits all registered suites under a single `ls.describe` block so
|
|
143
|
+
* every test case lands in one LangSmith experiment / dataset.
|
|
144
|
+
*
|
|
145
|
+
* Call this once, after importing all suite files.
|
|
146
|
+
*
|
|
147
|
+
* Individual suites are grouped with native `describe` blocks for
|
|
148
|
+
* readability; test names are prefixed with the suite name
|
|
149
|
+
* (e.g. "discovery > should use search tool").
|
|
150
|
+
*/
|
|
151
|
+
declare function runEvals(): void;
|
|
122
152
|
|
|
123
|
-
export { type CreateTargetFn, type EvalConfig, type Expectation, type SuiteConfig, type TestCase, type ToolDef, ai, configureEvals, contains, defineSuite, fromToolSpecs, human, llmJudge, noTools, notContains, respondsInLanguage, toolResult, toolsCalled };
|
|
153
|
+
export { type CreateTargetFn, type EvalConfig, type Expectation, type SuiteConfig, type TestCase, type ToolDef, ai, anyToolCalled, configureEvals, contains, defineSuite, fromToolSpecs, human, llmJudge, noTools, notContains, respondsInLanguage, runEvals, toolResult, toolsCalled };
|
package/dist/eval/index.js
CHANGED
|
@@ -31,6 +31,7 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
|
|
|
31
31
|
var eval_exports = {};
|
|
32
32
|
__export(eval_exports, {
|
|
33
33
|
ai: () => ai,
|
|
34
|
+
anyToolCalled: () => anyToolCalled,
|
|
34
35
|
configureEvals: () => configureEvals,
|
|
35
36
|
contains: () => contains,
|
|
36
37
|
defineSuite: () => defineSuite,
|
|
@@ -40,6 +41,7 @@ __export(eval_exports, {
|
|
|
40
41
|
noTools: () => noTools,
|
|
41
42
|
notContains: () => notContains,
|
|
42
43
|
respondsInLanguage: () => respondsInLanguage,
|
|
44
|
+
runEvals: () => runEvals,
|
|
43
45
|
toolResult: () => toolResult,
|
|
44
46
|
toolsCalled: () => toolsCalled
|
|
45
47
|
});
|
|
@@ -59,6 +61,7 @@ function getEvalConfig() {
|
|
|
59
61
|
|
|
60
62
|
// src/eval/suite.ts
|
|
61
63
|
var ls = __toESM(require("langsmith/vitest"));
|
|
64
|
+
var import_messages2 = require("@langchain/core/messages");
|
|
62
65
|
|
|
63
66
|
// src/eval/target.ts
|
|
64
67
|
var import_tools = require("@langchain/core/tools");
|
|
@@ -246,14 +249,24 @@ function convertToLangchainMessages(messages) {
|
|
|
246
249
|
|
|
247
250
|
// src/eval/target.ts
|
|
248
251
|
var MAX_AGENT_LOOPS = 10;
|
|
252
|
+
function stripReasoningBlocks(message) {
|
|
253
|
+
if (!Array.isArray(message.content)) return message;
|
|
254
|
+
const filtered = message.content.filter(
|
|
255
|
+
(block) => block.type !== "reasoning" && block.type !== "thinking"
|
|
256
|
+
);
|
|
257
|
+
const newContent = filtered.length > 0 ? filtered : "";
|
|
258
|
+
return new import_messages.AIMessage({
|
|
259
|
+
content: newContent,
|
|
260
|
+
tool_calls: message.tool_calls,
|
|
261
|
+
id: message.id,
|
|
262
|
+
response_metadata: message.response_metadata,
|
|
263
|
+
usage_metadata: message.usage_metadata
|
|
264
|
+
});
|
|
265
|
+
}
|
|
249
266
|
function createEvalTarget(modelConfig, modelString) {
|
|
250
267
|
return async (inputs) => {
|
|
251
|
-
const
|
|
252
|
-
|
|
253
|
-
throw new Error("model is required for model-based target. Add it to your configureEvals() call.");
|
|
254
|
-
}
|
|
255
|
-
const resolver = new LangchainModelResolver(config.modelConfig);
|
|
256
|
-
const model = resolver.resolve(config.model);
|
|
268
|
+
const resolver = new LangchainModelResolver(modelConfig);
|
|
269
|
+
const model = resolver.resolve(modelString);
|
|
257
270
|
const toolCallCounts = {};
|
|
258
271
|
const langchainTools = inputs.tools.map((mockTool) => {
|
|
259
272
|
toolCallCounts[mockTool.name] = 0;
|
|
@@ -290,7 +303,7 @@ function createEvalTarget(modelConfig, modelString) {
|
|
|
290
303
|
while (loopCount < MAX_AGENT_LOOPS) {
|
|
291
304
|
loopCount++;
|
|
292
305
|
const response = await boundModel.invoke(messages);
|
|
293
|
-
messages.push(response);
|
|
306
|
+
messages.push(stripReasoningBlocks(response));
|
|
294
307
|
const aiMessage = response;
|
|
295
308
|
if (!aiMessage.tool_calls || aiMessage.tool_calls.length === 0) {
|
|
296
309
|
break;
|
|
@@ -390,9 +403,9 @@ function toolDefsToDefinitions(defs) {
|
|
|
390
403
|
};
|
|
391
404
|
});
|
|
392
405
|
}
|
|
393
|
-
async function runAgentTarget(createTarget, evalMessages, extraToolDefs) {
|
|
406
|
+
async function runAgentTarget(createTarget, model, evalMessages, extraToolDefs) {
|
|
394
407
|
const extraTools = Object.keys(extraToolDefs).length > 0 ? toolDefsToDefinitions(extraToolDefs) : [];
|
|
395
|
-
const agent = await createTarget(extraTools);
|
|
408
|
+
const agent = await createTarget(model, extraTools);
|
|
396
409
|
const result = await agent.run({
|
|
397
410
|
threadId: `eval_${Date.now()}_${Math.random().toString(36).slice(2)}`,
|
|
398
411
|
messages: evalMessages
|
|
@@ -430,13 +443,6 @@ function toMockTools(defs) {
|
|
|
430
443
|
response: typeof def.response === "function" ? def.response : typeof def.response === "string" ? def.response : JSON.stringify(def.response)
|
|
431
444
|
}));
|
|
432
445
|
}
|
|
433
|
-
function toSerializableTools(tools) {
|
|
434
|
-
return tools.map((t) => ({
|
|
435
|
-
...t,
|
|
436
|
-
schema: t.schema instanceof Object && "shape" in t.schema ? "<ZodObject>" : t.schema,
|
|
437
|
-
response: typeof t.response === "function" ? "<function>" : t.response
|
|
438
|
-
}));
|
|
439
|
-
}
|
|
440
446
|
function lastHumanContent(messages) {
|
|
441
447
|
for (let i = messages.length - 1; i >= 0; i--) {
|
|
442
448
|
const msg = messages[i];
|
|
@@ -447,61 +453,80 @@ function lastHumanContent(messages) {
|
|
|
447
453
|
}
|
|
448
454
|
return "";
|
|
449
455
|
}
|
|
450
|
-
function resolveModelTarget(config) {
|
|
456
|
+
function resolveModelTarget(config, model) {
|
|
451
457
|
if (typeof config.target === "function") return config.target;
|
|
452
458
|
const evalConfig = getEvalConfig();
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
}
|
|
456
|
-
const model = typeof config.target === "string" ? config.target : evalConfig.model;
|
|
457
|
-
return createEvalTarget(evalConfig.modelConfig, model);
|
|
459
|
+
const targetModel = typeof config.target === "string" ? config.target : model;
|
|
460
|
+
return createEvalTarget(evalConfig.modelConfig, targetModel);
|
|
458
461
|
}
|
|
459
462
|
function resolveCreateTarget(config) {
|
|
460
463
|
return config.createTarget ?? getEvalConfig().createTarget;
|
|
461
464
|
}
|
|
465
|
+
var _suites = [];
|
|
462
466
|
function defineSuite(name, config) {
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
const
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
referenceOutputs
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
467
|
+
_suites.push({ name, config });
|
|
468
|
+
}
|
|
469
|
+
function runEvals() {
|
|
470
|
+
const evalConfig = getEvalConfig();
|
|
471
|
+
ls.describe(evalConfig.experimentName, () => {
|
|
472
|
+
for (const currentModel of evalConfig.models) {
|
|
473
|
+
for (const { name: suiteName, config } of _suites) {
|
|
474
|
+
const suiteTools = config.tools ?? {};
|
|
475
|
+
const createTarget = config.target ? void 0 : resolveCreateTarget(config);
|
|
476
|
+
const categoryLabel = suiteName.charAt(0).toUpperCase() + suiteName.slice(1);
|
|
477
|
+
const model = typeof config.target === "string" ? config.target : currentModel;
|
|
478
|
+
for (const tc of config.cases) {
|
|
479
|
+
const testName = tc.name ?? lastHumanContent(tc.messages);
|
|
480
|
+
const caseToolDefs = tc.tools ?? suiteTools;
|
|
481
|
+
const tools = toMockTools(caseToolDefs);
|
|
482
|
+
const ctx = { message: lastHumanContent(tc.messages) };
|
|
483
|
+
const resolved = tc.expect.map((exp) => exp(ctx));
|
|
484
|
+
const evaluators = resolved.map((r) => r.evaluator);
|
|
485
|
+
const referenceOutputs = Object.assign({}, ...resolved.map((r) => r.referenceOutputs));
|
|
486
|
+
const fullTestName = `[${categoryLabel}] > ${testName}`;
|
|
487
|
+
ls.test(
|
|
488
|
+
`${fullTestName} (${model})`,
|
|
489
|
+
{
|
|
490
|
+
inputs: {
|
|
491
|
+
name: fullTestName,
|
|
492
|
+
category: categoryLabel,
|
|
493
|
+
model,
|
|
494
|
+
messages: tc.messages
|
|
495
|
+
},
|
|
496
|
+
referenceOutputs
|
|
497
|
+
},
|
|
498
|
+
async ({ referenceOutputs: refOut }) => {
|
|
499
|
+
let output;
|
|
500
|
+
const prepareMessages = tc.prepareMessages ?? config.prepareMessages ?? getEvalConfig().prepareMessages;
|
|
501
|
+
const preparedMessages = prepareMessages ? await prepareMessages(tc.messages) : tc.messages;
|
|
502
|
+
if (createTarget) {
|
|
503
|
+
output = await runAgentTarget(
|
|
504
|
+
createTarget,
|
|
505
|
+
currentModel,
|
|
506
|
+
preparedMessages,
|
|
507
|
+
caseToolDefs
|
|
508
|
+
);
|
|
509
|
+
} else {
|
|
510
|
+
const target = resolveModelTarget(config, currentModel);
|
|
511
|
+
const globalPrompt = getEvalConfig().systemPrompt;
|
|
512
|
+
const systemPrompt = tc.systemPrompt ?? config.systemPrompt ?? globalPrompt;
|
|
513
|
+
output = await target({
|
|
514
|
+
messages: preparedMessages,
|
|
515
|
+
tools,
|
|
516
|
+
...systemPrompt ? { systemPrompt } : {}
|
|
517
|
+
});
|
|
518
|
+
}
|
|
519
|
+
const calledTools = output.messages.filter((m) => m instanceof import_messages2.AIMessage).flatMap((m) => m.tool_calls ?? []).map((tc2) => tc2.name);
|
|
520
|
+
ls.logOutputs({
|
|
521
|
+
tools_called: calledTools.length > 0 ? calledTools.join(" | ") : "none"
|
|
522
|
+
});
|
|
523
|
+
for (const evaluator of evaluators) {
|
|
524
|
+
await evaluator({ outputs: output, referenceOutputs: refOut ?? {} });
|
|
525
|
+
}
|
|
526
|
+
}
|
|
527
|
+
);
|
|
503
528
|
}
|
|
504
|
-
|
|
529
|
+
}
|
|
505
530
|
}
|
|
506
531
|
});
|
|
507
532
|
}
|
|
@@ -511,7 +536,7 @@ var ls2 = __toESM(require("langsmith/vitest"));
|
|
|
511
536
|
var import_agentevals = require("agentevals");
|
|
512
537
|
|
|
513
538
|
// src/eval/evaluators/language.ts
|
|
514
|
-
var
|
|
539
|
+
var import_messages3 = require("@langchain/core/messages");
|
|
515
540
|
function createLanguageEvaluator(modelConfig, model) {
|
|
516
541
|
const resolver = new LangchainModelResolver(modelConfig);
|
|
517
542
|
const judge = resolver.resolve(model);
|
|
@@ -524,7 +549,7 @@ function createLanguageEvaluator(modelConfig, model) {
|
|
|
524
549
|
return { key: "language_match", score: true, comment: "No expected language specified, skipping" };
|
|
525
550
|
}
|
|
526
551
|
const messages = outputs.messages || [];
|
|
527
|
-
const lastAiMessage = [...messages].reverse().find((m) => m instanceof
|
|
552
|
+
const lastAiMessage = [...messages].reverse().find((m) => m instanceof import_messages3.AIMessage);
|
|
528
553
|
if (!lastAiMessage) {
|
|
529
554
|
return { key: "language_match", score: false, comment: "No AI message found in trajectory" };
|
|
530
555
|
}
|
|
@@ -550,7 +575,7 @@ function createLanguageEvaluator(modelConfig, model) {
|
|
|
550
575
|
}
|
|
551
576
|
|
|
552
577
|
// src/eval/evaluators/response-content.ts
|
|
553
|
-
var
|
|
578
|
+
var import_messages4 = require("@langchain/core/messages");
|
|
554
579
|
function createResponseContentEvaluator() {
|
|
555
580
|
return async ({
|
|
556
581
|
outputs,
|
|
@@ -562,7 +587,7 @@ function createResponseContentEvaluator() {
|
|
|
562
587
|
return { key: "response_content", score: true, comment: "No content assertions specified, skipping" };
|
|
563
588
|
}
|
|
564
589
|
const messages = outputs.messages || [];
|
|
565
|
-
const lastAiMessage = [...messages].reverse().find((m) => m instanceof
|
|
590
|
+
const lastAiMessage = [...messages].reverse().find((m) => m instanceof import_messages4.AIMessage);
|
|
566
591
|
if (!lastAiMessage) {
|
|
567
592
|
return { key: "response_content", score: false, comment: "No AI message found in trajectory" };
|
|
568
593
|
}
|
|
@@ -588,7 +613,7 @@ function createResponseContentEvaluator() {
|
|
|
588
613
|
}
|
|
589
614
|
|
|
590
615
|
// src/eval/evaluators/no-tool-calls.ts
|
|
591
|
-
var
|
|
616
|
+
var import_messages5 = require("@langchain/core/messages");
|
|
592
617
|
function createNoToolCallsEvaluator() {
|
|
593
618
|
return async ({
|
|
594
619
|
outputs,
|
|
@@ -598,8 +623,17 @@ function createNoToolCallsEvaluator() {
|
|
|
598
623
|
return { key: "no_tool_calls", score: true, comment: "No tool call restriction specified, skipping" };
|
|
599
624
|
}
|
|
600
625
|
const messages = outputs.messages || [];
|
|
601
|
-
const
|
|
602
|
-
const
|
|
626
|
+
const exceptTools = referenceOutputs?.exceptTools ?? [];
|
|
627
|
+
const toolCalls = messages.filter((m) => m instanceof import_messages5.AIMessage).flatMap((m) => m.tool_calls || []);
|
|
628
|
+
const disallowedCalls = exceptTools.length > 0 ? toolCalls.filter((tc) => !exceptTools.includes(tc.name)) : toolCalls;
|
|
629
|
+
const passed = disallowedCalls.length === 0;
|
|
630
|
+
if (exceptTools.length > 0) {
|
|
631
|
+
return {
|
|
632
|
+
key: "no_tool_calls",
|
|
633
|
+
score: passed,
|
|
634
|
+
comment: passed ? `No disallowed tool calls made (allowed: ${exceptTools.join(", ")})` : `Agent made ${disallowedCalls.length} disallowed tool call(s): ${disallowedCalls.map((tc) => tc.name).join(", ")}`
|
|
635
|
+
};
|
|
636
|
+
}
|
|
603
637
|
return {
|
|
604
638
|
key: "no_tool_calls",
|
|
605
639
|
score: passed,
|
|
@@ -608,6 +642,37 @@ function createNoToolCallsEvaluator() {
|
|
|
608
642
|
};
|
|
609
643
|
}
|
|
610
644
|
|
|
645
|
+
// src/eval/evaluators/any-tool-called.ts
|
|
646
|
+
var import_messages6 = require("@langchain/core/messages");
|
|
647
|
+
function createAnyToolCalledEvaluator() {
|
|
648
|
+
return async ({
|
|
649
|
+
outputs,
|
|
650
|
+
referenceOutputs
|
|
651
|
+
}) => {
|
|
652
|
+
if (referenceOutputs?.expectAnyToolCall !== true) {
|
|
653
|
+
return { key: "any_tool_called", score: true, comment: "No any-tool-call expectation specified, skipping" };
|
|
654
|
+
}
|
|
655
|
+
const expectedTools = referenceOutputs?.anyToolsExpected ?? [];
|
|
656
|
+
const messages = outputs.messages || [];
|
|
657
|
+
const calledToolNames = messages.filter((m) => m instanceof import_messages6.AIMessage).flatMap((m) => m.tool_calls || []).map((tc) => tc.name);
|
|
658
|
+
if (expectedTools.length === 0) {
|
|
659
|
+
const passed2 = calledToolNames.length > 0;
|
|
660
|
+
return {
|
|
661
|
+
key: "any_tool_called",
|
|
662
|
+
score: passed2,
|
|
663
|
+
comment: passed2 ? `Agent called tool(s): ${calledToolNames.join(", ")}` : "Agent made no tool calls (expected at least one)"
|
|
664
|
+
};
|
|
665
|
+
}
|
|
666
|
+
const matchedTools = expectedTools.filter((name) => calledToolNames.includes(name));
|
|
667
|
+
const passed = matchedTools.length > 0;
|
|
668
|
+
return {
|
|
669
|
+
key: "any_tool_called",
|
|
670
|
+
score: passed,
|
|
671
|
+
comment: passed ? `Called expected tool(s): ${matchedTools.join(", ")}` : `None of the expected tools were called (expected one of: ${expectedTools.join(", ")}; actual: ${calledToolNames.length > 0 ? calledToolNames.join(", ") : "none"})`
|
|
672
|
+
};
|
|
673
|
+
};
|
|
674
|
+
}
|
|
675
|
+
|
|
611
676
|
// src/eval/expectations.ts
|
|
612
677
|
function withTrajectoryGuard(evaluator, key) {
|
|
613
678
|
return async ({ outputs, referenceOutputs }) => {
|
|
@@ -659,10 +724,13 @@ function llmJudge() {
|
|
|
659
724
|
};
|
|
660
725
|
};
|
|
661
726
|
}
|
|
662
|
-
function noTools() {
|
|
727
|
+
function noTools(options) {
|
|
663
728
|
return () => ({
|
|
664
729
|
evaluator: ls2.wrapEvaluator(createNoToolCallsEvaluator()),
|
|
665
|
-
referenceOutputs: {
|
|
730
|
+
referenceOutputs: {
|
|
731
|
+
expectNoToolCalls: true,
|
|
732
|
+
...options?.except?.length ? { exceptTools: options.except } : {}
|
|
733
|
+
}
|
|
666
734
|
});
|
|
667
735
|
}
|
|
668
736
|
function respondsInLanguage(code) {
|
|
@@ -675,6 +743,15 @@ function respondsInLanguage(code) {
|
|
|
675
743
|
};
|
|
676
744
|
};
|
|
677
745
|
}
|
|
746
|
+
function anyToolCalled(tools) {
|
|
747
|
+
return () => ({
|
|
748
|
+
evaluator: ls2.wrapEvaluator(createAnyToolCalledEvaluator()),
|
|
749
|
+
referenceOutputs: {
|
|
750
|
+
expectAnyToolCall: true,
|
|
751
|
+
...tools?.length ? { anyToolsExpected: tools } : {}
|
|
752
|
+
}
|
|
753
|
+
});
|
|
754
|
+
}
|
|
678
755
|
function contains(strings) {
|
|
679
756
|
return () => ({
|
|
680
757
|
evaluator: ls2.wrapEvaluator(createResponseContentEvaluator()),
|
|
@@ -690,6 +767,7 @@ function notContains(strings) {
|
|
|
690
767
|
// Annotate the CommonJS export names for ESM import in node:
|
|
691
768
|
0 && (module.exports = {
|
|
692
769
|
ai,
|
|
770
|
+
anyToolCalled,
|
|
693
771
|
configureEvals,
|
|
694
772
|
contains,
|
|
695
773
|
defineSuite,
|
|
@@ -699,6 +777,7 @@ function notContains(strings) {
|
|
|
699
777
|
noTools,
|
|
700
778
|
notContains,
|
|
701
779
|
respondsInLanguage,
|
|
780
|
+
runEvals,
|
|
702
781
|
toolResult,
|
|
703
782
|
toolsCalled
|
|
704
783
|
});
|