@dvina/agents 0.9.2 → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/eval/index.d.mts +34 -4
- package/dist/eval/index.d.ts +34 -4
- package/dist/eval/index.js +121 -55
- package/dist/eval/index.js.map +1 -1
- package/dist/eval/index.mjs +120 -60
- package/dist/eval/index.mjs.map +1 -1
- package/dist/index.d.mts +15 -12
- package/dist/index.d.ts +15 -12
- package/dist/index.js +42 -17
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +42 -17
- package/dist/index.mjs.map +1 -1
- package/dist/{model-resolver-lIpXv0Pc.d.mts → model-resolver-DjKRXKtu.d.mts} +7 -1
- package/dist/{model-resolver-lIpXv0Pc.d.ts → model-resolver-DjKRXKtu.d.ts} +7 -1
- package/package.json +1 -1
package/dist/eval/index.d.mts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { T as ToolDefinition, A as Agent, L as LangchainModelConfig, M as Message,
|
|
1
|
+
import { T as ToolDefinition, A as Agent, L as LangchainModelConfig, M as Message, e as AiMessage, b as ToolSpec, H as HumanMessage, g as ToolMessage } from '../model-resolver-DjKRXKtu.mjs';
|
|
2
2
|
import * as zod from 'zod';
|
|
3
3
|
import { z } from 'zod';
|
|
4
4
|
import { BaseMessage } from '@langchain/core/messages';
|
|
@@ -12,6 +12,8 @@ interface EvalConfig {
|
|
|
12
12
|
model?: string;
|
|
13
13
|
/** Model for evaluators needing LLM calls (language detection, LLM-as-judge). */
|
|
14
14
|
evaluatorModel: string;
|
|
15
|
+
/** LangSmith experiment (dataset) name. All suites share this single experiment for easy comparison across runs. */
|
|
16
|
+
experimentName: string;
|
|
15
17
|
/** System prompt for model-based target. Ignored when createTarget is used. Can be overridden per-suite or per-case. */
|
|
16
18
|
systemPrompt?: string;
|
|
17
19
|
/** Factory that creates a fresh Agent per test case. When set, this is the default target. */
|
|
@@ -57,14 +59,26 @@ declare function toolsCalled(tools: string[]): Expectation;
|
|
|
57
59
|
* Uses the globally configured evaluator model.
|
|
58
60
|
*/
|
|
59
61
|
declare function llmJudge(): Expectation;
|
|
60
|
-
/**
|
|
61
|
-
|
|
62
|
+
/**
|
|
63
|
+
* Assert the agent made zero tool calls.
|
|
64
|
+
* Optionally allow specific tools via `except` — calls to those tools
|
|
65
|
+
* are permitted (but not required), while any other tool call fails.
|
|
66
|
+
*/
|
|
67
|
+
declare function noTools(options?: {
|
|
68
|
+
except: string[];
|
|
69
|
+
}): Expectation;
|
|
62
70
|
/**
|
|
63
71
|
* Assert the response is in the given language (ISO 639-1 code).
|
|
64
72
|
* Uses the globally configured evaluator model for language detection.
|
|
65
73
|
* @param code - ISO 639-1 language code (e.g. 'en', 'tr', 'de').
|
|
66
74
|
*/
|
|
67
75
|
declare function respondsInLanguage(code: string): Expectation;
|
|
76
|
+
/**
|
|
77
|
+
* Assert that at least one tool call was made.
|
|
78
|
+
* When `tools` is provided, at least one of those specific tools must
|
|
79
|
+
* appear in the trajectory. When omitted, any tool call satisfies it.
|
|
80
|
+
*/
|
|
81
|
+
declare function anyToolCalled(tools?: string[]): Expectation;
|
|
68
82
|
/** Assert the response contains all given strings. */
|
|
69
83
|
declare function contains(strings: string[]): Expectation;
|
|
70
84
|
/** Assert the response does not contain any of the given strings. */
|
|
@@ -118,6 +132,22 @@ interface SuiteConfig {
|
|
|
118
132
|
* entry in `responses` default to `''`.
|
|
119
133
|
*/
|
|
120
134
|
declare function fromToolSpecs(specs: ToolSpec[], responses?: Record<string, ToolDef['response']>): Record<string, ToolDef>;
|
|
135
|
+
/**
|
|
136
|
+
* Registers an eval suite. Does not create tests on its own — call
|
|
137
|
+
* `runEvals()` after all suites are registered to emit a single
|
|
138
|
+
* LangSmith experiment containing every test case.
|
|
139
|
+
*/
|
|
121
140
|
declare function defineSuite(name: string, config: SuiteConfig): void;
|
|
141
|
+
/**
|
|
142
|
+
* Emits all registered suites under a single `ls.describe` block so
|
|
143
|
+
* every test case lands in one LangSmith experiment / dataset.
|
|
144
|
+
*
|
|
145
|
+
* Call this once, after importing all suite files.
|
|
146
|
+
*
|
|
147
|
+
* Individual suites are grouped with native `describe` blocks for
|
|
148
|
+
* readability; test names are prefixed with the suite name
|
|
149
|
+
* (e.g. "discovery > should use search tool").
|
|
150
|
+
*/
|
|
151
|
+
declare function runEvals(): void;
|
|
122
152
|
|
|
123
|
-
export { type CreateTargetFn, type EvalConfig, type Expectation, type SuiteConfig, type TestCase, type ToolDef, ai, configureEvals, contains, defineSuite, fromToolSpecs, human, llmJudge, noTools, notContains, respondsInLanguage, toolResult, toolsCalled };
|
|
153
|
+
export { type CreateTargetFn, type EvalConfig, type Expectation, type SuiteConfig, type TestCase, type ToolDef, ai, anyToolCalled, configureEvals, contains, defineSuite, fromToolSpecs, human, llmJudge, noTools, notContains, respondsInLanguage, runEvals, toolResult, toolsCalled };
|
package/dist/eval/index.d.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { T as ToolDefinition, A as Agent, L as LangchainModelConfig, M as Message,
|
|
1
|
+
import { T as ToolDefinition, A as Agent, L as LangchainModelConfig, M as Message, e as AiMessage, b as ToolSpec, H as HumanMessage, g as ToolMessage } from '../model-resolver-DjKRXKtu.js';
|
|
2
2
|
import * as zod from 'zod';
|
|
3
3
|
import { z } from 'zod';
|
|
4
4
|
import { BaseMessage } from '@langchain/core/messages';
|
|
@@ -12,6 +12,8 @@ interface EvalConfig {
|
|
|
12
12
|
model?: string;
|
|
13
13
|
/** Model for evaluators needing LLM calls (language detection, LLM-as-judge). */
|
|
14
14
|
evaluatorModel: string;
|
|
15
|
+
/** LangSmith experiment (dataset) name. All suites share this single experiment for easy comparison across runs. */
|
|
16
|
+
experimentName: string;
|
|
15
17
|
/** System prompt for model-based target. Ignored when createTarget is used. Can be overridden per-suite or per-case. */
|
|
16
18
|
systemPrompt?: string;
|
|
17
19
|
/** Factory that creates a fresh Agent per test case. When set, this is the default target. */
|
|
@@ -57,14 +59,26 @@ declare function toolsCalled(tools: string[]): Expectation;
|
|
|
57
59
|
* Uses the globally configured evaluator model.
|
|
58
60
|
*/
|
|
59
61
|
declare function llmJudge(): Expectation;
|
|
60
|
-
/**
|
|
61
|
-
|
|
62
|
+
/**
|
|
63
|
+
* Assert the agent made zero tool calls.
|
|
64
|
+
* Optionally allow specific tools via `except` — calls to those tools
|
|
65
|
+
* are permitted (but not required), while any other tool call fails.
|
|
66
|
+
*/
|
|
67
|
+
declare function noTools(options?: {
|
|
68
|
+
except: string[];
|
|
69
|
+
}): Expectation;
|
|
62
70
|
/**
|
|
63
71
|
* Assert the response is in the given language (ISO 639-1 code).
|
|
64
72
|
* Uses the globally configured evaluator model for language detection.
|
|
65
73
|
* @param code - ISO 639-1 language code (e.g. 'en', 'tr', 'de').
|
|
66
74
|
*/
|
|
67
75
|
declare function respondsInLanguage(code: string): Expectation;
|
|
76
|
+
/**
|
|
77
|
+
* Assert that at least one tool call was made.
|
|
78
|
+
* When `tools` is provided, at least one of those specific tools must
|
|
79
|
+
* appear in the trajectory. When omitted, any tool call satisfies it.
|
|
80
|
+
*/
|
|
81
|
+
declare function anyToolCalled(tools?: string[]): Expectation;
|
|
68
82
|
/** Assert the response contains all given strings. */
|
|
69
83
|
declare function contains(strings: string[]): Expectation;
|
|
70
84
|
/** Assert the response does not contain any of the given strings. */
|
|
@@ -118,6 +132,22 @@ interface SuiteConfig {
|
|
|
118
132
|
* entry in `responses` default to `''`.
|
|
119
133
|
*/
|
|
120
134
|
declare function fromToolSpecs(specs: ToolSpec[], responses?: Record<string, ToolDef['response']>): Record<string, ToolDef>;
|
|
135
|
+
/**
|
|
136
|
+
* Registers an eval suite. Does not create tests on its own — call
|
|
137
|
+
* `runEvals()` after all suites are registered to emit a single
|
|
138
|
+
* LangSmith experiment containing every test case.
|
|
139
|
+
*/
|
|
121
140
|
declare function defineSuite(name: string, config: SuiteConfig): void;
|
|
141
|
+
/**
|
|
142
|
+
* Emits all registered suites under a single `ls.describe` block so
|
|
143
|
+
* every test case lands in one LangSmith experiment / dataset.
|
|
144
|
+
*
|
|
145
|
+
* Call this once, after importing all suite files.
|
|
146
|
+
*
|
|
147
|
+
* Individual suites are grouped with native `describe` blocks for
|
|
148
|
+
* readability; test names are prefixed with the suite name
|
|
149
|
+
* (e.g. "discovery > should use search tool").
|
|
150
|
+
*/
|
|
151
|
+
declare function runEvals(): void;
|
|
122
152
|
|
|
123
|
-
export { type CreateTargetFn, type EvalConfig, type Expectation, type SuiteConfig, type TestCase, type ToolDef, ai, configureEvals, contains, defineSuite, fromToolSpecs, human, llmJudge, noTools, notContains, respondsInLanguage, toolResult, toolsCalled };
|
|
153
|
+
export { type CreateTargetFn, type EvalConfig, type Expectation, type SuiteConfig, type TestCase, type ToolDef, ai, anyToolCalled, configureEvals, contains, defineSuite, fromToolSpecs, human, llmJudge, noTools, notContains, respondsInLanguage, runEvals, toolResult, toolsCalled };
|
package/dist/eval/index.js
CHANGED
|
@@ -31,6 +31,7 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
|
|
|
31
31
|
var eval_exports = {};
|
|
32
32
|
__export(eval_exports, {
|
|
33
33
|
ai: () => ai,
|
|
34
|
+
anyToolCalled: () => anyToolCalled,
|
|
34
35
|
configureEvals: () => configureEvals,
|
|
35
36
|
contains: () => contains,
|
|
36
37
|
defineSuite: () => defineSuite,
|
|
@@ -40,6 +41,7 @@ __export(eval_exports, {
|
|
|
40
41
|
noTools: () => noTools,
|
|
41
42
|
notContains: () => notContains,
|
|
42
43
|
respondsInLanguage: () => respondsInLanguage,
|
|
44
|
+
runEvals: () => runEvals,
|
|
43
45
|
toolResult: () => toolResult,
|
|
44
46
|
toolsCalled: () => toolsCalled
|
|
45
47
|
});
|
|
@@ -59,6 +61,7 @@ function getEvalConfig() {
|
|
|
59
61
|
|
|
60
62
|
// src/eval/suite.ts
|
|
61
63
|
var ls = __toESM(require("langsmith/vitest"));
|
|
64
|
+
var import_messages2 = require("@langchain/core/messages");
|
|
62
65
|
|
|
63
66
|
// src/eval/target.ts
|
|
64
67
|
var import_tools = require("@langchain/core/tools");
|
|
@@ -430,13 +433,6 @@ function toMockTools(defs) {
|
|
|
430
433
|
response: typeof def.response === "function" ? def.response : typeof def.response === "string" ? def.response : JSON.stringify(def.response)
|
|
431
434
|
}));
|
|
432
435
|
}
|
|
433
|
-
function toSerializableTools(tools) {
|
|
434
|
-
return tools.map((t) => ({
|
|
435
|
-
...t,
|
|
436
|
-
schema: t.schema instanceof Object && "shape" in t.schema ? "<ZodObject>" : t.schema,
|
|
437
|
-
response: typeof t.response === "function" ? "<function>" : t.response
|
|
438
|
-
}));
|
|
439
|
-
}
|
|
440
436
|
function lastHumanContent(messages) {
|
|
441
437
|
for (let i = messages.length - 1; i >= 0; i--) {
|
|
442
438
|
const msg = messages[i];
|
|
@@ -459,49 +455,65 @@ function resolveModelTarget(config) {
|
|
|
459
455
|
function resolveCreateTarget(config) {
|
|
460
456
|
return config.createTarget ?? getEvalConfig().createTarget;
|
|
461
457
|
}
|
|
458
|
+
var _suites = [];
|
|
462
459
|
function defineSuite(name, config) {
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
const
|
|
470
|
-
const
|
|
471
|
-
const
|
|
472
|
-
const
|
|
473
|
-
const
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
460
|
+
_suites.push({ name, config });
|
|
461
|
+
}
|
|
462
|
+
function runEvals() {
|
|
463
|
+
const evalConfig = getEvalConfig();
|
|
464
|
+
ls.describe(evalConfig.experimentName, () => {
|
|
465
|
+
for (const { name: suiteName, config } of _suites) {
|
|
466
|
+
const suiteTools = config.tools ?? {};
|
|
467
|
+
const createTarget = config.target ? void 0 : resolveCreateTarget(config);
|
|
468
|
+
const categoryLabel = suiteName.charAt(0).toUpperCase() + suiteName.slice(1);
|
|
469
|
+
const model = typeof config.target === "string" ? config.target : evalConfig.model ?? "agent";
|
|
470
|
+
for (const tc of config.cases) {
|
|
471
|
+
const testName = tc.name ?? lastHumanContent(tc.messages);
|
|
472
|
+
const caseToolDefs = tc.tools ?? suiteTools;
|
|
473
|
+
const tools = toMockTools(caseToolDefs);
|
|
474
|
+
const ctx = { message: lastHumanContent(tc.messages) };
|
|
475
|
+
const resolved = tc.expect.map((exp) => exp(ctx));
|
|
476
|
+
const evaluators = resolved.map((r) => r.evaluator);
|
|
477
|
+
const referenceOutputs = Object.assign({}, ...resolved.map((r) => r.referenceOutputs));
|
|
478
|
+
const fullTestName = `[${categoryLabel}] > ${testName}`;
|
|
479
|
+
ls.test(
|
|
480
|
+
fullTestName,
|
|
481
|
+
{
|
|
482
|
+
inputs: {
|
|
483
|
+
name: fullTestName,
|
|
484
|
+
category: categoryLabel,
|
|
485
|
+
model,
|
|
486
|
+
tools: tools.map((t) => t.name).join(" | ") || "none",
|
|
487
|
+
messages: tc.messages
|
|
488
|
+
},
|
|
489
|
+
referenceOutputs
|
|
480
490
|
},
|
|
481
|
-
referenceOutputs
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
491
|
+
async ({ referenceOutputs: refOut }) => {
|
|
492
|
+
let output;
|
|
493
|
+
const prepareMessages = tc.prepareMessages ?? config.prepareMessages ?? getEvalConfig().prepareMessages;
|
|
494
|
+
const preparedMessages = prepareMessages ? await prepareMessages(tc.messages) : tc.messages;
|
|
495
|
+
if (createTarget) {
|
|
496
|
+
output = await runAgentTarget(createTarget, preparedMessages, caseToolDefs);
|
|
497
|
+
} else {
|
|
498
|
+
const target = resolveModelTarget(config);
|
|
499
|
+
const globalPrompt = getEvalConfig().systemPrompt;
|
|
500
|
+
const systemPrompt = tc.systemPrompt ?? config.systemPrompt ?? globalPrompt;
|
|
501
|
+
output = await target({
|
|
502
|
+
messages: preparedMessages,
|
|
503
|
+
tools,
|
|
504
|
+
...systemPrompt ? { systemPrompt } : {}
|
|
505
|
+
});
|
|
506
|
+
}
|
|
507
|
+
const calledTools = output.messages.filter((m) => m instanceof import_messages2.AIMessage).flatMap((m) => m.tool_calls ?? []).map((tc2) => tc2.name);
|
|
508
|
+
ls.logOutputs({
|
|
509
|
+
tools_called: calledTools.length > 0 ? calledTools.join(" | ") : "none"
|
|
497
510
|
});
|
|
511
|
+
for (const evaluator of evaluators) {
|
|
512
|
+
await evaluator({ outputs: output, referenceOutputs: refOut ?? {} });
|
|
513
|
+
}
|
|
498
514
|
}
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
await evaluator({ outputs: output, referenceOutputs: refOut ?? {} });
|
|
502
|
-
}
|
|
503
|
-
}
|
|
504
|
-
);
|
|
515
|
+
);
|
|
516
|
+
}
|
|
505
517
|
}
|
|
506
518
|
});
|
|
507
519
|
}
|
|
@@ -511,7 +523,7 @@ var ls2 = __toESM(require("langsmith/vitest"));
|
|
|
511
523
|
var import_agentevals = require("agentevals");
|
|
512
524
|
|
|
513
525
|
// src/eval/evaluators/language.ts
|
|
514
|
-
var
|
|
526
|
+
var import_messages3 = require("@langchain/core/messages");
|
|
515
527
|
function createLanguageEvaluator(modelConfig, model) {
|
|
516
528
|
const resolver = new LangchainModelResolver(modelConfig);
|
|
517
529
|
const judge = resolver.resolve(model);
|
|
@@ -524,7 +536,7 @@ function createLanguageEvaluator(modelConfig, model) {
|
|
|
524
536
|
return { key: "language_match", score: true, comment: "No expected language specified, skipping" };
|
|
525
537
|
}
|
|
526
538
|
const messages = outputs.messages || [];
|
|
527
|
-
const lastAiMessage = [...messages].reverse().find((m) => m instanceof
|
|
539
|
+
const lastAiMessage = [...messages].reverse().find((m) => m instanceof import_messages3.AIMessage);
|
|
528
540
|
if (!lastAiMessage) {
|
|
529
541
|
return { key: "language_match", score: false, comment: "No AI message found in trajectory" };
|
|
530
542
|
}
|
|
@@ -550,7 +562,7 @@ function createLanguageEvaluator(modelConfig, model) {
|
|
|
550
562
|
}
|
|
551
563
|
|
|
552
564
|
// src/eval/evaluators/response-content.ts
|
|
553
|
-
var
|
|
565
|
+
var import_messages4 = require("@langchain/core/messages");
|
|
554
566
|
function createResponseContentEvaluator() {
|
|
555
567
|
return async ({
|
|
556
568
|
outputs,
|
|
@@ -562,7 +574,7 @@ function createResponseContentEvaluator() {
|
|
|
562
574
|
return { key: "response_content", score: true, comment: "No content assertions specified, skipping" };
|
|
563
575
|
}
|
|
564
576
|
const messages = outputs.messages || [];
|
|
565
|
-
const lastAiMessage = [...messages].reverse().find((m) => m instanceof
|
|
577
|
+
const lastAiMessage = [...messages].reverse().find((m) => m instanceof import_messages4.AIMessage);
|
|
566
578
|
if (!lastAiMessage) {
|
|
567
579
|
return { key: "response_content", score: false, comment: "No AI message found in trajectory" };
|
|
568
580
|
}
|
|
@@ -588,7 +600,7 @@ function createResponseContentEvaluator() {
|
|
|
588
600
|
}
|
|
589
601
|
|
|
590
602
|
// src/eval/evaluators/no-tool-calls.ts
|
|
591
|
-
var
|
|
603
|
+
var import_messages5 = require("@langchain/core/messages");
|
|
592
604
|
function createNoToolCallsEvaluator() {
|
|
593
605
|
return async ({
|
|
594
606
|
outputs,
|
|
@@ -598,8 +610,17 @@ function createNoToolCallsEvaluator() {
|
|
|
598
610
|
return { key: "no_tool_calls", score: true, comment: "No tool call restriction specified, skipping" };
|
|
599
611
|
}
|
|
600
612
|
const messages = outputs.messages || [];
|
|
601
|
-
const
|
|
602
|
-
const
|
|
613
|
+
const exceptTools = referenceOutputs?.exceptTools ?? [];
|
|
614
|
+
const toolCalls = messages.filter((m) => m instanceof import_messages5.AIMessage).flatMap((m) => m.tool_calls || []);
|
|
615
|
+
const disallowedCalls = exceptTools.length > 0 ? toolCalls.filter((tc) => !exceptTools.includes(tc.name)) : toolCalls;
|
|
616
|
+
const passed = disallowedCalls.length === 0;
|
|
617
|
+
if (exceptTools.length > 0) {
|
|
618
|
+
return {
|
|
619
|
+
key: "no_tool_calls",
|
|
620
|
+
score: passed,
|
|
621
|
+
comment: passed ? `No disallowed tool calls made (allowed: ${exceptTools.join(", ")})` : `Agent made ${disallowedCalls.length} disallowed tool call(s): ${disallowedCalls.map((tc) => tc.name).join(", ")}`
|
|
622
|
+
};
|
|
623
|
+
}
|
|
603
624
|
return {
|
|
604
625
|
key: "no_tool_calls",
|
|
605
626
|
score: passed,
|
|
@@ -608,6 +629,37 @@ function createNoToolCallsEvaluator() {
|
|
|
608
629
|
};
|
|
609
630
|
}
|
|
610
631
|
|
|
632
|
+
// src/eval/evaluators/any-tool-called.ts
|
|
633
|
+
var import_messages6 = require("@langchain/core/messages");
|
|
634
|
+
function createAnyToolCalledEvaluator() {
|
|
635
|
+
return async ({
|
|
636
|
+
outputs,
|
|
637
|
+
referenceOutputs
|
|
638
|
+
}) => {
|
|
639
|
+
if (referenceOutputs?.expectAnyToolCall !== true) {
|
|
640
|
+
return { key: "any_tool_called", score: true, comment: "No any-tool-call expectation specified, skipping" };
|
|
641
|
+
}
|
|
642
|
+
const expectedTools = referenceOutputs?.anyToolsExpected ?? [];
|
|
643
|
+
const messages = outputs.messages || [];
|
|
644
|
+
const calledToolNames = messages.filter((m) => m instanceof import_messages6.AIMessage).flatMap((m) => m.tool_calls || []).map((tc) => tc.name);
|
|
645
|
+
if (expectedTools.length === 0) {
|
|
646
|
+
const passed2 = calledToolNames.length > 0;
|
|
647
|
+
return {
|
|
648
|
+
key: "any_tool_called",
|
|
649
|
+
score: passed2,
|
|
650
|
+
comment: passed2 ? `Agent called tool(s): ${calledToolNames.join(", ")}` : "Agent made no tool calls (expected at least one)"
|
|
651
|
+
};
|
|
652
|
+
}
|
|
653
|
+
const matchedTools = expectedTools.filter((name) => calledToolNames.includes(name));
|
|
654
|
+
const passed = matchedTools.length > 0;
|
|
655
|
+
return {
|
|
656
|
+
key: "any_tool_called",
|
|
657
|
+
score: passed,
|
|
658
|
+
comment: passed ? `Called expected tool(s): ${matchedTools.join(", ")}` : `None of the expected tools were called (expected one of: ${expectedTools.join(", ")}; actual: ${calledToolNames.length > 0 ? calledToolNames.join(", ") : "none"})`
|
|
659
|
+
};
|
|
660
|
+
};
|
|
661
|
+
}
|
|
662
|
+
|
|
611
663
|
// src/eval/expectations.ts
|
|
612
664
|
function withTrajectoryGuard(evaluator, key) {
|
|
613
665
|
return async ({ outputs, referenceOutputs }) => {
|
|
@@ -659,10 +711,13 @@ function llmJudge() {
|
|
|
659
711
|
};
|
|
660
712
|
};
|
|
661
713
|
}
|
|
662
|
-
function noTools() {
|
|
714
|
+
function noTools(options) {
|
|
663
715
|
return () => ({
|
|
664
716
|
evaluator: ls2.wrapEvaluator(createNoToolCallsEvaluator()),
|
|
665
|
-
referenceOutputs: {
|
|
717
|
+
referenceOutputs: {
|
|
718
|
+
expectNoToolCalls: true,
|
|
719
|
+
...options?.except?.length ? { exceptTools: options.except } : {}
|
|
720
|
+
}
|
|
666
721
|
});
|
|
667
722
|
}
|
|
668
723
|
function respondsInLanguage(code) {
|
|
@@ -675,6 +730,15 @@ function respondsInLanguage(code) {
|
|
|
675
730
|
};
|
|
676
731
|
};
|
|
677
732
|
}
|
|
733
|
+
function anyToolCalled(tools) {
|
|
734
|
+
return () => ({
|
|
735
|
+
evaluator: ls2.wrapEvaluator(createAnyToolCalledEvaluator()),
|
|
736
|
+
referenceOutputs: {
|
|
737
|
+
expectAnyToolCall: true,
|
|
738
|
+
...tools?.length ? { anyToolsExpected: tools } : {}
|
|
739
|
+
}
|
|
740
|
+
});
|
|
741
|
+
}
|
|
678
742
|
function contains(strings) {
|
|
679
743
|
return () => ({
|
|
680
744
|
evaluator: ls2.wrapEvaluator(createResponseContentEvaluator()),
|
|
@@ -690,6 +754,7 @@ function notContains(strings) {
|
|
|
690
754
|
// Annotate the CommonJS export names for ESM import in node:
|
|
691
755
|
0 && (module.exports = {
|
|
692
756
|
ai,
|
|
757
|
+
anyToolCalled,
|
|
693
758
|
configureEvals,
|
|
694
759
|
contains,
|
|
695
760
|
defineSuite,
|
|
@@ -699,6 +764,7 @@ function notContains(strings) {
|
|
|
699
764
|
noTools,
|
|
700
765
|
notContains,
|
|
701
766
|
respondsInLanguage,
|
|
767
|
+
runEvals,
|
|
702
768
|
toolResult,
|
|
703
769
|
toolsCalled
|
|
704
770
|
});
|