@dvina/agents 0.10.0 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,20 +1,22 @@
1
- import { T as ToolDefinition, A as Agent, L as LangchainModelConfig, M as Message, d as AiMessage, g as ToolSpec, H as HumanMessage, f as ToolMessage } from '../model-resolver-BZtVieXE.mjs';
1
+ import { T as ToolDefinition, A as Agent, L as LangchainModelConfig, M as Message, e as AiMessage, b as ToolSpec, H as HumanMessage, g as ToolMessage } from '../model-resolver-DjKRXKtu.mjs';
2
2
  import * as zod from 'zod';
3
3
  import { z } from 'zod';
4
4
  import { BaseMessage } from '@langchain/core/messages';
5
5
 
6
- /** Factory that creates a fresh Agent per test case. Receives extra suite-level tools as ToolDefinition[]. */
7
- type CreateTargetFn = (extraTools: ToolDefinition[]) => Agent | Promise<Agent>;
6
+ /** Factory that creates a fresh Agent per test case. Receives the model string and extra suite-level tools. */
7
+ type CreateTargetFn = (model: string, extraTools: ToolDefinition[]) => Agent | Promise<Agent>;
8
8
  interface EvalConfig {
9
9
  /** Required for model-based target and LLM evaluators (respondsInLanguage, llmJudge). */
10
10
  modelConfig: LangchainModelConfig;
11
- /** Required for model-based target. Also used as fallback for evaluatorModel. */
12
- model?: string;
11
+ /** Models to evaluate. Every registered suite is run once per model. */
12
+ models: string[];
13
13
  /** Model for evaluators needing LLM calls (language detection, LLM-as-judge). */
14
14
  evaluatorModel: string;
15
+ /** LangSmith experiment (dataset) name. All suites share this single experiment for easy comparison across runs. */
16
+ experimentName: string;
15
17
  /** System prompt for model-based target. Ignored when createTarget is used. Can be overridden per-suite or per-case. */
16
18
  systemPrompt?: string;
17
- /** Factory that creates a fresh Agent per test case. When set, this is the default target. */
19
+ /** Factory that creates a fresh Agent per test case. Receives the current model string from the models array. */
18
20
  createTarget?: CreateTargetFn;
19
21
  /** Transforms test case messages before sending to target. Simulates production preprocessing (e.g., message enrichment). */
20
22
  prepareMessages?: (messages: Message[]) => Message[] | Promise<Message[]>;
@@ -57,14 +59,26 @@ declare function toolsCalled(tools: string[]): Expectation;
57
59
  * Uses the globally configured evaluator model.
58
60
  */
59
61
  declare function llmJudge(): Expectation;
60
- /** Assert the agent made zero tool calls. */
61
- declare function noTools(): Expectation;
62
+ /**
63
+ * Assert the agent made zero tool calls.
64
+ * Optionally allow specific tools via `except` — calls to those tools
65
+ * are permitted (but not required), while any other tool call fails.
66
+ */
67
+ declare function noTools(options?: {
68
+ except: string[];
69
+ }): Expectation;
62
70
  /**
63
71
  * Assert the response is in the given language (ISO 639-1 code).
64
72
  * Uses the globally configured evaluator model for language detection.
65
73
  * @param code - ISO 639-1 language code (e.g. 'en', 'tr', 'de').
66
74
  */
67
75
  declare function respondsInLanguage(code: string): Expectation;
76
+ /**
77
+ * Assert that at least one tool call was made.
78
+ * When `tools` is provided, at least one of those specific tools must
79
+ * appear in the trajectory. When omitted, any tool call satisfies it.
80
+ */
81
+ declare function anyToolCalled(tools?: string[]): Expectation;
68
82
  /** Assert the response contains all given strings. */
69
83
  declare function contains(strings: string[]): Expectation;
70
84
  /** Assert the response does not contain any of the given strings. */
@@ -118,6 +132,22 @@ interface SuiteConfig {
118
132
  * entry in `responses` default to `''`.
119
133
  */
120
134
  declare function fromToolSpecs(specs: ToolSpec[], responses?: Record<string, ToolDef['response']>): Record<string, ToolDef>;
135
+ /**
136
+ * Registers an eval suite. Does not create tests on its own — call
137
+ * `runEvals()` after all suites are registered to emit a single
138
+ * LangSmith experiment containing every test case.
139
+ */
121
140
  declare function defineSuite(name: string, config: SuiteConfig): void;
141
+ /**
142
+ * Emits all registered suites under a single `ls.describe` block so
143
+ * every test case lands in one LangSmith experiment / dataset.
144
+ *
145
+ * Call this once, after importing all suite files.
146
+ *
147
+ * Individual suites are grouped with native `describe` blocks for
148
+ * readability; test names are prefixed with the suite name
149
+ * (e.g. "discovery > should use search tool").
150
+ */
151
+ declare function runEvals(): void;
122
152
 
123
- export { type CreateTargetFn, type EvalConfig, type Expectation, type SuiteConfig, type TestCase, type ToolDef, ai, configureEvals, contains, defineSuite, fromToolSpecs, human, llmJudge, noTools, notContains, respondsInLanguage, toolResult, toolsCalled };
153
+ export { type CreateTargetFn, type EvalConfig, type Expectation, type SuiteConfig, type TestCase, type ToolDef, ai, anyToolCalled, configureEvals, contains, defineSuite, fromToolSpecs, human, llmJudge, noTools, notContains, respondsInLanguage, runEvals, toolResult, toolsCalled };
@@ -1,20 +1,22 @@
1
- import { T as ToolDefinition, A as Agent, L as LangchainModelConfig, M as Message, d as AiMessage, g as ToolSpec, H as HumanMessage, f as ToolMessage } from '../model-resolver-BZtVieXE.js';
1
+ import { T as ToolDefinition, A as Agent, L as LangchainModelConfig, M as Message, e as AiMessage, b as ToolSpec, H as HumanMessage, g as ToolMessage } from '../model-resolver-DjKRXKtu.js';
2
2
  import * as zod from 'zod';
3
3
  import { z } from 'zod';
4
4
  import { BaseMessage } from '@langchain/core/messages';
5
5
 
6
- /** Factory that creates a fresh Agent per test case. Receives extra suite-level tools as ToolDefinition[]. */
7
- type CreateTargetFn = (extraTools: ToolDefinition[]) => Agent | Promise<Agent>;
6
+ /** Factory that creates a fresh Agent per test case. Receives the model string and extra suite-level tools. */
7
+ type CreateTargetFn = (model: string, extraTools: ToolDefinition[]) => Agent | Promise<Agent>;
8
8
  interface EvalConfig {
9
9
  /** Required for model-based target and LLM evaluators (respondsInLanguage, llmJudge). */
10
10
  modelConfig: LangchainModelConfig;
11
- /** Required for model-based target. Also used as fallback for evaluatorModel. */
12
- model?: string;
11
+ /** Models to evaluate. Every registered suite is run once per model. */
12
+ models: string[];
13
13
  /** Model for evaluators needing LLM calls (language detection, LLM-as-judge). */
14
14
  evaluatorModel: string;
15
+ /** LangSmith experiment (dataset) name. All suites share this single experiment for easy comparison across runs. */
16
+ experimentName: string;
15
17
  /** System prompt for model-based target. Ignored when createTarget is used. Can be overridden per-suite or per-case. */
16
18
  systemPrompt?: string;
17
- /** Factory that creates a fresh Agent per test case. When set, this is the default target. */
19
+ /** Factory that creates a fresh Agent per test case. Receives the current model string from the models array. */
18
20
  createTarget?: CreateTargetFn;
19
21
  /** Transforms test case messages before sending to target. Simulates production preprocessing (e.g., message enrichment). */
20
22
  prepareMessages?: (messages: Message[]) => Message[] | Promise<Message[]>;
@@ -57,14 +59,26 @@ declare function toolsCalled(tools: string[]): Expectation;
57
59
  * Uses the globally configured evaluator model.
58
60
  */
59
61
  declare function llmJudge(): Expectation;
60
- /** Assert the agent made zero tool calls. */
61
- declare function noTools(): Expectation;
62
+ /**
63
+ * Assert the agent made zero tool calls.
64
+ * Optionally allow specific tools via `except` — calls to those tools
65
+ * are permitted (but not required), while any other tool call fails.
66
+ */
67
+ declare function noTools(options?: {
68
+ except: string[];
69
+ }): Expectation;
62
70
  /**
63
71
  * Assert the response is in the given language (ISO 639-1 code).
64
72
  * Uses the globally configured evaluator model for language detection.
65
73
  * @param code - ISO 639-1 language code (e.g. 'en', 'tr', 'de').
66
74
  */
67
75
  declare function respondsInLanguage(code: string): Expectation;
76
+ /**
77
+ * Assert that at least one tool call was made.
78
+ * When `tools` is provided, at least one of those specific tools must
79
+ * appear in the trajectory. When omitted, any tool call satisfies it.
80
+ */
81
+ declare function anyToolCalled(tools?: string[]): Expectation;
68
82
  /** Assert the response contains all given strings. */
69
83
  declare function contains(strings: string[]): Expectation;
70
84
  /** Assert the response does not contain any of the given strings. */
@@ -118,6 +132,22 @@ interface SuiteConfig {
118
132
  * entry in `responses` default to `''`.
119
133
  */
120
134
  declare function fromToolSpecs(specs: ToolSpec[], responses?: Record<string, ToolDef['response']>): Record<string, ToolDef>;
135
+ /**
136
+ * Registers an eval suite. Does not create tests on its own — call
137
+ * `runEvals()` after all suites are registered to emit a single
138
+ * LangSmith experiment containing every test case.
139
+ */
121
140
  declare function defineSuite(name: string, config: SuiteConfig): void;
141
+ /**
142
+ * Emits all registered suites under a single `ls.describe` block so
143
+ * every test case lands in one LangSmith experiment / dataset.
144
+ *
145
+ * Call this once, after importing all suite files.
146
+ *
147
+ * Individual suites are grouped with native `describe` blocks for
148
+ * readability; test names are prefixed with the suite name
149
+ * (e.g. "discovery > should use search tool").
150
+ */
151
+ declare function runEvals(): void;
122
152
 
123
- export { type CreateTargetFn, type EvalConfig, type Expectation, type SuiteConfig, type TestCase, type ToolDef, ai, configureEvals, contains, defineSuite, fromToolSpecs, human, llmJudge, noTools, notContains, respondsInLanguage, toolResult, toolsCalled };
153
+ export { type CreateTargetFn, type EvalConfig, type Expectation, type SuiteConfig, type TestCase, type ToolDef, ai, anyToolCalled, configureEvals, contains, defineSuite, fromToolSpecs, human, llmJudge, noTools, notContains, respondsInLanguage, runEvals, toolResult, toolsCalled };
@@ -31,6 +31,7 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
31
31
  var eval_exports = {};
32
32
  __export(eval_exports, {
33
33
  ai: () => ai,
34
+ anyToolCalled: () => anyToolCalled,
34
35
  configureEvals: () => configureEvals,
35
36
  contains: () => contains,
36
37
  defineSuite: () => defineSuite,
@@ -40,6 +41,7 @@ __export(eval_exports, {
40
41
  noTools: () => noTools,
41
42
  notContains: () => notContains,
42
43
  respondsInLanguage: () => respondsInLanguage,
44
+ runEvals: () => runEvals,
43
45
  toolResult: () => toolResult,
44
46
  toolsCalled: () => toolsCalled
45
47
  });
@@ -59,6 +61,7 @@ function getEvalConfig() {
59
61
 
60
62
  // src/eval/suite.ts
61
63
  var ls = __toESM(require("langsmith/vitest"));
64
+ var import_messages2 = require("@langchain/core/messages");
62
65
 
63
66
  // src/eval/target.ts
64
67
  var import_tools = require("@langchain/core/tools");
@@ -246,14 +249,24 @@ function convertToLangchainMessages(messages) {
246
249
 
247
250
  // src/eval/target.ts
248
251
  var MAX_AGENT_LOOPS = 10;
252
+ function stripReasoningBlocks(message) {
253
+ if (!Array.isArray(message.content)) return message;
254
+ const filtered = message.content.filter(
255
+ (block) => block.type !== "reasoning" && block.type !== "thinking"
256
+ );
257
+ const newContent = filtered.length > 0 ? filtered : "";
258
+ return new import_messages.AIMessage({
259
+ content: newContent,
260
+ tool_calls: message.tool_calls,
261
+ id: message.id,
262
+ response_metadata: message.response_metadata,
263
+ usage_metadata: message.usage_metadata
264
+ });
265
+ }
249
266
  function createEvalTarget(modelConfig, modelString) {
250
267
  return async (inputs) => {
251
- const config = modelConfig && modelString ? { modelConfig, model: modelString } : getEvalConfig();
252
- if (!config.model) {
253
- throw new Error("model is required for model-based target. Add it to your configureEvals() call.");
254
- }
255
- const resolver = new LangchainModelResolver(config.modelConfig);
256
- const model = resolver.resolve(config.model);
268
+ const resolver = new LangchainModelResolver(modelConfig);
269
+ const model = resolver.resolve(modelString);
257
270
  const toolCallCounts = {};
258
271
  const langchainTools = inputs.tools.map((mockTool) => {
259
272
  toolCallCounts[mockTool.name] = 0;
@@ -290,7 +303,7 @@ function createEvalTarget(modelConfig, modelString) {
290
303
  while (loopCount < MAX_AGENT_LOOPS) {
291
304
  loopCount++;
292
305
  const response = await boundModel.invoke(messages);
293
- messages.push(response);
306
+ messages.push(stripReasoningBlocks(response));
294
307
  const aiMessage = response;
295
308
  if (!aiMessage.tool_calls || aiMessage.tool_calls.length === 0) {
296
309
  break;
@@ -390,9 +403,9 @@ function toolDefsToDefinitions(defs) {
390
403
  };
391
404
  });
392
405
  }
393
- async function runAgentTarget(createTarget, evalMessages, extraToolDefs) {
406
+ async function runAgentTarget(createTarget, model, evalMessages, extraToolDefs) {
394
407
  const extraTools = Object.keys(extraToolDefs).length > 0 ? toolDefsToDefinitions(extraToolDefs) : [];
395
- const agent = await createTarget(extraTools);
408
+ const agent = await createTarget(model, extraTools);
396
409
  const result = await agent.run({
397
410
  threadId: `eval_${Date.now()}_${Math.random().toString(36).slice(2)}`,
398
411
  messages: evalMessages
@@ -430,13 +443,6 @@ function toMockTools(defs) {
430
443
  response: typeof def.response === "function" ? def.response : typeof def.response === "string" ? def.response : JSON.stringify(def.response)
431
444
  }));
432
445
  }
433
- function toSerializableTools(tools) {
434
- return tools.map((t) => ({
435
- ...t,
436
- schema: t.schema instanceof Object && "shape" in t.schema ? "<ZodObject>" : t.schema,
437
- response: typeof t.response === "function" ? "<function>" : t.response
438
- }));
439
- }
440
446
  function lastHumanContent(messages) {
441
447
  for (let i = messages.length - 1; i >= 0; i--) {
442
448
  const msg = messages[i];
@@ -447,61 +453,80 @@ function lastHumanContent(messages) {
447
453
  }
448
454
  return "";
449
455
  }
450
- function resolveModelTarget(config) {
456
+ function resolveModelTarget(config, model) {
451
457
  if (typeof config.target === "function") return config.target;
452
458
  const evalConfig = getEvalConfig();
453
- if (!evalConfig.model && typeof config.target !== "string") {
454
- throw new Error("model is required for model-based target. Add it to your configureEvals() call.");
455
- }
456
- const model = typeof config.target === "string" ? config.target : evalConfig.model;
457
- return createEvalTarget(evalConfig.modelConfig, model);
459
+ const targetModel = typeof config.target === "string" ? config.target : model;
460
+ return createEvalTarget(evalConfig.modelConfig, targetModel);
458
461
  }
459
462
  function resolveCreateTarget(config) {
460
463
  return config.createTarget ?? getEvalConfig().createTarget;
461
464
  }
465
+ var _suites = [];
462
466
  function defineSuite(name, config) {
463
- const suiteTools = config.tools ?? {};
464
- const createTarget = config.target ? void 0 : resolveCreateTarget(config);
465
- ls.describe(name, () => {
466
- for (const tc of config.cases) {
467
- const testName = tc.name ?? lastHumanContent(tc.messages);
468
- const caseToolDefs = tc.tools ?? suiteTools;
469
- const tools = toMockTools(caseToolDefs);
470
- const ctx = { message: lastHumanContent(tc.messages) };
471
- const resolved = tc.expect.map((exp) => exp(ctx));
472
- const evaluators = resolved.map((r) => r.evaluator);
473
- const referenceOutputs = Object.assign({}, ...resolved.map((r) => r.referenceOutputs));
474
- ls.test(
475
- testName,
476
- {
477
- inputs: {
478
- messages: tc.messages,
479
- tools: toSerializableTools(tools)
480
- },
481
- referenceOutputs
482
- },
483
- async ({ referenceOutputs: refOut }) => {
484
- let output;
485
- const prepareMessages = tc.prepareMessages ?? config.prepareMessages ?? getEvalConfig().prepareMessages;
486
- const preparedMessages = prepareMessages ? await prepareMessages(tc.messages) : tc.messages;
487
- if (createTarget) {
488
- output = await runAgentTarget(createTarget, preparedMessages, caseToolDefs);
489
- } else {
490
- const target = resolveModelTarget(config);
491
- const globalPrompt = getEvalConfig().systemPrompt;
492
- const systemPrompt = tc.systemPrompt ?? config.systemPrompt ?? globalPrompt;
493
- output = await target({
494
- messages: preparedMessages,
495
- tools,
496
- ...systemPrompt ? { systemPrompt } : {}
497
- });
498
- }
499
- ls.logOutputs(output);
500
- for (const evaluator of evaluators) {
501
- await evaluator({ outputs: output, referenceOutputs: refOut ?? {} });
502
- }
467
+ _suites.push({ name, config });
468
+ }
469
+ function runEvals() {
470
+ const evalConfig = getEvalConfig();
471
+ ls.describe(evalConfig.experimentName, () => {
472
+ for (const currentModel of evalConfig.models) {
473
+ for (const { name: suiteName, config } of _suites) {
474
+ const suiteTools = config.tools ?? {};
475
+ const createTarget = config.target ? void 0 : resolveCreateTarget(config);
476
+ const categoryLabel = suiteName.charAt(0).toUpperCase() + suiteName.slice(1);
477
+ const model = typeof config.target === "string" ? config.target : currentModel;
478
+ for (const tc of config.cases) {
479
+ const testName = tc.name ?? lastHumanContent(tc.messages);
480
+ const caseToolDefs = tc.tools ?? suiteTools;
481
+ const tools = toMockTools(caseToolDefs);
482
+ const ctx = { message: lastHumanContent(tc.messages) };
483
+ const resolved = tc.expect.map((exp) => exp(ctx));
484
+ const evaluators = resolved.map((r) => r.evaluator);
485
+ const referenceOutputs = Object.assign({}, ...resolved.map((r) => r.referenceOutputs));
486
+ const fullTestName = `[${categoryLabel}] > ${testName}`;
487
+ ls.test(
488
+ `${fullTestName} (${model})`,
489
+ {
490
+ inputs: {
491
+ name: fullTestName,
492
+ category: categoryLabel,
493
+ model,
494
+ messages: tc.messages
495
+ },
496
+ referenceOutputs
497
+ },
498
+ async ({ referenceOutputs: refOut }) => {
499
+ let output;
500
+ const prepareMessages = tc.prepareMessages ?? config.prepareMessages ?? getEvalConfig().prepareMessages;
501
+ const preparedMessages = prepareMessages ? await prepareMessages(tc.messages) : tc.messages;
502
+ if (createTarget) {
503
+ output = await runAgentTarget(
504
+ createTarget,
505
+ currentModel,
506
+ preparedMessages,
507
+ caseToolDefs
508
+ );
509
+ } else {
510
+ const target = resolveModelTarget(config, currentModel);
511
+ const globalPrompt = getEvalConfig().systemPrompt;
512
+ const systemPrompt = tc.systemPrompt ?? config.systemPrompt ?? globalPrompt;
513
+ output = await target({
514
+ messages: preparedMessages,
515
+ tools,
516
+ ...systemPrompt ? { systemPrompt } : {}
517
+ });
518
+ }
519
+ const calledTools = output.messages.filter((m) => m instanceof import_messages2.AIMessage).flatMap((m) => m.tool_calls ?? []).map((tc2) => tc2.name);
520
+ ls.logOutputs({
521
+ tools_called: calledTools.length > 0 ? calledTools.join(" | ") : "none"
522
+ });
523
+ for (const evaluator of evaluators) {
524
+ await evaluator({ outputs: output, referenceOutputs: refOut ?? {} });
525
+ }
526
+ }
527
+ );
503
528
  }
504
- );
529
+ }
505
530
  }
506
531
  });
507
532
  }
@@ -511,7 +536,7 @@ var ls2 = __toESM(require("langsmith/vitest"));
511
536
  var import_agentevals = require("agentevals");
512
537
 
513
538
  // src/eval/evaluators/language.ts
514
- var import_messages2 = require("@langchain/core/messages");
539
+ var import_messages3 = require("@langchain/core/messages");
515
540
  function createLanguageEvaluator(modelConfig, model) {
516
541
  const resolver = new LangchainModelResolver(modelConfig);
517
542
  const judge = resolver.resolve(model);
@@ -524,7 +549,7 @@ function createLanguageEvaluator(modelConfig, model) {
524
549
  return { key: "language_match", score: true, comment: "No expected language specified, skipping" };
525
550
  }
526
551
  const messages = outputs.messages || [];
527
- const lastAiMessage = [...messages].reverse().find((m) => m instanceof import_messages2.AIMessage);
552
+ const lastAiMessage = [...messages].reverse().find((m) => m instanceof import_messages3.AIMessage);
528
553
  if (!lastAiMessage) {
529
554
  return { key: "language_match", score: false, comment: "No AI message found in trajectory" };
530
555
  }
@@ -550,7 +575,7 @@ function createLanguageEvaluator(modelConfig, model) {
550
575
  }
551
576
 
552
577
  // src/eval/evaluators/response-content.ts
553
- var import_messages3 = require("@langchain/core/messages");
578
+ var import_messages4 = require("@langchain/core/messages");
554
579
  function createResponseContentEvaluator() {
555
580
  return async ({
556
581
  outputs,
@@ -562,7 +587,7 @@ function createResponseContentEvaluator() {
562
587
  return { key: "response_content", score: true, comment: "No content assertions specified, skipping" };
563
588
  }
564
589
  const messages = outputs.messages || [];
565
- const lastAiMessage = [...messages].reverse().find((m) => m instanceof import_messages3.AIMessage);
590
+ const lastAiMessage = [...messages].reverse().find((m) => m instanceof import_messages4.AIMessage);
566
591
  if (!lastAiMessage) {
567
592
  return { key: "response_content", score: false, comment: "No AI message found in trajectory" };
568
593
  }
@@ -588,7 +613,7 @@ function createResponseContentEvaluator() {
588
613
  }
589
614
 
590
615
  // src/eval/evaluators/no-tool-calls.ts
591
- var import_messages4 = require("@langchain/core/messages");
616
+ var import_messages5 = require("@langchain/core/messages");
592
617
  function createNoToolCallsEvaluator() {
593
618
  return async ({
594
619
  outputs,
@@ -598,8 +623,17 @@ function createNoToolCallsEvaluator() {
598
623
  return { key: "no_tool_calls", score: true, comment: "No tool call restriction specified, skipping" };
599
624
  }
600
625
  const messages = outputs.messages || [];
601
- const toolCalls = messages.filter((m) => m instanceof import_messages4.AIMessage).flatMap((m) => m.tool_calls || []);
602
- const passed = toolCalls.length === 0;
626
+ const exceptTools = referenceOutputs?.exceptTools ?? [];
627
+ const toolCalls = messages.filter((m) => m instanceof import_messages5.AIMessage).flatMap((m) => m.tool_calls || []);
628
+ const disallowedCalls = exceptTools.length > 0 ? toolCalls.filter((tc) => !exceptTools.includes(tc.name)) : toolCalls;
629
+ const passed = disallowedCalls.length === 0;
630
+ if (exceptTools.length > 0) {
631
+ return {
632
+ key: "no_tool_calls",
633
+ score: passed,
634
+ comment: passed ? `No disallowed tool calls made (allowed: ${exceptTools.join(", ")})` : `Agent made ${disallowedCalls.length} disallowed tool call(s): ${disallowedCalls.map((tc) => tc.name).join(", ")}`
635
+ };
636
+ }
603
637
  return {
604
638
  key: "no_tool_calls",
605
639
  score: passed,
@@ -608,6 +642,37 @@ function createNoToolCallsEvaluator() {
608
642
  };
609
643
  }
610
644
 
645
+ // src/eval/evaluators/any-tool-called.ts
646
+ var import_messages6 = require("@langchain/core/messages");
647
+ function createAnyToolCalledEvaluator() {
648
+ return async ({
649
+ outputs,
650
+ referenceOutputs
651
+ }) => {
652
+ if (referenceOutputs?.expectAnyToolCall !== true) {
653
+ return { key: "any_tool_called", score: true, comment: "No any-tool-call expectation specified, skipping" };
654
+ }
655
+ const expectedTools = referenceOutputs?.anyToolsExpected ?? [];
656
+ const messages = outputs.messages || [];
657
+ const calledToolNames = messages.filter((m) => m instanceof import_messages6.AIMessage).flatMap((m) => m.tool_calls || []).map((tc) => tc.name);
658
+ if (expectedTools.length === 0) {
659
+ const passed2 = calledToolNames.length > 0;
660
+ return {
661
+ key: "any_tool_called",
662
+ score: passed2,
663
+ comment: passed2 ? `Agent called tool(s): ${calledToolNames.join(", ")}` : "Agent made no tool calls (expected at least one)"
664
+ };
665
+ }
666
+ const matchedTools = expectedTools.filter((name) => calledToolNames.includes(name));
667
+ const passed = matchedTools.length > 0;
668
+ return {
669
+ key: "any_tool_called",
670
+ score: passed,
671
+ comment: passed ? `Called expected tool(s): ${matchedTools.join(", ")}` : `None of the expected tools were called (expected one of: ${expectedTools.join(", ")}; actual: ${calledToolNames.length > 0 ? calledToolNames.join(", ") : "none"})`
672
+ };
673
+ };
674
+ }
675
+
611
676
  // src/eval/expectations.ts
612
677
  function withTrajectoryGuard(evaluator, key) {
613
678
  return async ({ outputs, referenceOutputs }) => {
@@ -659,10 +724,13 @@ function llmJudge() {
659
724
  };
660
725
  };
661
726
  }
662
- function noTools() {
727
+ function noTools(options) {
663
728
  return () => ({
664
729
  evaluator: ls2.wrapEvaluator(createNoToolCallsEvaluator()),
665
- referenceOutputs: { expectNoToolCalls: true }
730
+ referenceOutputs: {
731
+ expectNoToolCalls: true,
732
+ ...options?.except?.length ? { exceptTools: options.except } : {}
733
+ }
666
734
  });
667
735
  }
668
736
  function respondsInLanguage(code) {
@@ -675,6 +743,15 @@ function respondsInLanguage(code) {
675
743
  };
676
744
  };
677
745
  }
746
+ function anyToolCalled(tools) {
747
+ return () => ({
748
+ evaluator: ls2.wrapEvaluator(createAnyToolCalledEvaluator()),
749
+ referenceOutputs: {
750
+ expectAnyToolCall: true,
751
+ ...tools?.length ? { anyToolsExpected: tools } : {}
752
+ }
753
+ });
754
+ }
678
755
  function contains(strings) {
679
756
  return () => ({
680
757
  evaluator: ls2.wrapEvaluator(createResponseContentEvaluator()),
@@ -690,6 +767,7 @@ function notContains(strings) {
690
767
  // Annotate the CommonJS export names for ESM import in node:
691
768
  0 && (module.exports = {
692
769
  ai,
770
+ anyToolCalled,
693
771
  configureEvals,
694
772
  contains,
695
773
  defineSuite,
@@ -699,6 +777,7 @@ function notContains(strings) {
699
777
  noTools,
700
778
  notContains,
701
779
  respondsInLanguage,
780
+ runEvals,
702
781
  toolResult,
703
782
  toolsCalled
704
783
  });