@dvina/agents 0.10.0 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,4 @@
1
- import { T as ToolDefinition, A as Agent, L as LangchainModelConfig, M as Message, d as AiMessage, g as ToolSpec, H as HumanMessage, f as ToolMessage } from '../model-resolver-BZtVieXE.mjs';
1
+ import { T as ToolDefinition, A as Agent, L as LangchainModelConfig, M as Message, e as AiMessage, b as ToolSpec, H as HumanMessage, g as ToolMessage } from '../model-resolver-DjKRXKtu.mjs';
2
2
  import * as zod from 'zod';
3
3
  import { z } from 'zod';
4
4
  import { BaseMessage } from '@langchain/core/messages';
@@ -12,6 +12,8 @@ interface EvalConfig {
12
12
  model?: string;
13
13
  /** Model for evaluators needing LLM calls (language detection, LLM-as-judge). */
14
14
  evaluatorModel: string;
15
+ /** LangSmith experiment (dataset) name. All suites share this single experiment for easy comparison across runs. */
16
+ experimentName: string;
15
17
  /** System prompt for model-based target. Ignored when createTarget is used. Can be overridden per-suite or per-case. */
16
18
  systemPrompt?: string;
17
19
  /** Factory that creates a fresh Agent per test case. When set, this is the default target. */
@@ -57,14 +59,26 @@ declare function toolsCalled(tools: string[]): Expectation;
57
59
  * Uses the globally configured evaluator model.
58
60
  */
59
61
  declare function llmJudge(): Expectation;
60
- /** Assert the agent made zero tool calls. */
61
- declare function noTools(): Expectation;
62
+ /**
63
+ * Assert the agent made zero tool calls.
64
+ * Optionally allow specific tools via `except` — calls to those tools
65
+ * are permitted (but not required), while any other tool call fails.
66
+ */
67
+ declare function noTools(options?: {
68
+ except: string[];
69
+ }): Expectation;
62
70
  /**
63
71
  * Assert the response is in the given language (ISO 639-1 code).
64
72
  * Uses the globally configured evaluator model for language detection.
65
73
  * @param code - ISO 639-1 language code (e.g. 'en', 'tr', 'de').
66
74
  */
67
75
  declare function respondsInLanguage(code: string): Expectation;
76
+ /**
77
+ * Assert that at least one tool call was made.
78
+ * When `tools` is provided, at least one of those specific tools must
79
+ * appear in the trajectory. When omitted, any tool call satisfies it.
80
+ */
81
+ declare function anyToolCalled(tools?: string[]): Expectation;
68
82
  /** Assert the response contains all given strings. */
69
83
  declare function contains(strings: string[]): Expectation;
70
84
  /** Assert the response does not contain any of the given strings. */
@@ -118,6 +132,22 @@ interface SuiteConfig {
118
132
  * entry in `responses` default to `''`.
119
133
  */
120
134
  declare function fromToolSpecs(specs: ToolSpec[], responses?: Record<string, ToolDef['response']>): Record<string, ToolDef>;
135
+ /**
136
+ * Registers an eval suite. Does not create tests on its own — call
137
+ * `runEvals()` after all suites are registered to emit a single
138
+ * LangSmith experiment containing every test case.
139
+ */
121
140
  declare function defineSuite(name: string, config: SuiteConfig): void;
141
+ /**
142
+ * Emits all registered suites under a single `ls.describe` block so
143
+ * every test case lands in one LangSmith experiment / dataset.
144
+ *
145
+ * Call this once, after importing all suite files.
146
+ *
147
+ * Suites are not wrapped in nested `describe` blocks; each test
148
+ * name is instead prefixed with the capitalized suite name in brackets
149
+ * (e.g. "[Discovery] > should use search tool").
150
+ */
151
+ declare function runEvals(): void;
122
152
 
123
- export { type CreateTargetFn, type EvalConfig, type Expectation, type SuiteConfig, type TestCase, type ToolDef, ai, configureEvals, contains, defineSuite, fromToolSpecs, human, llmJudge, noTools, notContains, respondsInLanguage, toolResult, toolsCalled };
153
+ export { type CreateTargetFn, type EvalConfig, type Expectation, type SuiteConfig, type TestCase, type ToolDef, ai, anyToolCalled, configureEvals, contains, defineSuite, fromToolSpecs, human, llmJudge, noTools, notContains, respondsInLanguage, runEvals, toolResult, toolsCalled };
@@ -1,4 +1,4 @@
1
- import { T as ToolDefinition, A as Agent, L as LangchainModelConfig, M as Message, d as AiMessage, g as ToolSpec, H as HumanMessage, f as ToolMessage } from '../model-resolver-BZtVieXE.js';
1
+ import { T as ToolDefinition, A as Agent, L as LangchainModelConfig, M as Message, e as AiMessage, b as ToolSpec, H as HumanMessage, g as ToolMessage } from '../model-resolver-DjKRXKtu.js';
2
2
  import * as zod from 'zod';
3
3
  import { z } from 'zod';
4
4
  import { BaseMessage } from '@langchain/core/messages';
@@ -12,6 +12,8 @@ interface EvalConfig {
12
12
  model?: string;
13
13
  /** Model for evaluators needing LLM calls (language detection, LLM-as-judge). */
14
14
  evaluatorModel: string;
15
+ /** LangSmith experiment (dataset) name. All suites share this single experiment for easy comparison across runs. */
16
+ experimentName: string;
15
17
  /** System prompt for model-based target. Ignored when createTarget is used. Can be overridden per-suite or per-case. */
16
18
  systemPrompt?: string;
17
19
  /** Factory that creates a fresh Agent per test case. When set, this is the default target. */
@@ -57,14 +59,26 @@ declare function toolsCalled(tools: string[]): Expectation;
57
59
  * Uses the globally configured evaluator model.
58
60
  */
59
61
  declare function llmJudge(): Expectation;
60
- /** Assert the agent made zero tool calls. */
61
- declare function noTools(): Expectation;
62
+ /**
63
+ * Assert the agent made zero tool calls.
64
+ * Optionally allow specific tools via `except` — calls to those tools
65
+ * are permitted (but not required), while any other tool call fails.
66
+ */
67
+ declare function noTools(options?: {
68
+ except: string[];
69
+ }): Expectation;
62
70
  /**
63
71
  * Assert the response is in the given language (ISO 639-1 code).
64
72
  * Uses the globally configured evaluator model for language detection.
65
73
  * @param code - ISO 639-1 language code (e.g. 'en', 'tr', 'de').
66
74
  */
67
75
  declare function respondsInLanguage(code: string): Expectation;
76
+ /**
77
+ * Assert that at least one tool call was made.
78
+ * When `tools` is provided, at least one of those specific tools must
79
+ * appear in the trajectory. When omitted, any tool call satisfies it.
80
+ */
81
+ declare function anyToolCalled(tools?: string[]): Expectation;
68
82
  /** Assert the response contains all given strings. */
69
83
  declare function contains(strings: string[]): Expectation;
70
84
  /** Assert the response does not contain any of the given strings. */
@@ -118,6 +132,22 @@ interface SuiteConfig {
118
132
  * entry in `responses` default to `''`.
119
133
  */
120
134
  declare function fromToolSpecs(specs: ToolSpec[], responses?: Record<string, ToolDef['response']>): Record<string, ToolDef>;
135
+ /**
136
+ * Registers an eval suite. Does not create tests on its own — call
137
+ * `runEvals()` after all suites are registered to emit a single
138
+ * LangSmith experiment containing every test case.
139
+ */
121
140
  declare function defineSuite(name: string, config: SuiteConfig): void;
141
+ /**
142
+ * Emits all registered suites under a single `ls.describe` block so
143
+ * every test case lands in one LangSmith experiment / dataset.
144
+ *
145
+ * Call this once, after importing all suite files.
146
+ *
147
+ * Suites are not wrapped in nested `describe` blocks; each test
148
+ * name is instead prefixed with the capitalized suite name in brackets
149
+ * (e.g. "[Discovery] > should use search tool").
150
+ */
151
+ declare function runEvals(): void;
122
152
 
123
- export { type CreateTargetFn, type EvalConfig, type Expectation, type SuiteConfig, type TestCase, type ToolDef, ai, configureEvals, contains, defineSuite, fromToolSpecs, human, llmJudge, noTools, notContains, respondsInLanguage, toolResult, toolsCalled };
153
+ export { type CreateTargetFn, type EvalConfig, type Expectation, type SuiteConfig, type TestCase, type ToolDef, ai, anyToolCalled, configureEvals, contains, defineSuite, fromToolSpecs, human, llmJudge, noTools, notContains, respondsInLanguage, runEvals, toolResult, toolsCalled };
@@ -31,6 +31,7 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
31
31
  var eval_exports = {};
32
32
  __export(eval_exports, {
33
33
  ai: () => ai,
34
+ anyToolCalled: () => anyToolCalled,
34
35
  configureEvals: () => configureEvals,
35
36
  contains: () => contains,
36
37
  defineSuite: () => defineSuite,
@@ -40,6 +41,7 @@ __export(eval_exports, {
40
41
  noTools: () => noTools,
41
42
  notContains: () => notContains,
42
43
  respondsInLanguage: () => respondsInLanguage,
44
+ runEvals: () => runEvals,
43
45
  toolResult: () => toolResult,
44
46
  toolsCalled: () => toolsCalled
45
47
  });
@@ -59,6 +61,7 @@ function getEvalConfig() {
59
61
 
60
62
  // src/eval/suite.ts
61
63
  var ls = __toESM(require("langsmith/vitest"));
64
+ var import_messages2 = require("@langchain/core/messages");
62
65
 
63
66
  // src/eval/target.ts
64
67
  var import_tools = require("@langchain/core/tools");
@@ -430,13 +433,6 @@ function toMockTools(defs) {
430
433
  response: typeof def.response === "function" ? def.response : typeof def.response === "string" ? def.response : JSON.stringify(def.response)
431
434
  }));
432
435
  }
433
- function toSerializableTools(tools) {
434
- return tools.map((t) => ({
435
- ...t,
436
- schema: t.schema instanceof Object && "shape" in t.schema ? "<ZodObject>" : t.schema,
437
- response: typeof t.response === "function" ? "<function>" : t.response
438
- }));
439
- }
440
436
  function lastHumanContent(messages) {
441
437
  for (let i = messages.length - 1; i >= 0; i--) {
442
438
  const msg = messages[i];
@@ -459,49 +455,65 @@ function resolveModelTarget(config) {
459
455
  function resolveCreateTarget(config) {
460
456
  return config.createTarget ?? getEvalConfig().createTarget;
461
457
  }
458
+ var _suites = [];
462
459
  function defineSuite(name, config) {
463
- const suiteTools = config.tools ?? {};
464
- const createTarget = config.target ? void 0 : resolveCreateTarget(config);
465
- ls.describe(name, () => {
466
- for (const tc of config.cases) {
467
- const testName = tc.name ?? lastHumanContent(tc.messages);
468
- const caseToolDefs = tc.tools ?? suiteTools;
469
- const tools = toMockTools(caseToolDefs);
470
- const ctx = { message: lastHumanContent(tc.messages) };
471
- const resolved = tc.expect.map((exp) => exp(ctx));
472
- const evaluators = resolved.map((r) => r.evaluator);
473
- const referenceOutputs = Object.assign({}, ...resolved.map((r) => r.referenceOutputs));
474
- ls.test(
475
- testName,
476
- {
477
- inputs: {
478
- messages: tc.messages,
479
- tools: toSerializableTools(tools)
460
+ _suites.push({ name, config });
461
+ }
462
+ function runEvals() {
463
+ const evalConfig = getEvalConfig();
464
+ ls.describe(evalConfig.experimentName, () => {
465
+ for (const { name: suiteName, config } of _suites) {
466
+ const suiteTools = config.tools ?? {};
467
+ const createTarget = config.target ? void 0 : resolveCreateTarget(config);
468
+ const categoryLabel = suiteName.charAt(0).toUpperCase() + suiteName.slice(1);
469
+ const model = typeof config.target === "string" ? config.target : evalConfig.model ?? "agent";
470
+ for (const tc of config.cases) {
471
+ const testName = tc.name ?? lastHumanContent(tc.messages);
472
+ const caseToolDefs = tc.tools ?? suiteTools;
473
+ const tools = toMockTools(caseToolDefs);
474
+ const ctx = { message: lastHumanContent(tc.messages) };
475
+ const resolved = tc.expect.map((exp) => exp(ctx));
476
+ const evaluators = resolved.map((r) => r.evaluator);
477
+ const referenceOutputs = Object.assign({}, ...resolved.map((r) => r.referenceOutputs));
478
+ const fullTestName = `[${categoryLabel}] > ${testName}`;
479
+ ls.test(
480
+ fullTestName,
481
+ {
482
+ inputs: {
483
+ name: fullTestName,
484
+ category: categoryLabel,
485
+ model,
486
+ tools: tools.map((t) => t.name).join(" | ") || "none",
487
+ messages: tc.messages
488
+ },
489
+ referenceOutputs
480
490
  },
481
- referenceOutputs
482
- },
483
- async ({ referenceOutputs: refOut }) => {
484
- let output;
485
- const prepareMessages = tc.prepareMessages ?? config.prepareMessages ?? getEvalConfig().prepareMessages;
486
- const preparedMessages = prepareMessages ? await prepareMessages(tc.messages) : tc.messages;
487
- if (createTarget) {
488
- output = await runAgentTarget(createTarget, preparedMessages, caseToolDefs);
489
- } else {
490
- const target = resolveModelTarget(config);
491
- const globalPrompt = getEvalConfig().systemPrompt;
492
- const systemPrompt = tc.systemPrompt ?? config.systemPrompt ?? globalPrompt;
493
- output = await target({
494
- messages: preparedMessages,
495
- tools,
496
- ...systemPrompt ? { systemPrompt } : {}
491
+ async ({ referenceOutputs: refOut }) => {
492
+ let output;
493
+ const prepareMessages = tc.prepareMessages ?? config.prepareMessages ?? getEvalConfig().prepareMessages;
494
+ const preparedMessages = prepareMessages ? await prepareMessages(tc.messages) : tc.messages;
495
+ if (createTarget) {
496
+ output = await runAgentTarget(createTarget, preparedMessages, caseToolDefs);
497
+ } else {
498
+ const target = resolveModelTarget(config);
499
+ const globalPrompt = getEvalConfig().systemPrompt;
500
+ const systemPrompt = tc.systemPrompt ?? config.systemPrompt ?? globalPrompt;
501
+ output = await target({
502
+ messages: preparedMessages,
503
+ tools,
504
+ ...systemPrompt ? { systemPrompt } : {}
505
+ });
506
+ }
507
+ const calledTools = output.messages.filter((m) => m instanceof import_messages2.AIMessage).flatMap((m) => m.tool_calls ?? []).map((tc2) => tc2.name);
508
+ ls.logOutputs({
509
+ tools_called: calledTools.length > 0 ? calledTools.join(" | ") : "none"
497
510
  });
511
+ for (const evaluator of evaluators) {
512
+ await evaluator({ outputs: output, referenceOutputs: refOut ?? {} });
513
+ }
498
514
  }
499
- ls.logOutputs(output);
500
- for (const evaluator of evaluators) {
501
- await evaluator({ outputs: output, referenceOutputs: refOut ?? {} });
502
- }
503
- }
504
- );
515
+ );
516
+ }
505
517
  }
506
518
  });
507
519
  }
@@ -511,7 +523,7 @@ var ls2 = __toESM(require("langsmith/vitest"));
511
523
  var import_agentevals = require("agentevals");
512
524
 
513
525
  // src/eval/evaluators/language.ts
514
- var import_messages2 = require("@langchain/core/messages");
526
+ var import_messages3 = require("@langchain/core/messages");
515
527
  function createLanguageEvaluator(modelConfig, model) {
516
528
  const resolver = new LangchainModelResolver(modelConfig);
517
529
  const judge = resolver.resolve(model);
@@ -524,7 +536,7 @@ function createLanguageEvaluator(modelConfig, model) {
524
536
  return { key: "language_match", score: true, comment: "No expected language specified, skipping" };
525
537
  }
526
538
  const messages = outputs.messages || [];
527
- const lastAiMessage = [...messages].reverse().find((m) => m instanceof import_messages2.AIMessage);
539
+ const lastAiMessage = [...messages].reverse().find((m) => m instanceof import_messages3.AIMessage);
528
540
  if (!lastAiMessage) {
529
541
  return { key: "language_match", score: false, comment: "No AI message found in trajectory" };
530
542
  }
@@ -550,7 +562,7 @@ function createLanguageEvaluator(modelConfig, model) {
550
562
  }
551
563
 
552
564
  // src/eval/evaluators/response-content.ts
553
- var import_messages3 = require("@langchain/core/messages");
565
+ var import_messages4 = require("@langchain/core/messages");
554
566
  function createResponseContentEvaluator() {
555
567
  return async ({
556
568
  outputs,
@@ -562,7 +574,7 @@ function createResponseContentEvaluator() {
562
574
  return { key: "response_content", score: true, comment: "No content assertions specified, skipping" };
563
575
  }
564
576
  const messages = outputs.messages || [];
565
- const lastAiMessage = [...messages].reverse().find((m) => m instanceof import_messages3.AIMessage);
577
+ const lastAiMessage = [...messages].reverse().find((m) => m instanceof import_messages4.AIMessage);
566
578
  if (!lastAiMessage) {
567
579
  return { key: "response_content", score: false, comment: "No AI message found in trajectory" };
568
580
  }
@@ -588,7 +600,7 @@ function createResponseContentEvaluator() {
588
600
  }
589
601
 
590
602
  // src/eval/evaluators/no-tool-calls.ts
591
- var import_messages4 = require("@langchain/core/messages");
603
+ var import_messages5 = require("@langchain/core/messages");
592
604
  function createNoToolCallsEvaluator() {
593
605
  return async ({
594
606
  outputs,
@@ -598,8 +610,17 @@ function createNoToolCallsEvaluator() {
598
610
  return { key: "no_tool_calls", score: true, comment: "No tool call restriction specified, skipping" };
599
611
  }
600
612
  const messages = outputs.messages || [];
601
- const toolCalls = messages.filter((m) => m instanceof import_messages4.AIMessage).flatMap((m) => m.tool_calls || []);
602
- const passed = toolCalls.length === 0;
613
+ const exceptTools = referenceOutputs?.exceptTools ?? [];
614
+ const toolCalls = messages.filter((m) => m instanceof import_messages5.AIMessage).flatMap((m) => m.tool_calls || []);
615
+ const disallowedCalls = exceptTools.length > 0 ? toolCalls.filter((tc) => !exceptTools.includes(tc.name)) : toolCalls;
616
+ const passed = disallowedCalls.length === 0;
617
+ if (exceptTools.length > 0) {
618
+ return {
619
+ key: "no_tool_calls",
620
+ score: passed,
621
+ comment: passed ? `No disallowed tool calls made (allowed: ${exceptTools.join(", ")})` : `Agent made ${disallowedCalls.length} disallowed tool call(s): ${disallowedCalls.map((tc) => tc.name).join(", ")}`
622
+ };
623
+ }
603
624
  return {
604
625
  key: "no_tool_calls",
605
626
  score: passed,
@@ -608,6 +629,37 @@ function createNoToolCallsEvaluator() {
608
629
  };
609
630
  }
610
631
 
632
+ // src/eval/evaluators/any-tool-called.ts
633
+ var import_messages6 = require("@langchain/core/messages");
634
+ function createAnyToolCalledEvaluator() {
635
+ return async ({
636
+ outputs,
637
+ referenceOutputs
638
+ }) => {
639
+ if (referenceOutputs?.expectAnyToolCall !== true) {
640
+ return { key: "any_tool_called", score: true, comment: "No any-tool-call expectation specified, skipping" };
641
+ }
642
+ const expectedTools = referenceOutputs?.anyToolsExpected ?? [];
643
+ const messages = outputs.messages || [];
644
+ const calledToolNames = messages.filter((m) => m instanceof import_messages6.AIMessage).flatMap((m) => m.tool_calls || []).map((tc) => tc.name);
645
+ if (expectedTools.length === 0) {
646
+ const passed2 = calledToolNames.length > 0;
647
+ return {
648
+ key: "any_tool_called",
649
+ score: passed2,
650
+ comment: passed2 ? `Agent called tool(s): ${calledToolNames.join(", ")}` : "Agent made no tool calls (expected at least one)"
651
+ };
652
+ }
653
+ const matchedTools = expectedTools.filter((name) => calledToolNames.includes(name));
654
+ const passed = matchedTools.length > 0;
655
+ return {
656
+ key: "any_tool_called",
657
+ score: passed,
658
+ comment: passed ? `Called expected tool(s): ${matchedTools.join(", ")}` : `None of the expected tools were called (expected one of: ${expectedTools.join(", ")}; actual: ${calledToolNames.length > 0 ? calledToolNames.join(", ") : "none"})`
659
+ };
660
+ };
661
+ }
662
+
611
663
  // src/eval/expectations.ts
612
664
  function withTrajectoryGuard(evaluator, key) {
613
665
  return async ({ outputs, referenceOutputs }) => {
@@ -659,10 +711,13 @@ function llmJudge() {
659
711
  };
660
712
  };
661
713
  }
662
- function noTools() {
714
+ function noTools(options) {
663
715
  return () => ({
664
716
  evaluator: ls2.wrapEvaluator(createNoToolCallsEvaluator()),
665
- referenceOutputs: { expectNoToolCalls: true }
717
+ referenceOutputs: {
718
+ expectNoToolCalls: true,
719
+ ...options?.except?.length ? { exceptTools: options.except } : {}
720
+ }
666
721
  });
667
722
  }
668
723
  function respondsInLanguage(code) {
@@ -675,6 +730,15 @@ function respondsInLanguage(code) {
675
730
  };
676
731
  };
677
732
  }
733
+ function anyToolCalled(tools) {
734
+ return () => ({
735
+ evaluator: ls2.wrapEvaluator(createAnyToolCalledEvaluator()),
736
+ referenceOutputs: {
737
+ expectAnyToolCall: true,
738
+ ...tools?.length ? { anyToolsExpected: tools } : {}
739
+ }
740
+ });
741
+ }
678
742
  function contains(strings) {
679
743
  return () => ({
680
744
  evaluator: ls2.wrapEvaluator(createResponseContentEvaluator()),
@@ -690,6 +754,7 @@ function notContains(strings) {
690
754
  // Annotate the CommonJS export names for ESM import in node:
691
755
  0 && (module.exports = {
692
756
  ai,
757
+ anyToolCalled,
693
758
  configureEvals,
694
759
  contains,
695
760
  defineSuite,
@@ -699,6 +764,7 @@ function notContains(strings) {
699
764
  noTools,
700
765
  notContains,
701
766
  respondsInLanguage,
767
+ runEvals,
702
768
  toolResult,
703
769
  toolsCalled
704
770
  });