npm - @dvina/agents - Versions diffs - 0.10.0 → 0.12.0 - Mend

@dvina/agents 0.10.0 → 0.12.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (15)

package/dist/eval/index.d.mts +34 -4
package/dist/eval/index.d.ts +34 -4
package/dist/eval/index.js +121 -55
package/dist/eval/index.js.map +1 -1
package/dist/eval/index.mjs +120 -60
package/dist/eval/index.mjs.map +1 -1
package/dist/index.d.mts +5 -4
package/dist/index.d.ts +5 -4
package/dist/index.js +16 -1
package/dist/index.js.map +1 -1
package/dist/index.mjs +16 -1
package/dist/index.mjs.map +1 -1
package/dist/{model-resolver-BZtVieXE.d.mts → model-resolver-DjKRXKtu.d.mts} +1 -1
package/dist/{model-resolver-BZtVieXE.d.ts → model-resolver-DjKRXKtu.d.ts} +1 -1
package/package.json +1 -1

package/dist/eval/index.d.mts CHANGED Viewed

@@ -1,4 +1,4 @@
-import { T as ToolDefinition, A as Agent, L as LangchainModelConfig, M as Message, d as AiMessage, g as ToolSpec, H as HumanMessage, f as ToolMessage } from '../model-resolver-BZtVieXE.mjs';
+import { T as ToolDefinition, A as Agent, L as LangchainModelConfig, M as Message, e as AiMessage, b as ToolSpec, H as HumanMessage, g as ToolMessage } from '../model-resolver-DjKRXKtu.mjs';
 import * as zod from 'zod';
 import { z } from 'zod';
 import { BaseMessage } from '@langchain/core/messages';
@@ -12,6 +12,8 @@ interface EvalConfig {
     model?: string;
     /** Model for evaluators needing LLM calls (language detection, LLM-as-judge). */
     evaluatorModel: string;
+    /** LangSmith experiment (dataset) name. All suites share this single experiment for easy comparison across runs. */
+    experimentName: string;
     /** System prompt for model-based target. Ignored when createTarget is used. Can be overridden per-suite or per-case. */
     systemPrompt?: string;
     /** Factory that creates a fresh Agent per test case. When set, this is the default target. */
@@ -57,14 +59,26 @@ declare function toolsCalled(tools: string[]): Expectation;
  * Uses the globally configured evaluator model.
  */
 declare function llmJudge(): Expectation;
-/** Assert the agent made zero tool calls. */
-declare function noTools(): Expectation;
+/**
+ * Assert the agent made zero tool calls.
+ * Optionally allow specific tools via `except` — calls to those tools
+ * are permitted (but not required), while any other tool call fails.
+ */
+declare function noTools(options?: {
+    except: string[];
+}): Expectation;
 /**
  * Assert the response is in the given language (ISO 639-1 code).
  * Uses the globally configured evaluator model for language detection.
  * @param code - ISO 639-1 language code (e.g. 'en', 'tr', 'de').
  */
 declare function respondsInLanguage(code: string): Expectation;
+/**
+ * Assert that at least one tool call was made.
+ * When `tools` is provided, at least one of those specific tools must
+ * appear in the trajectory. When omitted, any tool call satisfies it.
+ */
+declare function anyToolCalled(tools?: string[]): Expectation;
 /** Assert the response contains all given strings. */
 declare function contains(strings: string[]): Expectation;
 /** Assert the response does not contain any of the given strings. */
@@ -118,6 +132,22 @@ interface SuiteConfig {
  * entry in `responses` default to `''`.
  */
 declare function fromToolSpecs(specs: ToolSpec[], responses?: Record<string, ToolDef['response']>): Record<string, ToolDef>;
+/**
+ * Registers an eval suite. Does not create tests on its own — call
+ * `runEvals()` after all suites are registered to emit a single
+ * LangSmith experiment containing every test case.
+ */
 declare function defineSuite(name: string, config: SuiteConfig): void;
+/**
+ * Emits all registered suites under a single `ls.describe` block so
+ * every test case lands in one LangSmith experiment / dataset.
+ *
+ * Call this once, after importing all suite files.
+ *
+ * Individual suites are grouped with native `describe` blocks for
+ * readability; test names are prefixed with the suite name
+ * (e.g. "discovery > should use search tool").
+ */
+declare function runEvals(): void;
-export { type CreateTargetFn, type EvalConfig, type Expectation, type SuiteConfig, type TestCase, type ToolDef, ai, configureEvals, contains, defineSuite, fromToolSpecs, human, llmJudge, noTools, notContains, respondsInLanguage, toolResult, toolsCalled };
+export { type CreateTargetFn, type EvalConfig, type Expectation, type SuiteConfig, type TestCase, type ToolDef, ai, anyToolCalled, configureEvals, contains, defineSuite, fromToolSpecs, human, llmJudge, noTools, notContains, respondsInLanguage, runEvals, toolResult, toolsCalled };

package/dist/eval/index.d.ts CHANGED Viewed

@@ -1,4 +1,4 @@
-import { T as ToolDefinition, A as Agent, L as LangchainModelConfig, M as Message, d as AiMessage, g as ToolSpec, H as HumanMessage, f as ToolMessage } from '../model-resolver-BZtVieXE.js';
+import { T as ToolDefinition, A as Agent, L as LangchainModelConfig, M as Message, e as AiMessage, b as ToolSpec, H as HumanMessage, g as ToolMessage } from '../model-resolver-DjKRXKtu.js';
 import * as zod from 'zod';
 import { z } from 'zod';
 import { BaseMessage } from '@langchain/core/messages';
@@ -12,6 +12,8 @@ interface EvalConfig {
     model?: string;
     /** Model for evaluators needing LLM calls (language detection, LLM-as-judge). */
     evaluatorModel: string;
+    /** LangSmith experiment (dataset) name. All suites share this single experiment for easy comparison across runs. */
+    experimentName: string;
     /** System prompt for model-based target. Ignored when createTarget is used. Can be overridden per-suite or per-case. */
     systemPrompt?: string;
     /** Factory that creates a fresh Agent per test case. When set, this is the default target. */
@@ -57,14 +59,26 @@ declare function toolsCalled(tools: string[]): Expectation;
  * Uses the globally configured evaluator model.
  */
 declare function llmJudge(): Expectation;
-/** Assert the agent made zero tool calls. */
-declare function noTools(): Expectation;
+/**
+ * Assert the agent made zero tool calls.
+ * Optionally allow specific tools via `except` — calls to those tools
+ * are permitted (but not required), while any other tool call fails.
+ */
+declare function noTools(options?: {
+    except: string[];
+}): Expectation;
 /**
  * Assert the response is in the given language (ISO 639-1 code).
  * Uses the globally configured evaluator model for language detection.
  * @param code - ISO 639-1 language code (e.g. 'en', 'tr', 'de').
  */
 declare function respondsInLanguage(code: string): Expectation;
+/**
+ * Assert that at least one tool call was made.
+ * When `tools` is provided, at least one of those specific tools must
+ * appear in the trajectory. When omitted, any tool call satisfies it.
+ */
+declare function anyToolCalled(tools?: string[]): Expectation;
 /** Assert the response contains all given strings. */
 declare function contains(strings: string[]): Expectation;
 /** Assert the response does not contain any of the given strings. */
@@ -118,6 +132,22 @@ interface SuiteConfig {
  * entry in `responses` default to `''`.
  */
 declare function fromToolSpecs(specs: ToolSpec[], responses?: Record<string, ToolDef['response']>): Record<string, ToolDef>;
+/**
+ * Registers an eval suite. Does not create tests on its own — call
+ * `runEvals()` after all suites are registered to emit a single
+ * LangSmith experiment containing every test case.
+ */
 declare function defineSuite(name: string, config: SuiteConfig): void;
+/**
+ * Emits all registered suites under a single `ls.describe` block so
+ * every test case lands in one LangSmith experiment / dataset.
+ *
+ * Call this once, after importing all suite files.
+ *
+ * Individual suites are grouped with native `describe` blocks for
+ * readability; test names are prefixed with the suite name
+ * (e.g. "discovery > should use search tool").
+ */
+declare function runEvals(): void;
-export { type CreateTargetFn, type EvalConfig, type Expectation, type SuiteConfig, type TestCase, type ToolDef, ai, configureEvals, contains, defineSuite, fromToolSpecs, human, llmJudge, noTools, notContains, respondsInLanguage, toolResult, toolsCalled };
+export { type CreateTargetFn, type EvalConfig, type Expectation, type SuiteConfig, type TestCase, type ToolDef, ai, anyToolCalled, configureEvals, contains, defineSuite, fromToolSpecs, human, llmJudge, noTools, notContains, respondsInLanguage, runEvals, toolResult, toolsCalled };

package/dist/eval/index.js CHANGED Viewed

@@ -31,6 +31,7 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
 var eval_exports = {};
 __export(eval_exports, {
   ai: () => ai,
+  anyToolCalled: () => anyToolCalled,
   configureEvals: () => configureEvals,
   contains: () => contains,
   defineSuite: () => defineSuite,
@@ -40,6 +41,7 @@ __export(eval_exports, {
   noTools: () => noTools,
   notContains: () => notContains,
   respondsInLanguage: () => respondsInLanguage,
+  runEvals: () => runEvals,
   toolResult: () => toolResult,
   toolsCalled: () => toolsCalled
 });
@@ -59,6 +61,7 @@ function getEvalConfig() {
 // src/eval/suite.ts
 var ls = __toESM(require("langsmith/vitest"));
+var import_messages2 = require("@langchain/core/messages");
 // src/eval/target.ts
 var import_tools = require("@langchain/core/tools");
@@ -430,13 +433,6 @@ function toMockTools(defs) {
     response: typeof def.response === "function" ? def.response : typeof def.response === "string" ? def.response : JSON.stringify(def.response)
   }));
 }
-function toSerializableTools(tools) {
-  return tools.map((t) => ({
-    ...t,
-    schema: t.schema instanceof Object && "shape" in t.schema ? "<ZodObject>" : t.schema,
-    response: typeof t.response === "function" ? "<function>" : t.response
-  }));
-}
 function lastHumanContent(messages) {
   for (let i = messages.length - 1; i >= 0; i--) {
     const msg = messages[i];
@@ -459,49 +455,65 @@ function resolveModelTarget(config) {
 function resolveCreateTarget(config) {
   return config.createTarget ?? getEvalConfig().createTarget;
 }
+var _suites = [];
 function defineSuite(name, config) {
-  const suiteTools = config.tools ?? {};
-  const createTarget = config.target ? void 0 : resolveCreateTarget(config);
-  ls.describe(name, () => {
-    for (const tc of config.cases) {
-      const testName = tc.name ?? lastHumanContent(tc.messages);
-      const caseToolDefs = tc.tools ?? suiteTools;
-      const tools = toMockTools(caseToolDefs);
-      const ctx = { message: lastHumanContent(tc.messages) };
-      const resolved = tc.expect.map((exp) => exp(ctx));
-      const evaluators = resolved.map((r) => r.evaluator);
-      const referenceOutputs = Object.assign({}, ...resolved.map((r) => r.referenceOutputs));
-      ls.test(
-        testName,
-        {
-          inputs: {
-            messages: tc.messages,
-            tools: toSerializableTools(tools)
+  _suites.push({ name, config });
+}
+function runEvals() {
+  const evalConfig = getEvalConfig();
+  ls.describe(evalConfig.experimentName, () => {
+    for (const { name: suiteName, config } of _suites) {
+      const suiteTools = config.tools ?? {};
+      const createTarget = config.target ? void 0 : resolveCreateTarget(config);
+      const categoryLabel = suiteName.charAt(0).toUpperCase() + suiteName.slice(1);
+      const model = typeof config.target === "string" ? config.target : evalConfig.model ?? "agent";
+      for (const tc of config.cases) {
+        const testName = tc.name ?? lastHumanContent(tc.messages);
+        const caseToolDefs = tc.tools ?? suiteTools;
+        const tools = toMockTools(caseToolDefs);
+        const ctx = { message: lastHumanContent(tc.messages) };
+        const resolved = tc.expect.map((exp) => exp(ctx));
+        const evaluators = resolved.map((r) => r.evaluator);
+        const referenceOutputs = Object.assign({}, ...resolved.map((r) => r.referenceOutputs));
+        const fullTestName = `[${categoryLabel}] > ${testName}`;
+        ls.test(
+          fullTestName,
+          {
+            inputs: {
+              name: fullTestName,
+              category: categoryLabel,
+              model,
+              tools: tools.map((t) => t.name).join(" | ") || "none",
+              messages: tc.messages
+            },
+            referenceOutputs
           },
-          referenceOutputs
-        },
-        async ({ referenceOutputs: refOut }) => {
-          let output;
-          const prepareMessages = tc.prepareMessages ?? config.prepareMessages ?? getEvalConfig().prepareMessages;
-          const preparedMessages = prepareMessages ? await prepareMessages(tc.messages) : tc.messages;
-          if (createTarget) {
-            output = await runAgentTarget(createTarget, preparedMessages, caseToolDefs);
-          } else {
-            const target = resolveModelTarget(config);
-            const globalPrompt = getEvalConfig().systemPrompt;
-            const systemPrompt = tc.systemPrompt ?? config.systemPrompt ?? globalPrompt;
-            output = await target({
-              messages: preparedMessages,
-              tools,
-              ...systemPrompt ? { systemPrompt } : {}
+          async ({ referenceOutputs: refOut }) => {
+            let output;
+            const prepareMessages = tc.prepareMessages ?? config.prepareMessages ?? getEvalConfig().prepareMessages;
+            const preparedMessages = prepareMessages ? await prepareMessages(tc.messages) : tc.messages;
+            if (createTarget) {
+              output = await runAgentTarget(createTarget, preparedMessages, caseToolDefs);
+            } else {
+              const target = resolveModelTarget(config);
+              const globalPrompt = getEvalConfig().systemPrompt;
+              const systemPrompt = tc.systemPrompt ?? config.systemPrompt ?? globalPrompt;
+              output = await target({
+                messages: preparedMessages,
+                tools,
+                ...systemPrompt ? { systemPrompt } : {}
+              });
+            }
+            const calledTools = output.messages.filter((m) => m instanceof import_messages2.AIMessage).flatMap((m) => m.tool_calls ?? []).map((tc2) => tc2.name);
+            ls.logOutputs({
+              tools_called: calledTools.length > 0 ? calledTools.join(" | ") : "none"
             });
+            for (const evaluator of evaluators) {
+              await evaluator({ outputs: output, referenceOutputs: refOut ?? {} });
+            }
           }
-          ls.logOutputs(output);
-          for (const evaluator of evaluators) {
-            await evaluator({ outputs: output, referenceOutputs: refOut ?? {} });
-          }
-        }
-      );
+        );
+      }
     }
   });
 }
@@ -511,7 +523,7 @@ var ls2 = __toESM(require("langsmith/vitest"));
 var import_agentevals = require("agentevals");
 // src/eval/evaluators/language.ts
-var import_messages2 = require("@langchain/core/messages");
+var import_messages3 = require("@langchain/core/messages");
 function createLanguageEvaluator(modelConfig, model) {
   const resolver = new LangchainModelResolver(modelConfig);
   const judge = resolver.resolve(model);
@@ -524,7 +536,7 @@ function createLanguageEvaluator(modelConfig, model) {
       return { key: "language_match", score: true, comment: "No expected language specified, skipping" };
     }
     const messages = outputs.messages || [];
-    const lastAiMessage = [...messages].reverse().find((m) => m instanceof import_messages2.AIMessage);
+    const lastAiMessage = [...messages].reverse().find((m) => m instanceof import_messages3.AIMessage);
     if (!lastAiMessage) {
       return { key: "language_match", score: false, comment: "No AI message found in trajectory" };
     }
@@ -550,7 +562,7 @@ function createLanguageEvaluator(modelConfig, model) {
 }
 // src/eval/evaluators/response-content.ts
-var import_messages3 = require("@langchain/core/messages");
+var import_messages4 = require("@langchain/core/messages");
 function createResponseContentEvaluator() {
   return async ({
     outputs,
@@ -562,7 +574,7 @@ function createResponseContentEvaluator() {
       return { key: "response_content", score: true, comment: "No content assertions specified, skipping" };
     }
     const messages = outputs.messages || [];
-    const lastAiMessage = [...messages].reverse().find((m) => m instanceof import_messages3.AIMessage);
+    const lastAiMessage = [...messages].reverse().find((m) => m instanceof import_messages4.AIMessage);
     if (!lastAiMessage) {
       return { key: "response_content", score: false, comment: "No AI message found in trajectory" };
     }
@@ -588,7 +600,7 @@ function createResponseContentEvaluator() {
 }
 // src/eval/evaluators/no-tool-calls.ts
-var import_messages4 = require("@langchain/core/messages");
+var import_messages5 = require("@langchain/core/messages");
 function createNoToolCallsEvaluator() {
   return async ({
     outputs,
@@ -598,8 +610,17 @@ function createNoToolCallsEvaluator() {
       return { key: "no_tool_calls", score: true, comment: "No tool call restriction specified, skipping" };
     }
     const messages = outputs.messages || [];
-    const toolCalls = messages.filter((m) => m instanceof import_messages4.AIMessage).flatMap((m) => m.tool_calls || []);
-    const passed = toolCalls.length === 0;
+    const exceptTools = referenceOutputs?.exceptTools ?? [];
+    const toolCalls = messages.filter((m) => m instanceof import_messages5.AIMessage).flatMap((m) => m.tool_calls || []);
+    const disallowedCalls = exceptTools.length > 0 ? toolCalls.filter((tc) => !exceptTools.includes(tc.name)) : toolCalls;
+    const passed = disallowedCalls.length === 0;
+    if (exceptTools.length > 0) {
+      return {
+        key: "no_tool_calls",
+        score: passed,
+        comment: passed ? `No disallowed tool calls made (allowed: ${exceptTools.join(", ")})` : `Agent made ${disallowedCalls.length} disallowed tool call(s): ${disallowedCalls.map((tc) => tc.name).join(", ")}`
+      };
+    }
     return {
       key: "no_tool_calls",
       score: passed,
@@ -608,6 +629,37 @@ function createNoToolCallsEvaluator() {
   };
 }
+// src/eval/evaluators/any-tool-called.ts
+var import_messages6 = require("@langchain/core/messages");
+function createAnyToolCalledEvaluator() {
+  return async ({
+    outputs,
+    referenceOutputs
+  }) => {
+    if (referenceOutputs?.expectAnyToolCall !== true) {
+      return { key: "any_tool_called", score: true, comment: "No any-tool-call expectation specified, skipping" };
+    }
+    const expectedTools = referenceOutputs?.anyToolsExpected ?? [];
+    const messages = outputs.messages || [];
+    const calledToolNames = messages.filter((m) => m instanceof import_messages6.AIMessage).flatMap((m) => m.tool_calls || []).map((tc) => tc.name);
+    if (expectedTools.length === 0) {
+      const passed2 = calledToolNames.length > 0;
+      return {
+        key: "any_tool_called",
+        score: passed2,
+        comment: passed2 ? `Agent called tool(s): ${calledToolNames.join(", ")}` : "Agent made no tool calls (expected at least one)"
+      };
+    }
+    const matchedTools = expectedTools.filter((name) => calledToolNames.includes(name));
+    const passed = matchedTools.length > 0;
+    return {
+      key: "any_tool_called",
+      score: passed,
+      comment: passed ? `Called expected tool(s): ${matchedTools.join(", ")}` : `None of the expected tools were called (expected one of: ${expectedTools.join(", ")}; actual: ${calledToolNames.length > 0 ? calledToolNames.join(", ") : "none"})`
+    };
+  };
+}
 // src/eval/expectations.ts
 function withTrajectoryGuard(evaluator, key) {
   return async ({ outputs, referenceOutputs }) => {
@@ -659,10 +711,13 @@ function llmJudge() {
     };
   };
 }
-function noTools() {
+function noTools(options) {
   return () => ({
     evaluator: ls2.wrapEvaluator(createNoToolCallsEvaluator()),
-    referenceOutputs: { expectNoToolCalls: true }
+    referenceOutputs: {
+      expectNoToolCalls: true,
+      ...options?.except?.length ? { exceptTools: options.except } : {}
+    }
   });
 }
 function respondsInLanguage(code) {
@@ -675,6 +730,15 @@ function respondsInLanguage(code) {
     };
   };
 }
+function anyToolCalled(tools) {
+  return () => ({
+    evaluator: ls2.wrapEvaluator(createAnyToolCalledEvaluator()),
+    referenceOutputs: {
+      expectAnyToolCall: true,
+      ...tools?.length ? { anyToolsExpected: tools } : {}
+    }
+  });
+}
 function contains(strings) {
   return () => ({
     evaluator: ls2.wrapEvaluator(createResponseContentEvaluator()),
@@ -690,6 +754,7 @@ function notContains(strings) {
 // Annotate the CommonJS export names for ESM import in node:
 0 && (module.exports = {
   ai,
+  anyToolCalled,
   configureEvals,
   contains,
   defineSuite,
@@ -699,6 +764,7 @@ function notContains(strings) {
   noTools,
   notContains,
   respondsInLanguage,
+  runEvals,
   toolResult,
   toolsCalled
 });