npm - @dvina/agents - Versions diffs - 0.14.0 → 0.17.0 - Mend

@dvina/agents 0.14.0 → 0.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

package/dist/eval/index.d.mts +58 -13
package/dist/eval/index.d.ts +58 -13
package/dist/eval/index.js +672 -29
package/dist/eval/index.js.map +1 -1
package/dist/eval/index.mjs +673 -30
package/dist/eval/index.mjs.map +1 -1
package/dist/index.d.mts +2 -2
package/dist/index.d.ts +2 -2
package/dist/index.js +75 -0
package/dist/index.js.map +1 -1
package/dist/index.mjs +75 -0
package/dist/index.mjs.map +1 -1
package/dist/{model-resolver-DjKRXKtu.d.mts → model-resolver-DSJRvrqA.d.mts} +2 -5
package/dist/{model-resolver-DjKRXKtu.d.ts → model-resolver-DSJRvrqA.d.ts} +2 -5
package/package.json +1 -1

package/dist/eval/index.js CHANGED Viewed

@@ -251,9 +251,7 @@ function convertToLangchainMessages(messages) {
 var MAX_AGENT_LOOPS = 10;
 function stripReasoningBlocks(message) {
   if (!Array.isArray(message.content)) return message;
-  const filtered = message.content.filter(
-    (block) => block.type !== "reasoning" && block.type !== "thinking"
-  );
+  const filtered = message.content.filter((block) => block.type !== "reasoning" && block.type !== "thinking");
   const newContent = filtered.length > 0 ? filtered : "";
   return new import_messages.AIMessage({
     content: newContent,
@@ -299,6 +297,10 @@ function createEvalTarget(modelConfig, modelString) {
       messages.push(new import_messages.SystemMessage(inputs.systemPrompt));
     }
     messages.push(...convertToLangchainMessages(inputs.messages));
+    const stopTools = inputs.executionMode?.type === "stop-after-tool" ? inputs.executionMode.tools : [];
+    const stopCount = inputs.executionMode?.type === "stop-after-tool" ? inputs.executionMode.count ?? 1 : 1;
+    const singleTurn = inputs.executionMode?.type === "single-turn";
+    let cumulativeHits = 0;
     let loopCount = 0;
     while (loopCount < MAX_AGENT_LOOPS) {
       loopCount++;
@@ -308,6 +310,7 @@ function createEvalTarget(modelConfig, modelString) {
       if (!aiMessage.tool_calls || aiMessage.tool_calls.length === 0) {
         break;
       }
+      let shouldStop = false;
       for (const tc of aiMessage.tool_calls) {
         const mockTool = langchainTools.find((t) => t.name === tc.name);
         if (mockTool) {
@@ -328,6 +331,15 @@ function createEvalTarget(modelConfig, modelString) {
             })
           );
         }
+        if (stopTools.includes(tc.name)) {
+          cumulativeHits++;
+          if (cumulativeHits >= stopCount) {
+            shouldStop = true;
+          }
+        }
+      }
+      if (singleTurn || shouldStop) {
+        break;
       }
     }
     return { messages };
@@ -403,14 +415,94 @@ function toolDefsToDefinitions(defs) {
     };
   });
 }
-async function runAgentTarget(createTarget, model, evalMessages, extraToolDefs) {
+// Wraps every tool definition so that each call is recorded in `tracker` and the
+// run is aborted (via `abortController`) once the execution mode's stop condition
+// is met.
+//   defs            - tool definitions; each is read for `name` and `exec(input)`.
+//   tracker         - array that receives one { name, input, output, toolCallId } per call.
+//   abortController - its abort() is called when the run should stop.
+//   executionMode   - object with `type`; for "stop-after-tool", `tools` and an
+//                     optional `count` (defaults to 1) are also read.
+// Returns copies of `defs` whose `exec` is replaced by the tracking wrapper.
+function wrapToolDefsForExecution(defs, tracker, abortController, executionMode) {
+  const stopTools = executionMode.type === "stop-after-tool" ? executionMode.tools : [];
+  const stopCount = executionMode.type === "stop-after-tool" ? executionMode.count ?? 1 : 1;
+  // Shared by all wrapped tools: hits on any stop tool count toward the same total.
+  let cumulativeHits = 0;
+  return defs.map((def) => ({
+    ...def,
+    exec: async (input) => {
+      const result = await def.exec(input);
+      // The tracker stores a string form; the caller still gets the raw result.
+      const output = typeof result === "string" ? result : JSON.stringify(result);
+      tracker.push({
+        name: def.name,
+        input,
+        output,
+        // Evaluated before the push, so the id is the index of this record.
+        toolCallId: `eval_tc_${tracker.length}`
+      });
+      if (executionMode.type === "single-turn") {
+        // Any completed tool call ends a single-turn run.
+        abortController.abort();
+      } else if (stopTools.includes(def.name)) {
+        cumulativeHits++;
+        if (cumulativeHits >= stopCount) {
+          abortController.abort();
+        }
+      }
+      return result;
+    }
+  }));
+}
+// Rebuilds a message trajectory from the tool calls recorded in `trackedCalls`.
+// Starts from the converted input messages; when at least one call was tracked,
+// appends a single AIMessage (empty content) carrying every tracked call as a
+// tool_call, followed by one ToolMessage per call holding its recorded output.
+// Returns the resulting message array.
+function buildTrajectoryFromTrackedCalls(inputMessages, trackedCalls) {
+  const messages = convertToLangchainMessages(inputMessages);
+  if (trackedCalls.length > 0) {
+    // All tracked calls are grouped under one AI message, in recorded order.
+    messages.push(
+      new import_messages.AIMessage({
+        content: "",
+        tool_calls: trackedCalls.map((tc) => ({
+          id: tc.toolCallId,
+          name: tc.name,
+          args: tc.input
+        }))
+      })
+    );
+    for (const tc of trackedCalls) {
+      messages.push(
+        new import_messages.ToolMessage({
+          content: tc.output,
+          tool_call_id: tc.toolCallId,
+          name: tc.name
+        })
+      );
+    }
+  }
+  return messages;
+}
+// Runs the agent produced by `createTarget` against `evalMessages` and returns
+// `{ messages }`. When `executionMode` is given, an AbortController is created,
+// a tool-wrapping callback is passed to `createTarget` as its third argument,
+// and the run is raced against the abort signal; an aborted run yields a
+// trajectory rebuilt from the tracked tool calls. Other errors are rethrown.
+async function runAgentTarget(createTarget, model, evalMessages, extraToolDefs, executionMode) {
   const extraTools = Object.keys(extraToolDefs).length > 0 ? toolDefsToDefinitions(extraToolDefs) : [];
-  const agent = await createTarget(model, extraTools);
-  const result = await agent.run({
-    threadId: `eval_${Date.now()}_${Math.random().toString(36).slice(2)}`,
-    messages: evalMessages
-  });
-  return { messages: agentResultToMessages(evalMessages, result) };
+  // Filled by the wrapped tools; read back only if the run is aborted.
+  const tracker = [];
+  let abortController;
+  if (executionMode) {
+    abortController = new AbortController();
+  }
+  const wrapTools = executionMode && abortController ? (tools) => wrapToolDefsForExecution(tools, tracker, abortController, executionMode) : void 0;
+  const agent = await createTarget(model, extraTools, wrapTools);
+  // Undefined when no execution mode was requested.
+  const signal = abortController?.signal;
+  try {
+    const agentPromise = agent.run({
+      threadId: `eval_${Date.now()}_${Math.random().toString(36).slice(2)}`,
+      messages: evalMessages,
+      signal
+    });
+    if (abortController) {
+      // Rejects with an AbortError as soon as the signal fires, so the race
+      // below ends even if agent.run does not itself react to the signal.
+      const abortPromise = new Promise((_, reject) => {
+        const onAbort = () => reject(new DOMException("Eval execution aborted", "AbortError"));
+        if (signal.aborted) {
+          onAbort();
+          return;
+        }
+        signal.addEventListener("abort", onAbort, { once: true });
+      });
+      const result2 = await Promise.race([agentPromise, abortPromise]);
+      return { messages: agentResultToMessages(evalMessages, result2) };
+    }
+    const result = await agentPromise;
+    return { messages: agentResultToMessages(evalMessages, result) };
+  } catch (error) {
+    // Any error raised after the signal was aborted is treated as an abort too.
+    if (error.name === "AbortError" || signal?.aborted) {
+      return { messages: buildTrajectoryFromTrackedCalls(evalMessages, tracker) };
+    }
+    throw error;
+  }
 }
 // src/eval/suite.ts
@@ -504,7 +596,8 @@ function runEvals() {
                   createTarget,
                   currentModel,
                   preparedMessages,
-                  caseToolDefs
+                  caseToolDefs,
+                  tc.executionMode
                 );
               } else {
                 const target = resolveModelTarget(config, currentModel);
@@ -513,6 +606,7 @@ function runEvals() {
                 output = await target({
                   messages: preparedMessages,
                   tools,
+                  executionMode: tc.executionMode,
                   ...systemPrompt ? { systemPrompt } : {}
                 });
               }
@@ -574,8 +668,460 @@ function createLanguageEvaluator(modelConfig, model) {
   };
 }
-// src/eval/evaluators/response-content.ts
+// src/eval/evaluators/llm-judge.ts
+var import_messages6 = require("@langchain/core/messages");
+// node_modules/openevals/dist/utils.js
 var import_messages4 = require("@langchain/core/messages");
+var openAIImports = __toESM(require("@langchain/openai"), 1);
+var import_jestlike = require("langsmith/utils/jestlike");
+var import_traceable = require("langsmith/traceable");
+var {
+  // @ts-expect-error Shim for older versions of @langchain/openai
+  _convertMessagesToOpenAIParams,
+  convertMessagesToCompletionsMessageParams
+} = openAIImports;
+// Converts one message using whichever converter the destructured
+// @langchain/openai import provides: `_convertMessagesToOpenAIParams` when it is
+// a function, otherwise `convertMessagesToCompletionsMessageParams`.
+// The message is wrapped in a one-element list and the first result is returned.
+function _convertMessagesShim(message) {
+  if (typeof _convertMessagesToOpenAIParams === "function") {
+    return _convertMessagesToOpenAIParams([
+      message
+    ])[0];
+  }
+  return convertMessagesToCompletionsMessageParams({
+    messages: [message]
+  })[0];
+}
+// Returns `message` unchanged unless `isBaseMessage` reports true for it, in
+// which case it is converted via _convertMessagesShim. The original `id` is
+// copied onto the converted message when the conversion did not set one.
+var _convertToOpenAIMessage = (message) => {
+  if ((0, import_messages4.isBaseMessage)(message)) {
+    const converted = _convertMessagesShim(message);
+    if (message.id && !converted.id) {
+      converted.id = message.id;
+    }
+    return converted;
+  } else {
+    return message;
+  }
+};
+// Accepts an array of messages, an object with a `messages` array, or a single
+// object having both `content` and `role`, and returns an array in which every
+// element has been passed through _convertToOpenAIMessage.
+// Throws an Error for any other non-array object.
+var _normalizeToOpenAIMessagesList = (messages) => {
+  let messagesList;
+  if (!Array.isArray(messages)) {
+    if ("messages" in messages && Array.isArray(messages.messages)) {
+      messagesList = messages.messages;
+    } else if ("content" in messages && "role" in messages) {
+      // A single message object is treated as a one-element list.
+      messagesList = [messages];
+    } else {
+      throw new Error(`If passing messages as an object, it must contain a "messages" key`);
+    }
+  } else {
+    messagesList = messages;
+  }
+  return messagesList.map(_convertToOpenAIMessage);
+};
+// Normalizes one score value into a tuple. The first parameter is unused.
+// - Object containing "score": returns [score, reasoning, metadata, sourceRunId],
+//   where reasoning is kept only when it is a string.
+// - Any other value whose typeof is "object" (this includes null): throws.
+// - Anything else: returns [value].
+var processScore = (_, value) => {
+  if (typeof value === "object") {
+    if (value != null && "score" in value) {
+      return [
+        value.score,
+        "reasoning" in value && typeof value.reasoning === "string" ? value.reasoning : void 0,
+        value.metadata,
+        value.sourceRunId
+      ];
+    } else {
+      throw new Error(`Expected a dictionary with a "score" key, but got "${JSON.stringify(value, null, 2)}"`);
+    }
+  }
+  return [value];
+};
+// Runs `scorer` and shapes its output into evaluator results.
+//   runName          - name used for the wrapped/traced run and its metadata.
+//   scorer           - async function called with the evaluator params.
+//   feedbackKey      - `key` used when the scorer returns a single score.
+//   extra            - params forwarded to the scorer ({} when nullish).
+//   ls_framework     - metadata value; falls back to "openevals".
+//   returnRawOutputs - when truthy the scorer's return value is passed through.
+// Executed through `wrapEvaluator` when isInTestContext() is true, otherwise
+// through `traceable`.
+async function _runEvaluatorUntyped(runName, scorer, feedbackKey, extra, ls_framework, returnRawOutputs) {
+  const runScorer = async (params) => {
+    let score = await scorer(params);
+    if (returnRawOutputs) {
+      return score;
+    }
+    let reasoning;
+    // NOTE(review): a null score satisfies this condition (typeof null is
+    // "object") and would reach Object.entries(null) — confirm scorers never
+    // return null.
+    if (!Array.isArray(score) && typeof score === "object") {
+      // Object result: one feedback entry per key.
+      const results = [];
+      for (const [key, value] of Object.entries(score)) {
+        const [keyScore, reasoning2, metadata, sourceRunId] = processScore(
+          key,
+          // eslint-disable-next-line @typescript-eslint/no-explicit-any
+          value
+        );
+        const result = {
+          key,
+          score: keyScore,
+          comment: reasoning2,
+          metadata
+        };
+        if (sourceRunId !== void 0 && typeof sourceRunId === "string") {
+          result.sourceRunId = sourceRunId;
+        }
+        results.push(result);
+      }
+      return results;
+    } else {
+      let metadata;
+      // Array result is read positionally as [score, reasoning, metadata].
+      if (Array.isArray(score)) {
+        metadata = score[2];
+        reasoning = score[1];
+        score = score[0];
+      }
+      return {
+        key: feedbackKey,
+        score,
+        comment: reasoning,
+        metadata
+      };
+    }
+  };
+  if ((0, import_jestlike.isInTestContext)()) {
+    const res = await (0, import_jestlike.wrapEvaluator)(runScorer)(extra ?? {}, {
+      name: runName,
+      metadata: {
+        __ls_framework: ls_framework ?? "openevals",
+        __ls_evaluator: runName,
+        __ls_language: "js"
+      }
+    });
+    if (returnRawOutputs) {
+      const rawResults = res;
+      return rawResults;
+    }
+    return res;
+  } else {
+    const traceableRunScorer = (0, import_traceable.traceable)(runScorer, {
+      name: runName,
+      metadata: {
+        __ls_framework: ls_framework ?? "openevals",
+        __ls_evaluator: runName,
+        __ls_language: "js"
+      }
+    });
+    const res = await traceableRunScorer(extra ?? {});
+    return res;
+  }
+}
+// node_modules/openevals/dist/json/match.js
+var import_traceable3 = require("langsmith/traceable");
+// node_modules/openevals/dist/llm.js
+var import_runnables = require("@langchain/core/runnables");
+var import_prompts = require("@langchain/core/prompts");
+var import_messages5 = require("@langchain/core/messages");
+var import_json_schema = require("@langchain/core/utils/json_schema");
+var import_universal = require("langchain/chat_models/universal");
+var import_traceable2 = require("langsmith/traceable");
+// Delegates to Runnable.isRunnable to test whether `prompt` is a runnable.
+function _isRunnableInterface(prompt) {
+  return import_runnables.Runnable.isRunnable(prompt);
+}
+// True when `prompt` is a runnable that has a non-nullish `schema` property.
+function _isStructuredPrompt(prompt) {
+  return _isRunnableInterface(prompt) && "schema" in prompt && prompt.schema != null;
+}
+// Duck-type check: anything exposing a `parse` function is treated as a Zod schema.
+function isZodSchema(input) {
+  return typeof input?.parse === "function";
+}
+// True when `x` is a non-null object whose `_modelType()` method returns
+// "base_chat_model".
+function _isBaseChatModel(x) {
+  const model = x;
+  return x != null && typeof x === "object" && typeof model._modelType === "function" && model._modelType() === "base_chat_model";
+}
+// Appends the few-shot examples, rendered as <example> blocks, to the content of
+// the LAST message whose role is "user". The message is modified in place and
+// the same `messages` array is returned. Throws if no user message exists.
+function appendFewShotExamples({ messages, fewShotExamples }) {
+  // Index within the reversed copy; converted back to a forward index below.
+  const lastUserMessageIdx = messages.slice().reverse().findIndex((msg) => msg.role === "user");
+  if (lastUserMessageIdx === -1) {
+    throw new Error("Appending few-shot examples requires a user message in the provided prompt");
+  }
+  const actualIdx = messages.length - 1 - lastUserMessageIdx;
+  messages[actualIdx].content += "\n\n" + fewShotExamples.map((example) => {
+    let exampleStr = `<example>
+<input>${JSON.stringify(example.inputs)}</input>
+<output>${JSON.stringify(example.outputs)}</output>`;
+    if (example.reasoning) {
+      exampleStr += `
+<reasoning>${example.reasoning}</reasoning>`;
+    }
+    if (example.score !== void 0) {
+      exampleStr += `
+<score>${example.score}</score>`;
+    }
+    exampleStr += "\n</example>";
+    return exampleStr;
+  }).join("\n");
+  return messages;
+}
+// Builds the default JSON schema for the judge's structured output and returns
+// [jsonSchema, description]. The score property is:
+//   - a number restricted to `choices` when `choices` is given,
+//   - otherwise a number when `continuous` is truthy,
+//   - otherwise a boolean.
+// When `useReasoning` is truthy a required string `reasoning` property is added.
+function constructDefaultOutputJsonSchema({ continuous, choices, useReasoning }) {
+  const jsonSchema = {
+    type: "object",
+    additionalProperties: false
+  };
+  let description;
+  let scoreSchema;
+  // `choices` takes precedence over `continuous`.
+  if (choices) {
+    description = "A number that represents the degree to which the criteria in the prompt are met.";
+    scoreSchema = {
+      type: "number",
+      description,
+      enum: choices
+    };
+  } else if (continuous) {
+    description = "A number that represents the degree to which the criteria in the prompt are met, from 0.0 to 1.0. 1.0 means the criteria are met perfectly. 0.0 means none of the criteria are met, 0.5 means exactly half of the criteria are met.";
+    scoreSchema = {
+      type: "number",
+      description
+    };
+  } else {
+    description = "A score that is true if criteria in the prompt are met, and false otherwise.";
+    scoreSchema = {
+      type: "boolean",
+      description
+    };
+  }
+  if (useReasoning) {
+    jsonSchema.properties = {
+      reasoning: {
+        type: "string",
+        description: "A human-readable explanation of the score. You MUST end the reasoning with a sentence that says: Thus, the score should be: SCORE_YOU_ASSIGN."
+      },
+      score: scoreSchema
+    };
+    jsonSchema.required = ["reasoning", "score"];
+  } else {
+    jsonSchema.properties = {
+      score: scoreSchema
+    };
+    jsonSchema.required = ["score"];
+  }
+  return [jsonSchema, description];
+}
+// Turns a prompt parameter into a string: strings are returned as is; base
+// messages (alone, in an array, or under a `messages` key) are converted with
+// _convertToOpenAIMessage before JSON serialization; everything else is
+// JSON.stringify'd directly.
+function _stringifyPromptParam(param) {
+  if (typeof param === "string") {
+    return param;
+  } else if ((0, import_messages5.isBaseMessage)(param)) {
+    return JSON.stringify(_convertToOpenAIMessage(param));
+  } else if (typeof param === "object" && param !== null) {
+    if (Array.isArray(param)) {
+      return JSON.stringify(param.map((message) => (0, import_messages5.isBaseMessage)(message) ? _convertToOpenAIMessage(message) : message));
+    }
+    const objParam = param;
+    if ("messages" in objParam && Array.isArray(objParam.messages)) {
+      // NOTE(review): `objParam` aliases `param`, so this assignment replaces
+      // `messages` on the caller's object — confirm that side effect is intended.
+      objParam.messages = objParam.messages.map((message) => (0, import_messages5.isBaseMessage)(message) ? _convertToOpenAIMessage(message) : message);
+      return JSON.stringify(objParam);
+    }
+    return JSON.stringify(param);
+  }
+  return JSON.stringify(param);
+}
+// Creates the async scoring function used by the LLM-as-judge evaluator.
+// `params` is read for: prompt, system, model, continuous, choices,
+// fewShotExamples, schema, judge and useReasoning (defaults to true).
+// The returned `getScore(params2)`:
+//   1. stringifies inputs/outputs/referenceOutputs and any extra params,
+//   2. formats the prompt (runnable, string template, or function),
+//   3. prepends the optional system message and appends few-shot examples,
+//   4. calls the judge (a chat model, or otherwise an object used through
+//      `chat.completions.create`) requesting structured output,
+//   5. returns the raw structured response when a custom schema is in use,
+//      otherwise `[score, reasoning]` or just `score`.
+var _createLLMAsJudgeScorer = (params) => {
+  const { prompt, system, model, continuous, choices, fewShotExamples } = params;
+  let schema;
+  if (isZodSchema(params.schema)) {
+    schema = (0, import_json_schema.toJsonSchema)(params.schema);
+  } else {
+    schema = params.schema;
+  }
+  // Held in the closure: once initialized below it is reused by later calls.
+  let judge = params.judge;
+  const useReasoning = params.useReasoning ?? true;
+  const getScore = async (params2) => {
+    const { inputs, outputs, referenceOutputs, ...rest } = params2;
+    if (system && typeof prompt !== "string") {
+      throw new Error("`system` is only supported when `prompt` is a string template");
+    }
+    let stringifiedInputs = inputs;
+    let stringifiedOutputs = outputs;
+    let stringifiedReferenceOutputs = referenceOutputs;
+    if (inputs) {
+      stringifiedInputs = _stringifyPromptParam(inputs);
+    }
+    if (outputs) {
+      stringifiedOutputs = _stringifyPromptParam(outputs);
+    }
+    if (referenceOutputs) {
+      stringifiedReferenceOutputs = _stringifyPromptParam(referenceOutputs);
+    }
+    const stringifiedRest = Object.fromEntries(Object.entries(rest).map(([key, value]) => [
+      key,
+      _stringifyPromptParam(value)
+    ]));
+    let messages = [];
+    const promptParams = {
+      inputs: stringifiedInputs,
+      outputs: stringifiedOutputs,
+      reference_outputs: stringifiedReferenceOutputs,
+      ...stringifiedRest
+    };
+    // Undefined values are dropped before they are handed to the template.
+    const filteredPromptParams = Object.fromEntries(Object.entries(promptParams).filter(([_, value]) => value !== void 0));
+    if (_isRunnableInterface(prompt)) {
+      const formattedPrompt = await prompt.invoke(filteredPromptParams);
+      messages = formattedPrompt.messages;
+      if (_isStructuredPrompt(prompt)) {
+        // A structured prompt's own schema replaces any schema set above.
+        schema = prompt.schema;
+      }
+    } else if (typeof prompt === "string") {
+      const template = import_prompts.ChatPromptTemplate.fromTemplate(prompt);
+      const formattedPrompt = await template.invoke(filteredPromptParams);
+      messages = formattedPrompt.messages;
+    } else {
+      // Function prompts receive the original (non-stringified) values.
+      messages = await prompt({
+        inputs,
+        outputs,
+        reference_outputs: referenceOutputs,
+        ...rest
+      });
+    }
+    if (system) {
+      messages = [{ role: "system", content: system }, ...messages];
+    }
+    let normalizedMessages = _normalizeToOpenAIMessagesList(messages);
+    if (fewShotExamples) {
+      normalizedMessages = appendFewShotExamples({
+        messages: normalizedMessages,
+        fewShotExamples
+      });
+    }
+    const [defaultJsonSchema, description] = constructDefaultOutputJsonSchema({
+      continuous,
+      choices,
+      useReasoning
+    });
+    if (!judge) {
+      if (!model) {
+        throw new Error("`model` string is required (e.g. 'openai:o3-mini') when `judge` is not provided");
+      }
+      judge = await (0, import_universal.initChatModel)(model);
+    }
+    let response;
+    if (_isBaseChatModel(judge)) {
+      const judgeWithStructuredOutput = judge.withStructuredOutput(schema ?? {
+        title: "score",
+        description,
+        ...defaultJsonSchema
+      });
+      response = await judgeWithStructuredOutput.invoke(normalizedMessages);
+      if (schema === void 0) {
+        if (useReasoning) {
+          return [response.score, response.reasoning];
+        }
+        return response.score;
+      } else {
+        return response;
+      }
+    } else {
+      // Non-chat-model judge: used as a client exposing chat.completions.create.
+      if (!model) {
+        throw new Error("`model` string is required (e.g. 'openai:o3-mini') when `judge` is an OpenAI client");
+      }
+      let openaiJsonSchema = schema ?? defaultJsonSchema;
+      // A bare JSON schema (no `name`) is wrapped into the { name, strict, schema } form.
+      if (openaiJsonSchema.name === void 0) {
+        openaiJsonSchema = {
+          name: "score",
+          strict: true,
+          schema: openaiJsonSchema
+        };
+      }
+      // NOTE(review): the message below spells the option "ouputSchema"; the
+      // parameter in createLLMAsJudge is `outputSchema` — looks like a typo.
+      if (openaiJsonSchema.schema == null || typeof openaiJsonSchema.schema !== "object") {
+        throw new Error("`ouputSchema` must be JSON schema or OpenAI structured output format when using an OpenAI client directly");
+      }
+      if (!("additionalProperties" in openaiJsonSchema.schema)) {
+        openaiJsonSchema.schema.additionalProperties = false;
+      }
+      const params3 = {
+        messages: normalizedMessages,
+        // A leading "openai:" provider prefix is stripped from the model name.
+        model: model.startsWith("openai:") ? model.slice("openai:".length) : model,
+        response_format: {
+          type: "json_schema",
+          json_schema: openaiJsonSchema
+        }
+      };
+      const invokeLlm = (0, import_traceable2.traceable)(judge.chat.completions.create.bind(judge.chat.completions), {
+        metadata: {
+          ls_provider: "openai",
+          ls_model_name: model,
+          ls_model_type: "chat"
+        },
+        run_type: "llm",
+        name: "OpenAI Chat Completion"
+      });
+      const response2 = await invokeLlm(params3);
+      const parsed = JSON.parse(response2.choices[0].message.content);
+      if (schema === void 0) {
+        if (useReasoning) {
+          return [parsed.score, parsed.reasoning];
+        }
+        return parsed.score;
+      }
+      return parsed;
+    }
+  };
+  return getScore;
+};
+// Public factory for an LLM-as-judge evaluator. Builds a scorer with
+// _createLLMAsJudgeScorer and returns an async function that runs it through
+// _runEvaluatorUntyped. Raw outputs are returned when an `outputSchema` is
+// given or the prompt is a structured prompt.
+// Throws if both `outputSchema` and a structured prompt are supplied.
+// Defaults: feedbackKey "score", continuous false, useReasoning true.
+function createLLMAsJudge({ prompt, feedbackKey = "score", model, system, judge, continuous = false, choices, useReasoning = true, fewShotExamples, outputSchema }) {
+  if (outputSchema !== void 0 && _isStructuredPrompt(prompt)) {
+    throw new Error("You may not provide both an `outputSchema` parameter and a LangChain prompt with output schema.");
+  }
+  const scorer = _createLLMAsJudgeScorer({
+    prompt,
+    judge,
+    model,
+    system,
+    continuous,
+    choices,
+    useReasoning,
+    fewShotExamples,
+    schema: outputSchema
+  });
+  const _wrappedEvaluator = async (inputs) => {
+    // NOTE(review): as written, a custom feedbackKey yields "llm_as_judge" and
+    // the default "score" yields "llm_as_score_judge"; the branches look
+    // swapped — confirm against upstream before relying on run names.
+    const runName = feedbackKey !== "score" ? "llm_as_judge" : `llm_as_${feedbackKey}_judge`;
+    return _runEvaluatorUntyped(runName, scorer, feedbackKey, inputs, void 0, outputSchema !== void 0 || _isStructuredPrompt(prompt));
+  };
+  return _wrappedEvaluator;
+}
+// node_modules/openevals/dist/code/base.js
+var import_universal2 = require("langchain/chat_models/universal");
+var import_prompts2 = require("@langchain/core/prompts");
+// node_modules/openevals/dist/simulators/multiturn.js
+var import_traceable4 = require("langsmith/traceable");
+// node_modules/openevals/dist/simulators/prebuilts.js
+var import_universal3 = require("langchain/chat_models/universal");
+// src/eval/evaluators/llm-judge.ts
+var RESPONSE_CRITERIA_PROMPT = `You are an expert evaluator.
+Assess the following AI response based on the given criteria.
+<Criteria>
+{criteria}
+</Criteria>
+<Response>
+{outputs}
+</Response>
+Grade whether the response meets the criteria.`;
+// Builds an evaluator that grades the final AI response against `criteria`.
+// The judge model is resolved from `modelConfig`/`model` via
+// LangchainModelResolver and used with RESPONSE_CRITERIA_PROMPT under the
+// feedback key "llm_judge".
+// The returned async evaluator takes `{ outputs }`, picks the last AIMessage in
+// `outputs.messages`, and returns a failing result when none is present.
+function createLlmJudgeEvaluator(modelConfig, model, criteria) {
+  const resolver = new LangchainModelResolver(modelConfig);
+  const judge = resolver.resolve(model);
+  const llmJudge2 = createLLMAsJudge({
+    prompt: RESPONSE_CRITERIA_PROMPT,
+    feedbackKey: "llm_judge",
+    judge,
+    useReasoning: true
+  });
+  return async ({ outputs }) => {
+    const messages = outputs.messages || [];
+    // Search from the end on a copy so `messages` itself is not reversed.
+    const lastAiMessage = [...messages].reverse().find((m) => m instanceof import_messages6.AIMessage);
+    if (!lastAiMessage) {
+      return { key: "llm_judge", score: false, comment: "No AI message found in trajectory" };
+    }
+    // Non-string content is serialized to JSON before being judged.
+    const responseText = typeof lastAiMessage.content === "string" ? lastAiMessage.content : JSON.stringify(lastAiMessage.content);
+    return llmJudge2({
+      outputs: responseText,
+      criteria
+    });
+  };
+}
+// src/eval/evaluators/response-content.ts
+var import_messages7 = require("@langchain/core/messages");
 function createResponseContentEvaluator() {
   return async ({
     outputs,
@@ -587,7 +1133,7 @@ function createResponseContentEvaluator() {
       return { key: "response_content", score: true, comment: "No content assertions specified, skipping" };
     }
     const messages = outputs.messages || [];
-    const lastAiMessage = [...messages].reverse().find((m) => m instanceof import_messages4.AIMessage);
+    const lastAiMessage = [...messages].reverse().find((m) => m instanceof import_messages7.AIMessage);
     if (!lastAiMessage) {
       return { key: "response_content", score: false, comment: "No AI message found in trajectory" };
     }
@@ -613,7 +1159,7 @@ function createResponseContentEvaluator() {
 }
 // src/eval/evaluators/no-tool-calls.ts
-var import_messages5 = require("@langchain/core/messages");
+var import_messages8 = require("@langchain/core/messages");
 function createNoToolCallsEvaluator() {
   return async ({
     outputs,
@@ -624,7 +1170,7 @@ function createNoToolCallsEvaluator() {
     }
     const messages = outputs.messages || [];
     const exceptTools = referenceOutputs?.exceptTools ?? [];
-    const toolCalls = messages.filter((m) => m instanceof import_messages5.AIMessage).flatMap((m) => m.tool_calls || []);
+    const toolCalls = messages.filter((m) => m instanceof import_messages8.AIMessage).flatMap((m) => m.tool_calls || []);
     const disallowedCalls = exceptTools.length > 0 ? toolCalls.filter((tc) => !exceptTools.includes(tc.name)) : toolCalls;
     const passed = disallowedCalls.length === 0;
     if (exceptTools.length > 0) {
@@ -643,7 +1189,7 @@ function createNoToolCallsEvaluator() {
 }
 // src/eval/evaluators/any-tool-called.ts
-var import_messages6 = require("@langchain/core/messages");
+var import_messages9 = require("@langchain/core/messages");
 function createAnyToolCalledEvaluator() {
   return async ({
     outputs,
@@ -654,7 +1200,7 @@ function createAnyToolCalledEvaluator() {
     }
     const expectedTools = referenceOutputs?.anyToolsExpected ?? [];
     const messages = outputs.messages || [];
-    const calledToolNames = messages.filter((m) => m instanceof import_messages6.AIMessage).flatMap((m) => m.tool_calls || []).map((tc) => tc.name);
+    const calledToolNames = messages.filter((m) => m instanceof import_messages9.AIMessage).flatMap((m) => m.tool_calls || []).map((tc) => tc.name);
     if (expectedTools.length === 0) {
       const passed2 = calledToolNames.length > 0;
       return {
@@ -673,6 +1219,82 @@ function createAnyToolCalledEvaluator() {
   };
 }
+// src/eval/evaluators/tool-input.ts
+var import_messages10 = require("@langchain/core/messages");
+// Creates an evaluator (key "tool_input") that checks the tool calls found on
+// AIMessages in `outputs.messages` against
+// `referenceOutputs.toolInputExpectations`. Each expectation is read for:
+//   name     - tool name to match,
+//   times    - optional MINIMUM number of calls,
+//   validate - optional predicate over a call's `args`.
+// With neither `times` nor `validate`, the tool must simply have been called.
+// The score is true only if every expectation passes; with no expectations the
+// evaluator passes immediately.
+function createToolInputEvaluator() {
+  return async ({
+    outputs,
+    referenceOutputs
+  }) => {
+    const expectations = referenceOutputs?.toolInputExpectations ?? [];
+    if (expectations.length === 0) {
+      return { key: "tool_input", score: true, comment: "No tool input expectations specified, skipping" };
+    }
+    const messages = outputs.messages || [];
+    const allToolCalls = messages.filter((m) => m instanceof import_messages10.AIMessage).flatMap((m) => m.tool_calls || []);
+    const results = [];
+    for (const expectation of expectations) {
+      const matchingCalls = allToolCalls.filter((tc) => tc.name === expectation.name);
+      const subChecks = [];
+      let passed = true;
+      if (expectation.times !== void 0) {
+        // `times` is a lower bound, not an exact count.
+        const countOk = matchingCalls.length >= expectation.times;
+        if (!countOk) {
+          passed = false;
+          subChecks.push(
+            `expected at least ${expectation.times} call(s), got ${matchingCalls.length}`
+          );
+        } else {
+          subChecks.push(
+            `call count OK (${matchingCalls.length} >= ${expectation.times})`
+          );
+        }
+      }
+      if (expectation.validate) {
+        if (matchingCalls.length === 0) {
+          passed = false;
+          subChecks.push("was never called");
+        } else {
+          // One valid call is enough; a throwing validator counts as invalid.
+          const anyValid = matchingCalls.some((tc) => {
+            try {
+              return expectation.validate(tc.args);
+            } catch {
+              return false;
+            }
+          });
+          if (!anyValid) {
+            passed = false;
+            subChecks.push(
+              `input validation failed for all ${matchingCalls.length} call(s)`
+            );
+          } else {
+            subChecks.push("input validation passed");
+          }
+        }
+      }
+      if (expectation.times === void 0 && !expectation.validate) {
+        if (matchingCalls.length === 0) {
+          passed = false;
+          subChecks.push("was never called");
+        } else {
+          subChecks.push(`called ${matchingCalls.length} time(s)`);
+        }
+      }
+      results.push({
+        name: expectation.name,
+        passed,
+        comment: `"${expectation.name}": ${subChecks.join(", ")}`
+      });
+    }
+    const allPassed = results.every((r) => r.passed);
+    return {
+      key: "tool_input",
+      score: allPassed,
+      comment: results.map((r) => r.comment).join("; ")
+    };
+  };
+}
 // src/eval/expectations.ts
 function withTrajectoryGuard(evaluator, key) {
   return async ({ outputs, referenceOutputs }) => {
@@ -699,27 +1321,48 @@ function buildTrajectory(message, toolNames) {
   return trajectory;
 }
+// Expectation factory: `tools` entries are either tool-name strings or objects
+// with a `name`. Object entries are additionally forwarded to the tool-input
+// evaluator as `toolInputExpectations`.
+// Returns a function of `ctx` producing `{ evaluator, referenceOutputs }`, where
+// the reference trajectory is built from `ctx.message` and the tool names.
 function toolsCalled(tools) {
-  return (ctx) => ({
-    evaluator: ls2.wrapEvaluator(
+  const toolNames = tools.map((t) => typeof t === "string" ? t : t.name);
+  const validators = tools.filter((t) => typeof t !== "string");
+  return (ctx) => {
+    const trajectoryEvaluator = ls2.wrapEvaluator(
       withTrajectoryGuard(
         (0, import_agentevals.createTrajectoryMatchEvaluator)({ trajectoryMatchMode: "superset", toolArgsMatchMode: "ignore" }),
         "trajectory_match"
       )
-    ),
-    referenceOutputs: { referenceTrajectory: buildTrajectory(ctx.message, tools) }
-  });
+    );
+    // Only string entries: the trajectory evaluator alone is returned.
+    if (validators.length === 0) {
+      return {
+        evaluator: trajectoryEvaluator,
+        referenceOutputs: { referenceTrajectory: buildTrajectory(ctx.message, toolNames) }
+      };
+    }
+    const inputEvaluator = ls2.wrapEvaluator(createToolInputEvaluator());
+    // NOTE(review): unlike the branch above, the composed evaluator returned
+    // below is not itself passed through ls2.wrapEvaluator — confirm intended.
+    const composedEvaluator = async ({ outputs, referenceOutputs }) => {
+      const trajectoryResult = await trajectoryEvaluator({ outputs, referenceOutputs });
+      const inputResult = await inputEvaluator({
+        outputs,
+        referenceOutputs: { ...referenceOutputs, toolInputExpectations: validators }
+      });
+      const trajectoryPassed = Boolean(trajectoryResult.score);
+      const inputPassed = Boolean(inputResult.score);
+      // Both checks must pass; their comments are joined, skipping empty ones.
+      return {
+        key: "tools_called",
+        score: trajectoryPassed && inputPassed,
+        comment: [trajectoryResult.comment, inputResult.comment].filter(Boolean).join("; ")
+      };
+    };
+    return {
+      evaluator: composedEvaluator,
+      referenceOutputs: { referenceTrajectory: buildTrajectory(ctx.message, toolNames) }
+    };
+  };
 }
-function llmJudge() {
+function llmJudge(criteria) {
   return () => {
     const config = getEvalConfig();
     const model = config.evaluatorModel;
     return {
-      evaluator: ls2.wrapEvaluator(
-        withTrajectoryGuard(
-          (0, import_agentevals.createTrajectoryLLMAsJudge)({ prompt: import_agentevals.TRAJECTORY_ACCURACY_PROMPT, model }),
-          "trajectory_llm_judge"
-        )
-      ),
+      evaluator: ls2.wrapEvaluator(createLlmJudgeEvaluator(config.modelConfig, model, criteria)),
       referenceOutputs: {}
     };
   };