@dvina/agents 0.14.0 → 0.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -251,9 +251,7 @@ function convertToLangchainMessages(messages) {
251
251
  var MAX_AGENT_LOOPS = 10;
252
252
  function stripReasoningBlocks(message) {
253
253
  if (!Array.isArray(message.content)) return message;
254
- const filtered = message.content.filter(
255
- (block) => block.type !== "reasoning" && block.type !== "thinking"
256
- );
254
+ const filtered = message.content.filter((block) => block.type !== "reasoning" && block.type !== "thinking");
257
255
  const newContent = filtered.length > 0 ? filtered : "";
258
256
  return new import_messages.AIMessage({
259
257
  content: newContent,
@@ -299,6 +297,10 @@ function createEvalTarget(modelConfig, modelString) {
299
297
  messages.push(new import_messages.SystemMessage(inputs.systemPrompt));
300
298
  }
301
299
  messages.push(...convertToLangchainMessages(inputs.messages));
300
+ const stopTools = inputs.executionMode?.type === "stop-after-tool" ? inputs.executionMode.tools : [];
301
+ const stopCount = inputs.executionMode?.type === "stop-after-tool" ? inputs.executionMode.count ?? 1 : 1;
302
+ const singleTurn = inputs.executionMode?.type === "single-turn";
303
+ let cumulativeHits = 0;
302
304
  let loopCount = 0;
303
305
  while (loopCount < MAX_AGENT_LOOPS) {
304
306
  loopCount++;
@@ -308,6 +310,7 @@ function createEvalTarget(modelConfig, modelString) {
308
310
  if (!aiMessage.tool_calls || aiMessage.tool_calls.length === 0) {
309
311
  break;
310
312
  }
313
+ let shouldStop = false;
311
314
  for (const tc of aiMessage.tool_calls) {
312
315
  const mockTool = langchainTools.find((t) => t.name === tc.name);
313
316
  if (mockTool) {
@@ -328,6 +331,15 @@ function createEvalTarget(modelConfig, modelString) {
328
331
  })
329
332
  );
330
333
  }
334
+ if (stopTools.includes(tc.name)) {
335
+ cumulativeHits++;
336
+ if (cumulativeHits >= stopCount) {
337
+ shouldStop = true;
338
+ }
339
+ }
340
+ }
341
+ if (singleTurn || shouldStop) {
342
+ break;
331
343
  }
332
344
  }
333
345
  return { messages };
@@ -403,14 +415,94 @@ function toolDefsToDefinitions(defs) {
403
415
  };
404
416
  });
405
417
  }
406
- async function runAgentTarget(createTarget, model, evalMessages, extraToolDefs) {
418
+ function wrapToolDefsForExecution(defs, tracker, abortController, executionMode) {
419
+ const stopTools = executionMode.type === "stop-after-tool" ? executionMode.tools : [];
420
+ const stopCount = executionMode.type === "stop-after-tool" ? executionMode.count ?? 1 : 1;
421
+ let cumulativeHits = 0;
422
+ return defs.map((def) => ({
423
+ ...def,
424
+ exec: async (input) => {
425
+ const result = await def.exec(input);
426
+ const output = typeof result === "string" ? result : JSON.stringify(result);
427
+ tracker.push({
428
+ name: def.name,
429
+ input,
430
+ output,
431
+ toolCallId: `eval_tc_${tracker.length}`
432
+ });
433
+ if (executionMode.type === "single-turn") {
434
+ abortController.abort();
435
+ } else if (stopTools.includes(def.name)) {
436
+ cumulativeHits++;
437
+ if (cumulativeHits >= stopCount) {
438
+ abortController.abort();
439
+ }
440
+ }
441
+ return result;
442
+ }
443
+ }));
444
+ }
445
+ function buildTrajectoryFromTrackedCalls(inputMessages, trackedCalls) {
446
+ const messages = convertToLangchainMessages(inputMessages);
447
+ if (trackedCalls.length > 0) {
448
+ messages.push(
449
+ new import_messages.AIMessage({
450
+ content: "",
451
+ tool_calls: trackedCalls.map((tc) => ({
452
+ id: tc.toolCallId,
453
+ name: tc.name,
454
+ args: tc.input
455
+ }))
456
+ })
457
+ );
458
+ for (const tc of trackedCalls) {
459
+ messages.push(
460
+ new import_messages.ToolMessage({
461
+ content: tc.output,
462
+ tool_call_id: tc.toolCallId,
463
+ name: tc.name
464
+ })
465
+ );
466
+ }
467
+ }
468
+ return messages;
469
+ }
470
+ async function runAgentTarget(createTarget, model, evalMessages, extraToolDefs, executionMode) {
407
471
  const extraTools = Object.keys(extraToolDefs).length > 0 ? toolDefsToDefinitions(extraToolDefs) : [];
408
- const agent = await createTarget(model, extraTools);
409
- const result = await agent.run({
410
- threadId: `eval_${Date.now()}_${Math.random().toString(36).slice(2)}`,
411
- messages: evalMessages
412
- });
413
- return { messages: agentResultToMessages(evalMessages, result) };
472
+ const tracker = [];
473
+ let abortController;
474
+ if (executionMode) {
475
+ abortController = new AbortController();
476
+ }
477
+ const wrapTools = executionMode && abortController ? (tools) => wrapToolDefsForExecution(tools, tracker, abortController, executionMode) : void 0;
478
+ const agent = await createTarget(model, extraTools, wrapTools);
479
+ const signal = abortController?.signal;
480
+ try {
481
+ const agentPromise = agent.run({
482
+ threadId: `eval_${Date.now()}_${Math.random().toString(36).slice(2)}`,
483
+ messages: evalMessages,
484
+ signal
485
+ });
486
+ if (abortController) {
487
+ const abortPromise = new Promise((_, reject) => {
488
+ const onAbort = () => reject(new DOMException("Eval execution aborted", "AbortError"));
489
+ if (signal.aborted) {
490
+ onAbort();
491
+ return;
492
+ }
493
+ signal.addEventListener("abort", onAbort, { once: true });
494
+ });
495
+ const result2 = await Promise.race([agentPromise, abortPromise]);
496
+ return { messages: agentResultToMessages(evalMessages, result2) };
497
+ }
498
+ const result = await agentPromise;
499
+ return { messages: agentResultToMessages(evalMessages, result) };
500
+ } catch (error) {
501
+ if (error.name === "AbortError" || signal?.aborted) {
502
+ return { messages: buildTrajectoryFromTrackedCalls(evalMessages, tracker) };
503
+ }
504
+ throw error;
505
+ }
414
506
  }
415
507
 
416
508
  // src/eval/suite.ts
@@ -504,7 +596,8 @@ function runEvals() {
504
596
  createTarget,
505
597
  currentModel,
506
598
  preparedMessages,
507
- caseToolDefs
599
+ caseToolDefs,
600
+ tc.executionMode
508
601
  );
509
602
  } else {
510
603
  const target = resolveModelTarget(config, currentModel);
@@ -513,6 +606,7 @@ function runEvals() {
513
606
  output = await target({
514
607
  messages: preparedMessages,
515
608
  tools,
609
+ executionMode: tc.executionMode,
516
610
  ...systemPrompt ? { systemPrompt } : {}
517
611
  });
518
612
  }
@@ -574,8 +668,460 @@ function createLanguageEvaluator(modelConfig, model) {
574
668
  };
575
669
  }
576
670
 
577
- // src/eval/evaluators/response-content.ts
671
+ // src/eval/evaluators/llm-judge.ts
672
+ var import_messages6 = require("@langchain/core/messages");
673
+
674
+ // node_modules/openevals/dist/utils.js
578
675
  var import_messages4 = require("@langchain/core/messages");
676
+ var openAIImports = __toESM(require("@langchain/openai"), 1);
677
+ var import_jestlike = require("langsmith/utils/jestlike");
678
+ var import_traceable = require("langsmith/traceable");
679
+ var {
680
+ // @ts-expect-error Shim for older versions of @langchain/openai
681
+ _convertMessagesToOpenAIParams,
682
+ convertMessagesToCompletionsMessageParams
683
+ } = openAIImports;
684
+ function _convertMessagesShim(message) {
685
+ if (typeof _convertMessagesToOpenAIParams === "function") {
686
+ return _convertMessagesToOpenAIParams([
687
+ message
688
+ ])[0];
689
+ }
690
+ return convertMessagesToCompletionsMessageParams({
691
+ messages: [message]
692
+ })[0];
693
+ }
694
+ var _convertToOpenAIMessage = (message) => {
695
+ if ((0, import_messages4.isBaseMessage)(message)) {
696
+ const converted = _convertMessagesShim(message);
697
+ if (message.id && !converted.id) {
698
+ converted.id = message.id;
699
+ }
700
+ return converted;
701
+ } else {
702
+ return message;
703
+ }
704
+ };
705
+ var _normalizeToOpenAIMessagesList = (messages) => {
706
+ let messagesList;
707
+ if (!Array.isArray(messages)) {
708
+ if ("messages" in messages && Array.isArray(messages.messages)) {
709
+ messagesList = messages.messages;
710
+ } else if ("content" in messages && "role" in messages) {
711
+ messagesList = [messages];
712
+ } else {
713
+ throw new Error(`If passing messages as an object, it must contain a "messages" key`);
714
+ }
715
+ } else {
716
+ messagesList = messages;
717
+ }
718
+ return messagesList.map(_convertToOpenAIMessage);
719
+ };
720
+ var processScore = (_, value) => {
721
+ if (typeof value === "object") {
722
+ if (value != null && "score" in value) {
723
+ return [
724
+ value.score,
725
+ "reasoning" in value && typeof value.reasoning === "string" ? value.reasoning : void 0,
726
+ value.metadata,
727
+ value.sourceRunId
728
+ ];
729
+ } else {
730
+ throw new Error(`Expected a dictionary with a "score" key, but got "${JSON.stringify(value, null, 2)}"`);
731
+ }
732
+ }
733
+ return [value];
734
+ };
735
+ async function _runEvaluatorUntyped(runName, scorer, feedbackKey, extra, ls_framework, returnRawOutputs) {
736
+ const runScorer = async (params) => {
737
+ let score = await scorer(params);
738
+ if (returnRawOutputs) {
739
+ return score;
740
+ }
741
+ let reasoning;
742
+ if (!Array.isArray(score) && typeof score === "object") {
743
+ const results = [];
744
+ for (const [key, value] of Object.entries(score)) {
745
+ const [keyScore, reasoning2, metadata, sourceRunId] = processScore(
746
+ key,
747
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
748
+ value
749
+ );
750
+ const result = {
751
+ key,
752
+ score: keyScore,
753
+ comment: reasoning2,
754
+ metadata
755
+ };
756
+ if (sourceRunId !== void 0 && typeof sourceRunId === "string") {
757
+ result.sourceRunId = sourceRunId;
758
+ }
759
+ results.push(result);
760
+ }
761
+ return results;
762
+ } else {
763
+ let metadata;
764
+ if (Array.isArray(score)) {
765
+ metadata = score[2];
766
+ reasoning = score[1];
767
+ score = score[0];
768
+ }
769
+ return {
770
+ key: feedbackKey,
771
+ score,
772
+ comment: reasoning,
773
+ metadata
774
+ };
775
+ }
776
+ };
777
+ if ((0, import_jestlike.isInTestContext)()) {
778
+ const res = await (0, import_jestlike.wrapEvaluator)(runScorer)(extra ?? {}, {
779
+ name: runName,
780
+ metadata: {
781
+ __ls_framework: ls_framework ?? "openevals",
782
+ __ls_evaluator: runName,
783
+ __ls_language: "js"
784
+ }
785
+ });
786
+ if (returnRawOutputs) {
787
+ const rawResults = res;
788
+ return rawResults;
789
+ }
790
+ return res;
791
+ } else {
792
+ const traceableRunScorer = (0, import_traceable.traceable)(runScorer, {
793
+ name: runName,
794
+ metadata: {
795
+ __ls_framework: ls_framework ?? "openevals",
796
+ __ls_evaluator: runName,
797
+ __ls_language: "js"
798
+ }
799
+ });
800
+ const res = await traceableRunScorer(extra ?? {});
801
+ return res;
802
+ }
803
+ }
804
+
805
+ // node_modules/openevals/dist/json/match.js
806
+ var import_traceable3 = require("langsmith/traceable");
807
+
808
+ // node_modules/openevals/dist/llm.js
809
+ var import_runnables = require("@langchain/core/runnables");
810
+ var import_prompts = require("@langchain/core/prompts");
811
+ var import_messages5 = require("@langchain/core/messages");
812
+ var import_json_schema = require("@langchain/core/utils/json_schema");
813
+ var import_universal = require("langchain/chat_models/universal");
814
+ var import_traceable2 = require("langsmith/traceable");
815
+ function _isRunnableInterface(prompt) {
816
+ return import_runnables.Runnable.isRunnable(prompt);
817
+ }
818
+ function _isStructuredPrompt(prompt) {
819
+ return _isRunnableInterface(prompt) && "schema" in prompt && prompt.schema != null;
820
+ }
821
+ function isZodSchema(input) {
822
+ return typeof input?.parse === "function";
823
+ }
824
+ function _isBaseChatModel(x) {
825
+ const model = x;
826
+ return x != null && typeof x === "object" && typeof model._modelType === "function" && model._modelType() === "base_chat_model";
827
+ }
828
+ function appendFewShotExamples({ messages, fewShotExamples }) {
829
+ const lastUserMessageIdx = messages.slice().reverse().findIndex((msg) => msg.role === "user");
830
+ if (lastUserMessageIdx === -1) {
831
+ throw new Error("Appending few-shot examples requires a user message in the provided prompt");
832
+ }
833
+ const actualIdx = messages.length - 1 - lastUserMessageIdx;
834
+ messages[actualIdx].content += "\n\n" + fewShotExamples.map((example) => {
835
+ let exampleStr = `<example>
836
+ <input>${JSON.stringify(example.inputs)}</input>
837
+ <output>${JSON.stringify(example.outputs)}</output>`;
838
+ if (example.reasoning) {
839
+ exampleStr += `
840
+ <reasoning>${example.reasoning}</reasoning>`;
841
+ }
842
+ if (example.score !== void 0) {
843
+ exampleStr += `
844
+ <score>${example.score}</score>`;
845
+ }
846
+ exampleStr += "\n</example>";
847
+ return exampleStr;
848
+ }).join("\n");
849
+ return messages;
850
+ }
851
+ function constructDefaultOutputJsonSchema({ continuous, choices, useReasoning }) {
852
+ const jsonSchema = {
853
+ type: "object",
854
+ additionalProperties: false
855
+ };
856
+ let description;
857
+ let scoreSchema;
858
+ if (choices) {
859
+ description = "A number that represents the degree to which the criteria in the prompt are met.";
860
+ scoreSchema = {
861
+ type: "number",
862
+ description,
863
+ enum: choices
864
+ };
865
+ } else if (continuous) {
866
+ description = "A number that represents the degree to which the criteria in the prompt are met, from 0.0 to 1.0. 1.0 means the criteria are met perfectly. 0.0 means none of the criteria are met, 0.5 means exactly half of the criteria are met.";
867
+ scoreSchema = {
868
+ type: "number",
869
+ description
870
+ };
871
+ } else {
872
+ description = "A score that is true if criteria in the prompt are met, and false otherwise.";
873
+ scoreSchema = {
874
+ type: "boolean",
875
+ description
876
+ };
877
+ }
878
+ if (useReasoning) {
879
+ jsonSchema.properties = {
880
+ reasoning: {
881
+ type: "string",
882
+ description: "A human-readable explanation of the score. You MUST end the reasoning with a sentence that says: Thus, the score should be: SCORE_YOU_ASSIGN."
883
+ },
884
+ score: scoreSchema
885
+ };
886
+ jsonSchema.required = ["reasoning", "score"];
887
+ } else {
888
+ jsonSchema.properties = {
889
+ score: scoreSchema
890
+ };
891
+ jsonSchema.required = ["score"];
892
+ }
893
+ return [jsonSchema, description];
894
+ }
895
+ function _stringifyPromptParam(param) {
896
+ if (typeof param === "string") {
897
+ return param;
898
+ } else if ((0, import_messages5.isBaseMessage)(param)) {
899
+ return JSON.stringify(_convertToOpenAIMessage(param));
900
+ } else if (typeof param === "object" && param !== null) {
901
+ if (Array.isArray(param)) {
902
+ return JSON.stringify(param.map((message) => (0, import_messages5.isBaseMessage)(message) ? _convertToOpenAIMessage(message) : message));
903
+ }
904
+ const objParam = param;
905
+ if ("messages" in objParam && Array.isArray(objParam.messages)) {
906
+ objParam.messages = objParam.messages.map((message) => (0, import_messages5.isBaseMessage)(message) ? _convertToOpenAIMessage(message) : message);
907
+ return JSON.stringify(objParam);
908
+ }
909
+ return JSON.stringify(param);
910
+ }
911
+ return JSON.stringify(param);
912
+ }
913
+ var _createLLMAsJudgeScorer = (params) => {
914
+ const { prompt, system, model, continuous, choices, fewShotExamples } = params;
915
+ let schema;
916
+ if (isZodSchema(params.schema)) {
917
+ schema = (0, import_json_schema.toJsonSchema)(params.schema);
918
+ } else {
919
+ schema = params.schema;
920
+ }
921
+ let judge = params.judge;
922
+ const useReasoning = params.useReasoning ?? true;
923
+ const getScore = async (params2) => {
924
+ const { inputs, outputs, referenceOutputs, ...rest } = params2;
925
+ if (system && typeof prompt !== "string") {
926
+ throw new Error("`system` is only supported when `prompt` is a string template");
927
+ }
928
+ let stringifiedInputs = inputs;
929
+ let stringifiedOutputs = outputs;
930
+ let stringifiedReferenceOutputs = referenceOutputs;
931
+ if (inputs) {
932
+ stringifiedInputs = _stringifyPromptParam(inputs);
933
+ }
934
+ if (outputs) {
935
+ stringifiedOutputs = _stringifyPromptParam(outputs);
936
+ }
937
+ if (referenceOutputs) {
938
+ stringifiedReferenceOutputs = _stringifyPromptParam(referenceOutputs);
939
+ }
940
+ const stringifiedRest = Object.fromEntries(Object.entries(rest).map(([key, value]) => [
941
+ key,
942
+ _stringifyPromptParam(value)
943
+ ]));
944
+ let messages = [];
945
+ const promptParams = {
946
+ inputs: stringifiedInputs,
947
+ outputs: stringifiedOutputs,
948
+ reference_outputs: stringifiedReferenceOutputs,
949
+ ...stringifiedRest
950
+ };
951
+ const filteredPromptParams = Object.fromEntries(Object.entries(promptParams).filter(([_, value]) => value !== void 0));
952
+ if (_isRunnableInterface(prompt)) {
953
+ const formattedPrompt = await prompt.invoke(filteredPromptParams);
954
+ messages = formattedPrompt.messages;
955
+ if (_isStructuredPrompt(prompt)) {
956
+ schema = prompt.schema;
957
+ }
958
+ } else if (typeof prompt === "string") {
959
+ const template = import_prompts.ChatPromptTemplate.fromTemplate(prompt);
960
+ const formattedPrompt = await template.invoke(filteredPromptParams);
961
+ messages = formattedPrompt.messages;
962
+ } else {
963
+ messages = await prompt({
964
+ inputs,
965
+ outputs,
966
+ reference_outputs: referenceOutputs,
967
+ ...rest
968
+ });
969
+ }
970
+ if (system) {
971
+ messages = [{ role: "system", content: system }, ...messages];
972
+ }
973
+ let normalizedMessages = _normalizeToOpenAIMessagesList(messages);
974
+ if (fewShotExamples) {
975
+ normalizedMessages = appendFewShotExamples({
976
+ messages: normalizedMessages,
977
+ fewShotExamples
978
+ });
979
+ }
980
+ const [defaultJsonSchema, description] = constructDefaultOutputJsonSchema({
981
+ continuous,
982
+ choices,
983
+ useReasoning
984
+ });
985
+ if (!judge) {
986
+ if (!model) {
987
+ throw new Error("`model` string is required (e.g. 'openai:o3-mini') when `judge` is not provided");
988
+ }
989
+ judge = await (0, import_universal.initChatModel)(model);
990
+ }
991
+ let response;
992
+ if (_isBaseChatModel(judge)) {
993
+ const judgeWithStructuredOutput = judge.withStructuredOutput(schema ?? {
994
+ title: "score",
995
+ description,
996
+ ...defaultJsonSchema
997
+ });
998
+ response = await judgeWithStructuredOutput.invoke(normalizedMessages);
999
+ if (schema === void 0) {
1000
+ if (useReasoning) {
1001
+ return [response.score, response.reasoning];
1002
+ }
1003
+ return response.score;
1004
+ } else {
1005
+ return response;
1006
+ }
1007
+ } else {
1008
+ if (!model) {
1009
+ throw new Error("`model` string is required (e.g. 'openai:o3-mini') when `judge` is an OpenAI client");
1010
+ }
1011
+ let openaiJsonSchema = schema ?? defaultJsonSchema;
1012
+ if (openaiJsonSchema.name === void 0) {
1013
+ openaiJsonSchema = {
1014
+ name: "score",
1015
+ strict: true,
1016
+ schema: openaiJsonSchema
1017
+ };
1018
+ }
1019
+ if (openaiJsonSchema.schema == null || typeof openaiJsonSchema.schema !== "object") {
1020
+ throw new Error("`ouputSchema` must be JSON schema or OpenAI structured output format when using an OpenAI client directly");
1021
+ }
1022
+ if (!("additionalProperties" in openaiJsonSchema.schema)) {
1023
+ openaiJsonSchema.schema.additionalProperties = false;
1024
+ }
1025
+ const params3 = {
1026
+ messages: normalizedMessages,
1027
+ model: model.startsWith("openai:") ? model.slice("openai:".length) : model,
1028
+ response_format: {
1029
+ type: "json_schema",
1030
+ json_schema: openaiJsonSchema
1031
+ }
1032
+ };
1033
+ const invokeLlm = (0, import_traceable2.traceable)(judge.chat.completions.create.bind(judge.chat.completions), {
1034
+ metadata: {
1035
+ ls_provider: "openai",
1036
+ ls_model_name: model,
1037
+ ls_model_type: "chat"
1038
+ },
1039
+ run_type: "llm",
1040
+ name: "OpenAI Chat Completion"
1041
+ });
1042
+ const response2 = await invokeLlm(params3);
1043
+ const parsed = JSON.parse(response2.choices[0].message.content);
1044
+ if (schema === void 0) {
1045
+ if (useReasoning) {
1046
+ return [parsed.score, parsed.reasoning];
1047
+ }
1048
+ return parsed.score;
1049
+ }
1050
+ return parsed;
1051
+ }
1052
+ };
1053
+ return getScore;
1054
+ };
1055
+ function createLLMAsJudge({ prompt, feedbackKey = "score", model, system, judge, continuous = false, choices, useReasoning = true, fewShotExamples, outputSchema }) {
1056
+ if (outputSchema !== void 0 && _isStructuredPrompt(prompt)) {
1057
+ throw new Error("You may not provide both an `outputSchema` parameter and a LangChain prompt with output schema.");
1058
+ }
1059
+ const scorer = _createLLMAsJudgeScorer({
1060
+ prompt,
1061
+ judge,
1062
+ model,
1063
+ system,
1064
+ continuous,
1065
+ choices,
1066
+ useReasoning,
1067
+ fewShotExamples,
1068
+ schema: outputSchema
1069
+ });
1070
+ const _wrappedEvaluator = async (inputs) => {
1071
+ const runName = feedbackKey !== "score" ? "llm_as_judge" : `llm_as_${feedbackKey}_judge`;
1072
+ return _runEvaluatorUntyped(runName, scorer, feedbackKey, inputs, void 0, outputSchema !== void 0 || _isStructuredPrompt(prompt));
1073
+ };
1074
+ return _wrappedEvaluator;
1075
+ }
1076
+
1077
+ // node_modules/openevals/dist/code/base.js
1078
+ var import_universal2 = require("langchain/chat_models/universal");
1079
+ var import_prompts2 = require("@langchain/core/prompts");
1080
+
1081
+ // node_modules/openevals/dist/simulators/multiturn.js
1082
+ var import_traceable4 = require("langsmith/traceable");
1083
+
1084
+ // node_modules/openevals/dist/simulators/prebuilts.js
1085
+ var import_universal3 = require("langchain/chat_models/universal");
1086
+
1087
+ // src/eval/evaluators/llm-judge.ts
1088
+ var RESPONSE_CRITERIA_PROMPT = `You are an expert evaluator.
1089
+ Assess the following AI response based on the given criteria.
1090
+
1091
+ <Criteria>
1092
+ {criteria}
1093
+ </Criteria>
1094
+
1095
+ <Response>
1096
+ {outputs}
1097
+ </Response>
1098
+
1099
+ Grade whether the response meets the criteria.`;
1100
+ function createLlmJudgeEvaluator(modelConfig, model, criteria) {
1101
+ const resolver = new LangchainModelResolver(modelConfig);
1102
+ const judge = resolver.resolve(model);
1103
+ const llmJudge2 = createLLMAsJudge({
1104
+ prompt: RESPONSE_CRITERIA_PROMPT,
1105
+ feedbackKey: "llm_judge",
1106
+ judge,
1107
+ useReasoning: true
1108
+ });
1109
+ return async ({ outputs }) => {
1110
+ const messages = outputs.messages || [];
1111
+ const lastAiMessage = [...messages].reverse().find((m) => m instanceof import_messages6.AIMessage);
1112
+ if (!lastAiMessage) {
1113
+ return { key: "llm_judge", score: false, comment: "No AI message found in trajectory" };
1114
+ }
1115
+ const responseText = typeof lastAiMessage.content === "string" ? lastAiMessage.content : JSON.stringify(lastAiMessage.content);
1116
+ return llmJudge2({
1117
+ outputs: responseText,
1118
+ criteria
1119
+ });
1120
+ };
1121
+ }
1122
+
1123
+ // src/eval/evaluators/response-content.ts
1124
+ var import_messages7 = require("@langchain/core/messages");
579
1125
  function createResponseContentEvaluator() {
580
1126
  return async ({
581
1127
  outputs,
@@ -587,7 +1133,7 @@ function createResponseContentEvaluator() {
587
1133
  return { key: "response_content", score: true, comment: "No content assertions specified, skipping" };
588
1134
  }
589
1135
  const messages = outputs.messages || [];
590
- const lastAiMessage = [...messages].reverse().find((m) => m instanceof import_messages4.AIMessage);
1136
+ const lastAiMessage = [...messages].reverse().find((m) => m instanceof import_messages7.AIMessage);
591
1137
  if (!lastAiMessage) {
592
1138
  return { key: "response_content", score: false, comment: "No AI message found in trajectory" };
593
1139
  }
@@ -613,7 +1159,7 @@ function createResponseContentEvaluator() {
613
1159
  }
614
1160
 
615
1161
  // src/eval/evaluators/no-tool-calls.ts
616
- var import_messages5 = require("@langchain/core/messages");
1162
+ var import_messages8 = require("@langchain/core/messages");
617
1163
  function createNoToolCallsEvaluator() {
618
1164
  return async ({
619
1165
  outputs,
@@ -624,7 +1170,7 @@ function createNoToolCallsEvaluator() {
624
1170
  }
625
1171
  const messages = outputs.messages || [];
626
1172
  const exceptTools = referenceOutputs?.exceptTools ?? [];
627
- const toolCalls = messages.filter((m) => m instanceof import_messages5.AIMessage).flatMap((m) => m.tool_calls || []);
1173
+ const toolCalls = messages.filter((m) => m instanceof import_messages8.AIMessage).flatMap((m) => m.tool_calls || []);
628
1174
  const disallowedCalls = exceptTools.length > 0 ? toolCalls.filter((tc) => !exceptTools.includes(tc.name)) : toolCalls;
629
1175
  const passed = disallowedCalls.length === 0;
630
1176
  if (exceptTools.length > 0) {
@@ -643,7 +1189,7 @@ function createNoToolCallsEvaluator() {
643
1189
  }
644
1190
 
645
1191
  // src/eval/evaluators/any-tool-called.ts
646
- var import_messages6 = require("@langchain/core/messages");
1192
+ var import_messages9 = require("@langchain/core/messages");
647
1193
  function createAnyToolCalledEvaluator() {
648
1194
  return async ({
649
1195
  outputs,
@@ -654,7 +1200,7 @@ function createAnyToolCalledEvaluator() {
654
1200
  }
655
1201
  const expectedTools = referenceOutputs?.anyToolsExpected ?? [];
656
1202
  const messages = outputs.messages || [];
657
- const calledToolNames = messages.filter((m) => m instanceof import_messages6.AIMessage).flatMap((m) => m.tool_calls || []).map((tc) => tc.name);
1203
+ const calledToolNames = messages.filter((m) => m instanceof import_messages9.AIMessage).flatMap((m) => m.tool_calls || []).map((tc) => tc.name);
658
1204
  if (expectedTools.length === 0) {
659
1205
  const passed2 = calledToolNames.length > 0;
660
1206
  return {
@@ -673,6 +1219,82 @@ function createAnyToolCalledEvaluator() {
673
1219
  };
674
1220
  }
675
1221
 
1222
+ // src/eval/evaluators/tool-input.ts
1223
+ var import_messages10 = require("@langchain/core/messages");
1224
+ function createToolInputEvaluator() {
1225
+ return async ({
1226
+ outputs,
1227
+ referenceOutputs
1228
+ }) => {
1229
+ const expectations = referenceOutputs?.toolInputExpectations ?? [];
1230
+ if (expectations.length === 0) {
1231
+ return { key: "tool_input", score: true, comment: "No tool input expectations specified, skipping" };
1232
+ }
1233
+ const messages = outputs.messages || [];
1234
+ const allToolCalls = messages.filter((m) => m instanceof import_messages10.AIMessage).flatMap((m) => m.tool_calls || []);
1235
+ const results = [];
1236
+ for (const expectation of expectations) {
1237
+ const matchingCalls = allToolCalls.filter((tc) => tc.name === expectation.name);
1238
+ const subChecks = [];
1239
+ let passed = true;
1240
+ if (expectation.times !== void 0) {
1241
+ const countOk = matchingCalls.length >= expectation.times;
1242
+ if (!countOk) {
1243
+ passed = false;
1244
+ subChecks.push(
1245
+ `expected at least ${expectation.times} call(s), got ${matchingCalls.length}`
1246
+ );
1247
+ } else {
1248
+ subChecks.push(
1249
+ `call count OK (${matchingCalls.length} >= ${expectation.times})`
1250
+ );
1251
+ }
1252
+ }
1253
+ if (expectation.validate) {
1254
+ if (matchingCalls.length === 0) {
1255
+ passed = false;
1256
+ subChecks.push("was never called");
1257
+ } else {
1258
+ const anyValid = matchingCalls.some((tc) => {
1259
+ try {
1260
+ return expectation.validate(tc.args);
1261
+ } catch {
1262
+ return false;
1263
+ }
1264
+ });
1265
+ if (!anyValid) {
1266
+ passed = false;
1267
+ subChecks.push(
1268
+ `input validation failed for all ${matchingCalls.length} call(s)`
1269
+ );
1270
+ } else {
1271
+ subChecks.push("input validation passed");
1272
+ }
1273
+ }
1274
+ }
1275
+ if (expectation.times === void 0 && !expectation.validate) {
1276
+ if (matchingCalls.length === 0) {
1277
+ passed = false;
1278
+ subChecks.push("was never called");
1279
+ } else {
1280
+ subChecks.push(`called ${matchingCalls.length} time(s)`);
1281
+ }
1282
+ }
1283
+ results.push({
1284
+ name: expectation.name,
1285
+ passed,
1286
+ comment: `"${expectation.name}": ${subChecks.join(", ")}`
1287
+ });
1288
+ }
1289
+ const allPassed = results.every((r) => r.passed);
1290
+ return {
1291
+ key: "tool_input",
1292
+ score: allPassed,
1293
+ comment: results.map((r) => r.comment).join("; ")
1294
+ };
1295
+ };
1296
+ }
1297
+
676
1298
  // src/eval/expectations.ts
677
1299
  function withTrajectoryGuard(evaluator, key) {
678
1300
  return async ({ outputs, referenceOutputs }) => {
@@ -699,27 +1321,48 @@ function buildTrajectory(message, toolNames) {
699
1321
  return trajectory;
700
1322
  }
701
1323
  function toolsCalled(tools) {
702
- return (ctx) => ({
703
- evaluator: ls2.wrapEvaluator(
1324
+ const toolNames = tools.map((t) => typeof t === "string" ? t : t.name);
1325
+ const validators = tools.filter((t) => typeof t !== "string");
1326
+ return (ctx) => {
1327
+ const trajectoryEvaluator = ls2.wrapEvaluator(
704
1328
  withTrajectoryGuard(
705
1329
  (0, import_agentevals.createTrajectoryMatchEvaluator)({ trajectoryMatchMode: "superset", toolArgsMatchMode: "ignore" }),
706
1330
  "trajectory_match"
707
1331
  )
708
- ),
709
- referenceOutputs: { referenceTrajectory: buildTrajectory(ctx.message, tools) }
710
- });
1332
+ );
1333
+ if (validators.length === 0) {
1334
+ return {
1335
+ evaluator: trajectoryEvaluator,
1336
+ referenceOutputs: { referenceTrajectory: buildTrajectory(ctx.message, toolNames) }
1337
+ };
1338
+ }
1339
+ const inputEvaluator = ls2.wrapEvaluator(createToolInputEvaluator());
1340
+ const composedEvaluator = async ({ outputs, referenceOutputs }) => {
1341
+ const trajectoryResult = await trajectoryEvaluator({ outputs, referenceOutputs });
1342
+ const inputResult = await inputEvaluator({
1343
+ outputs,
1344
+ referenceOutputs: { ...referenceOutputs, toolInputExpectations: validators }
1345
+ });
1346
+ const trajectoryPassed = Boolean(trajectoryResult.score);
1347
+ const inputPassed = Boolean(inputResult.score);
1348
+ return {
1349
+ key: "tools_called",
1350
+ score: trajectoryPassed && inputPassed,
1351
+ comment: [trajectoryResult.comment, inputResult.comment].filter(Boolean).join("; ")
1352
+ };
1353
+ };
1354
+ return {
1355
+ evaluator: composedEvaluator,
1356
+ referenceOutputs: { referenceTrajectory: buildTrajectory(ctx.message, toolNames) }
1357
+ };
1358
+ };
711
1359
  }
712
- function llmJudge() {
1360
+ function llmJudge(criteria) {
713
1361
  return () => {
714
1362
  const config = getEvalConfig();
715
1363
  const model = config.evaluatorModel;
716
1364
  return {
717
- evaluator: ls2.wrapEvaluator(
718
- withTrajectoryGuard(
719
- (0, import_agentevals.createTrajectoryLLMAsJudge)({ prompt: import_agentevals.TRAJECTORY_ACCURACY_PROMPT, model }),
720
- "trajectory_llm_judge"
721
- )
722
- ),
1365
+ evaluator: ls2.wrapEvaluator(createLlmJudgeEvaluator(config.modelConfig, model, criteria)),
723
1366
  referenceOutputs: {}
724
1367
  };
725
1368
  };