@dvina/agents 0.14.0 → 0.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/eval/index.d.mts +58 -13
- package/dist/eval/index.d.ts +58 -13
- package/dist/eval/index.js +672 -29
- package/dist/eval/index.js.map +1 -1
- package/dist/eval/index.mjs +673 -30
- package/dist/eval/index.mjs.map +1 -1
- package/dist/index.d.mts +2 -2
- package/dist/index.d.ts +2 -2
- package/dist/index.js +75 -0
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +75 -0
- package/dist/index.mjs.map +1 -1
- package/dist/{model-resolver-DjKRXKtu.d.mts → model-resolver-DSJRvrqA.d.mts} +2 -5
- package/dist/{model-resolver-DjKRXKtu.d.ts → model-resolver-DSJRvrqA.d.ts} +2 -5
- package/package.json +1 -1
package/dist/eval/index.js
CHANGED
|
@@ -251,9 +251,7 @@ function convertToLangchainMessages(messages) {
|
|
|
251
251
|
var MAX_AGENT_LOOPS = 10;
|
|
252
252
|
function stripReasoningBlocks(message) {
|
|
253
253
|
if (!Array.isArray(message.content)) return message;
|
|
254
|
-
const filtered = message.content.filter(
|
|
255
|
-
(block) => block.type !== "reasoning" && block.type !== "thinking"
|
|
256
|
-
);
|
|
254
|
+
const filtered = message.content.filter((block) => block.type !== "reasoning" && block.type !== "thinking");
|
|
257
255
|
const newContent = filtered.length > 0 ? filtered : "";
|
|
258
256
|
return new import_messages.AIMessage({
|
|
259
257
|
content: newContent,
|
|
@@ -299,6 +297,10 @@ function createEvalTarget(modelConfig, modelString) {
|
|
|
299
297
|
messages.push(new import_messages.SystemMessage(inputs.systemPrompt));
|
|
300
298
|
}
|
|
301
299
|
messages.push(...convertToLangchainMessages(inputs.messages));
|
|
300
|
+
const stopTools = inputs.executionMode?.type === "stop-after-tool" ? inputs.executionMode.tools : [];
|
|
301
|
+
const stopCount = inputs.executionMode?.type === "stop-after-tool" ? inputs.executionMode.count ?? 1 : 1;
|
|
302
|
+
const singleTurn = inputs.executionMode?.type === "single-turn";
|
|
303
|
+
let cumulativeHits = 0;
|
|
302
304
|
let loopCount = 0;
|
|
303
305
|
while (loopCount < MAX_AGENT_LOOPS) {
|
|
304
306
|
loopCount++;
|
|
@@ -308,6 +310,7 @@ function createEvalTarget(modelConfig, modelString) {
|
|
|
308
310
|
if (!aiMessage.tool_calls || aiMessage.tool_calls.length === 0) {
|
|
309
311
|
break;
|
|
310
312
|
}
|
|
313
|
+
let shouldStop = false;
|
|
311
314
|
for (const tc of aiMessage.tool_calls) {
|
|
312
315
|
const mockTool = langchainTools.find((t) => t.name === tc.name);
|
|
313
316
|
if (mockTool) {
|
|
@@ -328,6 +331,15 @@ function createEvalTarget(modelConfig, modelString) {
|
|
|
328
331
|
})
|
|
329
332
|
);
|
|
330
333
|
}
|
|
334
|
+
if (stopTools.includes(tc.name)) {
|
|
335
|
+
cumulativeHits++;
|
|
336
|
+
if (cumulativeHits >= stopCount) {
|
|
337
|
+
shouldStop = true;
|
|
338
|
+
}
|
|
339
|
+
}
|
|
340
|
+
}
|
|
341
|
+
if (singleTurn || shouldStop) {
|
|
342
|
+
break;
|
|
331
343
|
}
|
|
332
344
|
}
|
|
333
345
|
return { messages };
|
|
@@ -403,14 +415,94 @@ function toolDefsToDefinitions(defs) {
|
|
|
403
415
|
};
|
|
404
416
|
});
|
|
405
417
|
}
|
|
406
|
-
|
|
418
|
+
function wrapToolDefsForExecution(defs, tracker, abortController, executionMode) {
|
|
419
|
+
const stopTools = executionMode.type === "stop-after-tool" ? executionMode.tools : [];
|
|
420
|
+
const stopCount = executionMode.type === "stop-after-tool" ? executionMode.count ?? 1 : 1;
|
|
421
|
+
let cumulativeHits = 0;
|
|
422
|
+
return defs.map((def) => ({
|
|
423
|
+
...def,
|
|
424
|
+
exec: async (input) => {
|
|
425
|
+
const result = await def.exec(input);
|
|
426
|
+
const output = typeof result === "string" ? result : JSON.stringify(result);
|
|
427
|
+
tracker.push({
|
|
428
|
+
name: def.name,
|
|
429
|
+
input,
|
|
430
|
+
output,
|
|
431
|
+
toolCallId: `eval_tc_${tracker.length}`
|
|
432
|
+
});
|
|
433
|
+
if (executionMode.type === "single-turn") {
|
|
434
|
+
abortController.abort();
|
|
435
|
+
} else if (stopTools.includes(def.name)) {
|
|
436
|
+
cumulativeHits++;
|
|
437
|
+
if (cumulativeHits >= stopCount) {
|
|
438
|
+
abortController.abort();
|
|
439
|
+
}
|
|
440
|
+
}
|
|
441
|
+
return result;
|
|
442
|
+
}
|
|
443
|
+
}));
|
|
444
|
+
}
|
|
445
|
+
function buildTrajectoryFromTrackedCalls(inputMessages, trackedCalls) {
|
|
446
|
+
const messages = convertToLangchainMessages(inputMessages);
|
|
447
|
+
if (trackedCalls.length > 0) {
|
|
448
|
+
messages.push(
|
|
449
|
+
new import_messages.AIMessage({
|
|
450
|
+
content: "",
|
|
451
|
+
tool_calls: trackedCalls.map((tc) => ({
|
|
452
|
+
id: tc.toolCallId,
|
|
453
|
+
name: tc.name,
|
|
454
|
+
args: tc.input
|
|
455
|
+
}))
|
|
456
|
+
})
|
|
457
|
+
);
|
|
458
|
+
for (const tc of trackedCalls) {
|
|
459
|
+
messages.push(
|
|
460
|
+
new import_messages.ToolMessage({
|
|
461
|
+
content: tc.output,
|
|
462
|
+
tool_call_id: tc.toolCallId,
|
|
463
|
+
name: tc.name
|
|
464
|
+
})
|
|
465
|
+
);
|
|
466
|
+
}
|
|
467
|
+
}
|
|
468
|
+
return messages;
|
|
469
|
+
}
|
|
470
|
+
async function runAgentTarget(createTarget, model, evalMessages, extraToolDefs, executionMode) {
|
|
407
471
|
const extraTools = Object.keys(extraToolDefs).length > 0 ? toolDefsToDefinitions(extraToolDefs) : [];
|
|
408
|
-
const
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
}
|
|
413
|
-
|
|
472
|
+
const tracker = [];
|
|
473
|
+
let abortController;
|
|
474
|
+
if (executionMode) {
|
|
475
|
+
abortController = new AbortController();
|
|
476
|
+
}
|
|
477
|
+
const wrapTools = executionMode && abortController ? (tools) => wrapToolDefsForExecution(tools, tracker, abortController, executionMode) : void 0;
|
|
478
|
+
const agent = await createTarget(model, extraTools, wrapTools);
|
|
479
|
+
const signal = abortController?.signal;
|
|
480
|
+
try {
|
|
481
|
+
const agentPromise = agent.run({
|
|
482
|
+
threadId: `eval_${Date.now()}_${Math.random().toString(36).slice(2)}`,
|
|
483
|
+
messages: evalMessages,
|
|
484
|
+
signal
|
|
485
|
+
});
|
|
486
|
+
if (abortController) {
|
|
487
|
+
const abortPromise = new Promise((_, reject) => {
|
|
488
|
+
const onAbort = () => reject(new DOMException("Eval execution aborted", "AbortError"));
|
|
489
|
+
if (signal.aborted) {
|
|
490
|
+
onAbort();
|
|
491
|
+
return;
|
|
492
|
+
}
|
|
493
|
+
signal.addEventListener("abort", onAbort, { once: true });
|
|
494
|
+
});
|
|
495
|
+
const result2 = await Promise.race([agentPromise, abortPromise]);
|
|
496
|
+
return { messages: agentResultToMessages(evalMessages, result2) };
|
|
497
|
+
}
|
|
498
|
+
const result = await agentPromise;
|
|
499
|
+
return { messages: agentResultToMessages(evalMessages, result) };
|
|
500
|
+
} catch (error) {
|
|
501
|
+
if (error.name === "AbortError" || signal?.aborted) {
|
|
502
|
+
return { messages: buildTrajectoryFromTrackedCalls(evalMessages, tracker) };
|
|
503
|
+
}
|
|
504
|
+
throw error;
|
|
505
|
+
}
|
|
414
506
|
}
|
|
415
507
|
|
|
416
508
|
// src/eval/suite.ts
|
|
@@ -504,7 +596,8 @@ function runEvals() {
|
|
|
504
596
|
createTarget,
|
|
505
597
|
currentModel,
|
|
506
598
|
preparedMessages,
|
|
507
|
-
caseToolDefs
|
|
599
|
+
caseToolDefs,
|
|
600
|
+
tc.executionMode
|
|
508
601
|
);
|
|
509
602
|
} else {
|
|
510
603
|
const target = resolveModelTarget(config, currentModel);
|
|
@@ -513,6 +606,7 @@ function runEvals() {
|
|
|
513
606
|
output = await target({
|
|
514
607
|
messages: preparedMessages,
|
|
515
608
|
tools,
|
|
609
|
+
executionMode: tc.executionMode,
|
|
516
610
|
...systemPrompt ? { systemPrompt } : {}
|
|
517
611
|
});
|
|
518
612
|
}
|
|
@@ -574,8 +668,460 @@ function createLanguageEvaluator(modelConfig, model) {
|
|
|
574
668
|
};
|
|
575
669
|
}
|
|
576
670
|
|
|
577
|
-
// src/eval/evaluators/
|
|
671
|
+
// src/eval/evaluators/llm-judge.ts
|
|
672
|
+
var import_messages6 = require("@langchain/core/messages");
|
|
673
|
+
|
|
674
|
+
// node_modules/openevals/dist/utils.js
|
|
578
675
|
var import_messages4 = require("@langchain/core/messages");
|
|
676
|
+
var openAIImports = __toESM(require("@langchain/openai"), 1);
|
|
677
|
+
var import_jestlike = require("langsmith/utils/jestlike");
|
|
678
|
+
var import_traceable = require("langsmith/traceable");
|
|
679
|
+
var {
|
|
680
|
+
// @ts-expect-error Shim for older versions of @langchain/openai
|
|
681
|
+
_convertMessagesToOpenAIParams,
|
|
682
|
+
convertMessagesToCompletionsMessageParams
|
|
683
|
+
} = openAIImports;
|
|
684
|
+
function _convertMessagesShim(message) {
|
|
685
|
+
if (typeof _convertMessagesToOpenAIParams === "function") {
|
|
686
|
+
return _convertMessagesToOpenAIParams([
|
|
687
|
+
message
|
|
688
|
+
])[0];
|
|
689
|
+
}
|
|
690
|
+
return convertMessagesToCompletionsMessageParams({
|
|
691
|
+
messages: [message]
|
|
692
|
+
})[0];
|
|
693
|
+
}
|
|
694
|
+
var _convertToOpenAIMessage = (message) => {
|
|
695
|
+
if ((0, import_messages4.isBaseMessage)(message)) {
|
|
696
|
+
const converted = _convertMessagesShim(message);
|
|
697
|
+
if (message.id && !converted.id) {
|
|
698
|
+
converted.id = message.id;
|
|
699
|
+
}
|
|
700
|
+
return converted;
|
|
701
|
+
} else {
|
|
702
|
+
return message;
|
|
703
|
+
}
|
|
704
|
+
};
|
|
705
|
+
var _normalizeToOpenAIMessagesList = (messages) => {
|
|
706
|
+
let messagesList;
|
|
707
|
+
if (!Array.isArray(messages)) {
|
|
708
|
+
if ("messages" in messages && Array.isArray(messages.messages)) {
|
|
709
|
+
messagesList = messages.messages;
|
|
710
|
+
} else if ("content" in messages && "role" in messages) {
|
|
711
|
+
messagesList = [messages];
|
|
712
|
+
} else {
|
|
713
|
+
throw new Error(`If passing messages as an object, it must contain a "messages" key`);
|
|
714
|
+
}
|
|
715
|
+
} else {
|
|
716
|
+
messagesList = messages;
|
|
717
|
+
}
|
|
718
|
+
return messagesList.map(_convertToOpenAIMessage);
|
|
719
|
+
};
|
|
720
|
+
var processScore = (_, value) => {
|
|
721
|
+
if (typeof value === "object") {
|
|
722
|
+
if (value != null && "score" in value) {
|
|
723
|
+
return [
|
|
724
|
+
value.score,
|
|
725
|
+
"reasoning" in value && typeof value.reasoning === "string" ? value.reasoning : void 0,
|
|
726
|
+
value.metadata,
|
|
727
|
+
value.sourceRunId
|
|
728
|
+
];
|
|
729
|
+
} else {
|
|
730
|
+
throw new Error(`Expected a dictionary with a "score" key, but got "${JSON.stringify(value, null, 2)}"`);
|
|
731
|
+
}
|
|
732
|
+
}
|
|
733
|
+
return [value];
|
|
734
|
+
};
|
|
735
|
+
async function _runEvaluatorUntyped(runName, scorer, feedbackKey, extra, ls_framework, returnRawOutputs) {
|
|
736
|
+
const runScorer = async (params) => {
|
|
737
|
+
let score = await scorer(params);
|
|
738
|
+
if (returnRawOutputs) {
|
|
739
|
+
return score;
|
|
740
|
+
}
|
|
741
|
+
let reasoning;
|
|
742
|
+
if (!Array.isArray(score) && typeof score === "object") {
|
|
743
|
+
const results = [];
|
|
744
|
+
for (const [key, value] of Object.entries(score)) {
|
|
745
|
+
const [keyScore, reasoning2, metadata, sourceRunId] = processScore(
|
|
746
|
+
key,
|
|
747
|
+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
748
|
+
value
|
|
749
|
+
);
|
|
750
|
+
const result = {
|
|
751
|
+
key,
|
|
752
|
+
score: keyScore,
|
|
753
|
+
comment: reasoning2,
|
|
754
|
+
metadata
|
|
755
|
+
};
|
|
756
|
+
if (sourceRunId !== void 0 && typeof sourceRunId === "string") {
|
|
757
|
+
result.sourceRunId = sourceRunId;
|
|
758
|
+
}
|
|
759
|
+
results.push(result);
|
|
760
|
+
}
|
|
761
|
+
return results;
|
|
762
|
+
} else {
|
|
763
|
+
let metadata;
|
|
764
|
+
if (Array.isArray(score)) {
|
|
765
|
+
metadata = score[2];
|
|
766
|
+
reasoning = score[1];
|
|
767
|
+
score = score[0];
|
|
768
|
+
}
|
|
769
|
+
return {
|
|
770
|
+
key: feedbackKey,
|
|
771
|
+
score,
|
|
772
|
+
comment: reasoning,
|
|
773
|
+
metadata
|
|
774
|
+
};
|
|
775
|
+
}
|
|
776
|
+
};
|
|
777
|
+
if ((0, import_jestlike.isInTestContext)()) {
|
|
778
|
+
const res = await (0, import_jestlike.wrapEvaluator)(runScorer)(extra ?? {}, {
|
|
779
|
+
name: runName,
|
|
780
|
+
metadata: {
|
|
781
|
+
__ls_framework: ls_framework ?? "openevals",
|
|
782
|
+
__ls_evaluator: runName,
|
|
783
|
+
__ls_language: "js"
|
|
784
|
+
}
|
|
785
|
+
});
|
|
786
|
+
if (returnRawOutputs) {
|
|
787
|
+
const rawResults = res;
|
|
788
|
+
return rawResults;
|
|
789
|
+
}
|
|
790
|
+
return res;
|
|
791
|
+
} else {
|
|
792
|
+
const traceableRunScorer = (0, import_traceable.traceable)(runScorer, {
|
|
793
|
+
name: runName,
|
|
794
|
+
metadata: {
|
|
795
|
+
__ls_framework: ls_framework ?? "openevals",
|
|
796
|
+
__ls_evaluator: runName,
|
|
797
|
+
__ls_language: "js"
|
|
798
|
+
}
|
|
799
|
+
});
|
|
800
|
+
const res = await traceableRunScorer(extra ?? {});
|
|
801
|
+
return res;
|
|
802
|
+
}
|
|
803
|
+
}
|
|
804
|
+
|
|
805
|
+
// node_modules/openevals/dist/json/match.js
|
|
806
|
+
var import_traceable3 = require("langsmith/traceable");
|
|
807
|
+
|
|
808
|
+
// node_modules/openevals/dist/llm.js
|
|
809
|
+
var import_runnables = require("@langchain/core/runnables");
|
|
810
|
+
var import_prompts = require("@langchain/core/prompts");
|
|
811
|
+
var import_messages5 = require("@langchain/core/messages");
|
|
812
|
+
var import_json_schema = require("@langchain/core/utils/json_schema");
|
|
813
|
+
var import_universal = require("langchain/chat_models/universal");
|
|
814
|
+
var import_traceable2 = require("langsmith/traceable");
|
|
815
|
+
function _isRunnableInterface(prompt) {
|
|
816
|
+
return import_runnables.Runnable.isRunnable(prompt);
|
|
817
|
+
}
|
|
818
|
+
function _isStructuredPrompt(prompt) {
|
|
819
|
+
return _isRunnableInterface(prompt) && "schema" in prompt && prompt.schema != null;
|
|
820
|
+
}
|
|
821
|
+
function isZodSchema(input) {
|
|
822
|
+
return typeof input?.parse === "function";
|
|
823
|
+
}
|
|
824
|
+
function _isBaseChatModel(x) {
|
|
825
|
+
const model = x;
|
|
826
|
+
return x != null && typeof x === "object" && typeof model._modelType === "function" && model._modelType() === "base_chat_model";
|
|
827
|
+
}
|
|
828
|
+
function appendFewShotExamples({ messages, fewShotExamples }) {
|
|
829
|
+
const lastUserMessageIdx = messages.slice().reverse().findIndex((msg) => msg.role === "user");
|
|
830
|
+
if (lastUserMessageIdx === -1) {
|
|
831
|
+
throw new Error("Appending few-shot examples requires a user message in the provided prompt");
|
|
832
|
+
}
|
|
833
|
+
const actualIdx = messages.length - 1 - lastUserMessageIdx;
|
|
834
|
+
messages[actualIdx].content += "\n\n" + fewShotExamples.map((example) => {
|
|
835
|
+
let exampleStr = `<example>
|
|
836
|
+
<input>${JSON.stringify(example.inputs)}</input>
|
|
837
|
+
<output>${JSON.stringify(example.outputs)}</output>`;
|
|
838
|
+
if (example.reasoning) {
|
|
839
|
+
exampleStr += `
|
|
840
|
+
<reasoning>${example.reasoning}</reasoning>`;
|
|
841
|
+
}
|
|
842
|
+
if (example.score !== void 0) {
|
|
843
|
+
exampleStr += `
|
|
844
|
+
<score>${example.score}</score>`;
|
|
845
|
+
}
|
|
846
|
+
exampleStr += "\n</example>";
|
|
847
|
+
return exampleStr;
|
|
848
|
+
}).join("\n");
|
|
849
|
+
return messages;
|
|
850
|
+
}
|
|
851
|
+
function constructDefaultOutputJsonSchema({ continuous, choices, useReasoning }) {
|
|
852
|
+
const jsonSchema = {
|
|
853
|
+
type: "object",
|
|
854
|
+
additionalProperties: false
|
|
855
|
+
};
|
|
856
|
+
let description;
|
|
857
|
+
let scoreSchema;
|
|
858
|
+
if (choices) {
|
|
859
|
+
description = "A number that represents the degree to which the criteria in the prompt are met.";
|
|
860
|
+
scoreSchema = {
|
|
861
|
+
type: "number",
|
|
862
|
+
description,
|
|
863
|
+
enum: choices
|
|
864
|
+
};
|
|
865
|
+
} else if (continuous) {
|
|
866
|
+
description = "A number that represents the degree to which the criteria in the prompt are met, from 0.0 to 1.0. 1.0 means the criteria are met perfectly. 0.0 means none of the criteria are met, 0.5 means exactly half of the criteria are met.";
|
|
867
|
+
scoreSchema = {
|
|
868
|
+
type: "number",
|
|
869
|
+
description
|
|
870
|
+
};
|
|
871
|
+
} else {
|
|
872
|
+
description = "A score that is true if criteria in the prompt are met, and false otherwise.";
|
|
873
|
+
scoreSchema = {
|
|
874
|
+
type: "boolean",
|
|
875
|
+
description
|
|
876
|
+
};
|
|
877
|
+
}
|
|
878
|
+
if (useReasoning) {
|
|
879
|
+
jsonSchema.properties = {
|
|
880
|
+
reasoning: {
|
|
881
|
+
type: "string",
|
|
882
|
+
description: "A human-readable explanation of the score. You MUST end the reasoning with a sentence that says: Thus, the score should be: SCORE_YOU_ASSIGN."
|
|
883
|
+
},
|
|
884
|
+
score: scoreSchema
|
|
885
|
+
};
|
|
886
|
+
jsonSchema.required = ["reasoning", "score"];
|
|
887
|
+
} else {
|
|
888
|
+
jsonSchema.properties = {
|
|
889
|
+
score: scoreSchema
|
|
890
|
+
};
|
|
891
|
+
jsonSchema.required = ["score"];
|
|
892
|
+
}
|
|
893
|
+
return [jsonSchema, description];
|
|
894
|
+
}
|
|
895
|
+
function _stringifyPromptParam(param) {
|
|
896
|
+
if (typeof param === "string") {
|
|
897
|
+
return param;
|
|
898
|
+
} else if ((0, import_messages5.isBaseMessage)(param)) {
|
|
899
|
+
return JSON.stringify(_convertToOpenAIMessage(param));
|
|
900
|
+
} else if (typeof param === "object" && param !== null) {
|
|
901
|
+
if (Array.isArray(param)) {
|
|
902
|
+
return JSON.stringify(param.map((message) => (0, import_messages5.isBaseMessage)(message) ? _convertToOpenAIMessage(message) : message));
|
|
903
|
+
}
|
|
904
|
+
const objParam = param;
|
|
905
|
+
if ("messages" in objParam && Array.isArray(objParam.messages)) {
|
|
906
|
+
objParam.messages = objParam.messages.map((message) => (0, import_messages5.isBaseMessage)(message) ? _convertToOpenAIMessage(message) : message);
|
|
907
|
+
return JSON.stringify(objParam);
|
|
908
|
+
}
|
|
909
|
+
return JSON.stringify(param);
|
|
910
|
+
}
|
|
911
|
+
return JSON.stringify(param);
|
|
912
|
+
}
|
|
913
|
+
var _createLLMAsJudgeScorer = (params) => {
|
|
914
|
+
const { prompt, system, model, continuous, choices, fewShotExamples } = params;
|
|
915
|
+
let schema;
|
|
916
|
+
if (isZodSchema(params.schema)) {
|
|
917
|
+
schema = (0, import_json_schema.toJsonSchema)(params.schema);
|
|
918
|
+
} else {
|
|
919
|
+
schema = params.schema;
|
|
920
|
+
}
|
|
921
|
+
let judge = params.judge;
|
|
922
|
+
const useReasoning = params.useReasoning ?? true;
|
|
923
|
+
const getScore = async (params2) => {
|
|
924
|
+
const { inputs, outputs, referenceOutputs, ...rest } = params2;
|
|
925
|
+
if (system && typeof prompt !== "string") {
|
|
926
|
+
throw new Error("`system` is only supported when `prompt` is a string template");
|
|
927
|
+
}
|
|
928
|
+
let stringifiedInputs = inputs;
|
|
929
|
+
let stringifiedOutputs = outputs;
|
|
930
|
+
let stringifiedReferenceOutputs = referenceOutputs;
|
|
931
|
+
if (inputs) {
|
|
932
|
+
stringifiedInputs = _stringifyPromptParam(inputs);
|
|
933
|
+
}
|
|
934
|
+
if (outputs) {
|
|
935
|
+
stringifiedOutputs = _stringifyPromptParam(outputs);
|
|
936
|
+
}
|
|
937
|
+
if (referenceOutputs) {
|
|
938
|
+
stringifiedReferenceOutputs = _stringifyPromptParam(referenceOutputs);
|
|
939
|
+
}
|
|
940
|
+
const stringifiedRest = Object.fromEntries(Object.entries(rest).map(([key, value]) => [
|
|
941
|
+
key,
|
|
942
|
+
_stringifyPromptParam(value)
|
|
943
|
+
]));
|
|
944
|
+
let messages = [];
|
|
945
|
+
const promptParams = {
|
|
946
|
+
inputs: stringifiedInputs,
|
|
947
|
+
outputs: stringifiedOutputs,
|
|
948
|
+
reference_outputs: stringifiedReferenceOutputs,
|
|
949
|
+
...stringifiedRest
|
|
950
|
+
};
|
|
951
|
+
const filteredPromptParams = Object.fromEntries(Object.entries(promptParams).filter(([_, value]) => value !== void 0));
|
|
952
|
+
if (_isRunnableInterface(prompt)) {
|
|
953
|
+
const formattedPrompt = await prompt.invoke(filteredPromptParams);
|
|
954
|
+
messages = formattedPrompt.messages;
|
|
955
|
+
if (_isStructuredPrompt(prompt)) {
|
|
956
|
+
schema = prompt.schema;
|
|
957
|
+
}
|
|
958
|
+
} else if (typeof prompt === "string") {
|
|
959
|
+
const template = import_prompts.ChatPromptTemplate.fromTemplate(prompt);
|
|
960
|
+
const formattedPrompt = await template.invoke(filteredPromptParams);
|
|
961
|
+
messages = formattedPrompt.messages;
|
|
962
|
+
} else {
|
|
963
|
+
messages = await prompt({
|
|
964
|
+
inputs,
|
|
965
|
+
outputs,
|
|
966
|
+
reference_outputs: referenceOutputs,
|
|
967
|
+
...rest
|
|
968
|
+
});
|
|
969
|
+
}
|
|
970
|
+
if (system) {
|
|
971
|
+
messages = [{ role: "system", content: system }, ...messages];
|
|
972
|
+
}
|
|
973
|
+
let normalizedMessages = _normalizeToOpenAIMessagesList(messages);
|
|
974
|
+
if (fewShotExamples) {
|
|
975
|
+
normalizedMessages = appendFewShotExamples({
|
|
976
|
+
messages: normalizedMessages,
|
|
977
|
+
fewShotExamples
|
|
978
|
+
});
|
|
979
|
+
}
|
|
980
|
+
const [defaultJsonSchema, description] = constructDefaultOutputJsonSchema({
|
|
981
|
+
continuous,
|
|
982
|
+
choices,
|
|
983
|
+
useReasoning
|
|
984
|
+
});
|
|
985
|
+
if (!judge) {
|
|
986
|
+
if (!model) {
|
|
987
|
+
throw new Error("`model` string is required (e.g. 'openai:o3-mini') when `judge` is not provided");
|
|
988
|
+
}
|
|
989
|
+
judge = await (0, import_universal.initChatModel)(model);
|
|
990
|
+
}
|
|
991
|
+
let response;
|
|
992
|
+
if (_isBaseChatModel(judge)) {
|
|
993
|
+
const judgeWithStructuredOutput = judge.withStructuredOutput(schema ?? {
|
|
994
|
+
title: "score",
|
|
995
|
+
description,
|
|
996
|
+
...defaultJsonSchema
|
|
997
|
+
});
|
|
998
|
+
response = await judgeWithStructuredOutput.invoke(normalizedMessages);
|
|
999
|
+
if (schema === void 0) {
|
|
1000
|
+
if (useReasoning) {
|
|
1001
|
+
return [response.score, response.reasoning];
|
|
1002
|
+
}
|
|
1003
|
+
return response.score;
|
|
1004
|
+
} else {
|
|
1005
|
+
return response;
|
|
1006
|
+
}
|
|
1007
|
+
} else {
|
|
1008
|
+
if (!model) {
|
|
1009
|
+
throw new Error("`model` string is required (e.g. 'openai:o3-mini') when `judge` is an OpenAI client");
|
|
1010
|
+
}
|
|
1011
|
+
let openaiJsonSchema = schema ?? defaultJsonSchema;
|
|
1012
|
+
if (openaiJsonSchema.name === void 0) {
|
|
1013
|
+
openaiJsonSchema = {
|
|
1014
|
+
name: "score",
|
|
1015
|
+
strict: true,
|
|
1016
|
+
schema: openaiJsonSchema
|
|
1017
|
+
};
|
|
1018
|
+
}
|
|
1019
|
+
if (openaiJsonSchema.schema == null || typeof openaiJsonSchema.schema !== "object") {
|
|
1020
|
+
throw new Error("`ouputSchema` must be JSON schema or OpenAI structured output format when using an OpenAI client directly");
|
|
1021
|
+
}
|
|
1022
|
+
if (!("additionalProperties" in openaiJsonSchema.schema)) {
|
|
1023
|
+
openaiJsonSchema.schema.additionalProperties = false;
|
|
1024
|
+
}
|
|
1025
|
+
const params3 = {
|
|
1026
|
+
messages: normalizedMessages,
|
|
1027
|
+
model: model.startsWith("openai:") ? model.slice("openai:".length) : model,
|
|
1028
|
+
response_format: {
|
|
1029
|
+
type: "json_schema",
|
|
1030
|
+
json_schema: openaiJsonSchema
|
|
1031
|
+
}
|
|
1032
|
+
};
|
|
1033
|
+
const invokeLlm = (0, import_traceable2.traceable)(judge.chat.completions.create.bind(judge.chat.completions), {
|
|
1034
|
+
metadata: {
|
|
1035
|
+
ls_provider: "openai",
|
|
1036
|
+
ls_model_name: model,
|
|
1037
|
+
ls_model_type: "chat"
|
|
1038
|
+
},
|
|
1039
|
+
run_type: "llm",
|
|
1040
|
+
name: "OpenAI Chat Completion"
|
|
1041
|
+
});
|
|
1042
|
+
const response2 = await invokeLlm(params3);
|
|
1043
|
+
const parsed = JSON.parse(response2.choices[0].message.content);
|
|
1044
|
+
if (schema === void 0) {
|
|
1045
|
+
if (useReasoning) {
|
|
1046
|
+
return [parsed.score, parsed.reasoning];
|
|
1047
|
+
}
|
|
1048
|
+
return parsed.score;
|
|
1049
|
+
}
|
|
1050
|
+
return parsed;
|
|
1051
|
+
}
|
|
1052
|
+
};
|
|
1053
|
+
return getScore;
|
|
1054
|
+
};
|
|
1055
|
+
function createLLMAsJudge({ prompt, feedbackKey = "score", model, system, judge, continuous = false, choices, useReasoning = true, fewShotExamples, outputSchema }) {
|
|
1056
|
+
if (outputSchema !== void 0 && _isStructuredPrompt(prompt)) {
|
|
1057
|
+
throw new Error("You may not provide both an `outputSchema` parameter and a LangChain prompt with output schema.");
|
|
1058
|
+
}
|
|
1059
|
+
const scorer = _createLLMAsJudgeScorer({
|
|
1060
|
+
prompt,
|
|
1061
|
+
judge,
|
|
1062
|
+
model,
|
|
1063
|
+
system,
|
|
1064
|
+
continuous,
|
|
1065
|
+
choices,
|
|
1066
|
+
useReasoning,
|
|
1067
|
+
fewShotExamples,
|
|
1068
|
+
schema: outputSchema
|
|
1069
|
+
});
|
|
1070
|
+
const _wrappedEvaluator = async (inputs) => {
|
|
1071
|
+
const runName = feedbackKey !== "score" ? "llm_as_judge" : `llm_as_${feedbackKey}_judge`;
|
|
1072
|
+
return _runEvaluatorUntyped(runName, scorer, feedbackKey, inputs, void 0, outputSchema !== void 0 || _isStructuredPrompt(prompt));
|
|
1073
|
+
};
|
|
1074
|
+
return _wrappedEvaluator;
|
|
1075
|
+
}
|
|
1076
|
+
|
|
1077
|
+
// node_modules/openevals/dist/code/base.js
|
|
1078
|
+
var import_universal2 = require("langchain/chat_models/universal");
|
|
1079
|
+
var import_prompts2 = require("@langchain/core/prompts");
|
|
1080
|
+
|
|
1081
|
+
// node_modules/openevals/dist/simulators/multiturn.js
|
|
1082
|
+
var import_traceable4 = require("langsmith/traceable");
|
|
1083
|
+
|
|
1084
|
+
// node_modules/openevals/dist/simulators/prebuilts.js
|
|
1085
|
+
var import_universal3 = require("langchain/chat_models/universal");
|
|
1086
|
+
|
|
1087
|
+
// src/eval/evaluators/llm-judge.ts
|
|
1088
|
+
var RESPONSE_CRITERIA_PROMPT = `You are an expert evaluator.
|
|
1089
|
+
Assess the following AI response based on the given criteria.
|
|
1090
|
+
|
|
1091
|
+
<Criteria>
|
|
1092
|
+
{criteria}
|
|
1093
|
+
</Criteria>
|
|
1094
|
+
|
|
1095
|
+
<Response>
|
|
1096
|
+
{outputs}
|
|
1097
|
+
</Response>
|
|
1098
|
+
|
|
1099
|
+
Grade whether the response meets the criteria.`;
|
|
1100
|
+
function createLlmJudgeEvaluator(modelConfig, model, criteria) {
|
|
1101
|
+
const resolver = new LangchainModelResolver(modelConfig);
|
|
1102
|
+
const judge = resolver.resolve(model);
|
|
1103
|
+
const llmJudge2 = createLLMAsJudge({
|
|
1104
|
+
prompt: RESPONSE_CRITERIA_PROMPT,
|
|
1105
|
+
feedbackKey: "llm_judge",
|
|
1106
|
+
judge,
|
|
1107
|
+
useReasoning: true
|
|
1108
|
+
});
|
|
1109
|
+
return async ({ outputs }) => {
|
|
1110
|
+
const messages = outputs.messages || [];
|
|
1111
|
+
const lastAiMessage = [...messages].reverse().find((m) => m instanceof import_messages6.AIMessage);
|
|
1112
|
+
if (!lastAiMessage) {
|
|
1113
|
+
return { key: "llm_judge", score: false, comment: "No AI message found in trajectory" };
|
|
1114
|
+
}
|
|
1115
|
+
const responseText = typeof lastAiMessage.content === "string" ? lastAiMessage.content : JSON.stringify(lastAiMessage.content);
|
|
1116
|
+
return llmJudge2({
|
|
1117
|
+
outputs: responseText,
|
|
1118
|
+
criteria
|
|
1119
|
+
});
|
|
1120
|
+
};
|
|
1121
|
+
}
|
|
1122
|
+
|
|
1123
|
+
// src/eval/evaluators/response-content.ts
|
|
1124
|
+
var import_messages7 = require("@langchain/core/messages");
|
|
579
1125
|
function createResponseContentEvaluator() {
|
|
580
1126
|
return async ({
|
|
581
1127
|
outputs,
|
|
@@ -587,7 +1133,7 @@ function createResponseContentEvaluator() {
|
|
|
587
1133
|
return { key: "response_content", score: true, comment: "No content assertions specified, skipping" };
|
|
588
1134
|
}
|
|
589
1135
|
const messages = outputs.messages || [];
|
|
590
|
-
const lastAiMessage = [...messages].reverse().find((m) => m instanceof
|
|
1136
|
+
const lastAiMessage = [...messages].reverse().find((m) => m instanceof import_messages7.AIMessage);
|
|
591
1137
|
if (!lastAiMessage) {
|
|
592
1138
|
return { key: "response_content", score: false, comment: "No AI message found in trajectory" };
|
|
593
1139
|
}
|
|
@@ -613,7 +1159,7 @@ function createResponseContentEvaluator() {
|
|
|
613
1159
|
}
|
|
614
1160
|
|
|
615
1161
|
// src/eval/evaluators/no-tool-calls.ts
|
|
616
|
-
var
|
|
1162
|
+
var import_messages8 = require("@langchain/core/messages");
|
|
617
1163
|
function createNoToolCallsEvaluator() {
|
|
618
1164
|
return async ({
|
|
619
1165
|
outputs,
|
|
@@ -624,7 +1170,7 @@ function createNoToolCallsEvaluator() {
|
|
|
624
1170
|
}
|
|
625
1171
|
const messages = outputs.messages || [];
|
|
626
1172
|
const exceptTools = referenceOutputs?.exceptTools ?? [];
|
|
627
|
-
const toolCalls = messages.filter((m) => m instanceof
|
|
1173
|
+
const toolCalls = messages.filter((m) => m instanceof import_messages8.AIMessage).flatMap((m) => m.tool_calls || []);
|
|
628
1174
|
const disallowedCalls = exceptTools.length > 0 ? toolCalls.filter((tc) => !exceptTools.includes(tc.name)) : toolCalls;
|
|
629
1175
|
const passed = disallowedCalls.length === 0;
|
|
630
1176
|
if (exceptTools.length > 0) {
|
|
@@ -643,7 +1189,7 @@ function createNoToolCallsEvaluator() {
|
|
|
643
1189
|
}
|
|
644
1190
|
|
|
645
1191
|
// src/eval/evaluators/any-tool-called.ts
|
|
646
|
-
var
|
|
1192
|
+
var import_messages9 = require("@langchain/core/messages");
|
|
647
1193
|
function createAnyToolCalledEvaluator() {
|
|
648
1194
|
return async ({
|
|
649
1195
|
outputs,
|
|
@@ -654,7 +1200,7 @@ function createAnyToolCalledEvaluator() {
|
|
|
654
1200
|
}
|
|
655
1201
|
const expectedTools = referenceOutputs?.anyToolsExpected ?? [];
|
|
656
1202
|
const messages = outputs.messages || [];
|
|
657
|
-
const calledToolNames = messages.filter((m) => m instanceof
|
|
1203
|
+
const calledToolNames = messages.filter((m) => m instanceof import_messages9.AIMessage).flatMap((m) => m.tool_calls || []).map((tc) => tc.name);
|
|
658
1204
|
if (expectedTools.length === 0) {
|
|
659
1205
|
const passed2 = calledToolNames.length > 0;
|
|
660
1206
|
return {
|
|
@@ -673,6 +1219,82 @@ function createAnyToolCalledEvaluator() {
|
|
|
673
1219
|
};
|
|
674
1220
|
}
|
|
675
1221
|
|
|
1222
|
+
// src/eval/evaluators/tool-input.ts
|
|
1223
|
+
var import_messages10 = require("@langchain/core/messages");
|
|
1224
|
+
function createToolInputEvaluator() {
|
|
1225
|
+
return async ({
|
|
1226
|
+
outputs,
|
|
1227
|
+
referenceOutputs
|
|
1228
|
+
}) => {
|
|
1229
|
+
const expectations = referenceOutputs?.toolInputExpectations ?? [];
|
|
1230
|
+
if (expectations.length === 0) {
|
|
1231
|
+
return { key: "tool_input", score: true, comment: "No tool input expectations specified, skipping" };
|
|
1232
|
+
}
|
|
1233
|
+
const messages = outputs.messages || [];
|
|
1234
|
+
const allToolCalls = messages.filter((m) => m instanceof import_messages10.AIMessage).flatMap((m) => m.tool_calls || []);
|
|
1235
|
+
const results = [];
|
|
1236
|
+
for (const expectation of expectations) {
|
|
1237
|
+
const matchingCalls = allToolCalls.filter((tc) => tc.name === expectation.name);
|
|
1238
|
+
const subChecks = [];
|
|
1239
|
+
let passed = true;
|
|
1240
|
+
if (expectation.times !== void 0) {
|
|
1241
|
+
const countOk = matchingCalls.length >= expectation.times;
|
|
1242
|
+
if (!countOk) {
|
|
1243
|
+
passed = false;
|
|
1244
|
+
subChecks.push(
|
|
1245
|
+
`expected at least ${expectation.times} call(s), got ${matchingCalls.length}`
|
|
1246
|
+
);
|
|
1247
|
+
} else {
|
|
1248
|
+
subChecks.push(
|
|
1249
|
+
`call count OK (${matchingCalls.length} >= ${expectation.times})`
|
|
1250
|
+
);
|
|
1251
|
+
}
|
|
1252
|
+
}
|
|
1253
|
+
if (expectation.validate) {
|
|
1254
|
+
if (matchingCalls.length === 0) {
|
|
1255
|
+
passed = false;
|
|
1256
|
+
subChecks.push("was never called");
|
|
1257
|
+
} else {
|
|
1258
|
+
const anyValid = matchingCalls.some((tc) => {
|
|
1259
|
+
try {
|
|
1260
|
+
return expectation.validate(tc.args);
|
|
1261
|
+
} catch {
|
|
1262
|
+
return false;
|
|
1263
|
+
}
|
|
1264
|
+
});
|
|
1265
|
+
if (!anyValid) {
|
|
1266
|
+
passed = false;
|
|
1267
|
+
subChecks.push(
|
|
1268
|
+
`input validation failed for all ${matchingCalls.length} call(s)`
|
|
1269
|
+
);
|
|
1270
|
+
} else {
|
|
1271
|
+
subChecks.push("input validation passed");
|
|
1272
|
+
}
|
|
1273
|
+
}
|
|
1274
|
+
}
|
|
1275
|
+
if (expectation.times === void 0 && !expectation.validate) {
|
|
1276
|
+
if (matchingCalls.length === 0) {
|
|
1277
|
+
passed = false;
|
|
1278
|
+
subChecks.push("was never called");
|
|
1279
|
+
} else {
|
|
1280
|
+
subChecks.push(`called ${matchingCalls.length} time(s)`);
|
|
1281
|
+
}
|
|
1282
|
+
}
|
|
1283
|
+
results.push({
|
|
1284
|
+
name: expectation.name,
|
|
1285
|
+
passed,
|
|
1286
|
+
comment: `"${expectation.name}": ${subChecks.join(", ")}`
|
|
1287
|
+
});
|
|
1288
|
+
}
|
|
1289
|
+
const allPassed = results.every((r) => r.passed);
|
|
1290
|
+
return {
|
|
1291
|
+
key: "tool_input",
|
|
1292
|
+
score: allPassed,
|
|
1293
|
+
comment: results.map((r) => r.comment).join("; ")
|
|
1294
|
+
};
|
|
1295
|
+
};
|
|
1296
|
+
}
|
|
1297
|
+
|
|
676
1298
|
// src/eval/expectations.ts
|
|
677
1299
|
function withTrajectoryGuard(evaluator, key) {
|
|
678
1300
|
return async ({ outputs, referenceOutputs }) => {
|
|
@@ -699,27 +1321,48 @@ function buildTrajectory(message, toolNames) {
|
|
|
699
1321
|
return trajectory;
|
|
700
1322
|
}
|
|
701
1323
|
function toolsCalled(tools) {
|
|
702
|
-
|
|
703
|
-
|
|
1324
|
+
const toolNames = tools.map((t) => typeof t === "string" ? t : t.name);
|
|
1325
|
+
const validators = tools.filter((t) => typeof t !== "string");
|
|
1326
|
+
return (ctx) => {
|
|
1327
|
+
const trajectoryEvaluator = ls2.wrapEvaluator(
|
|
704
1328
|
withTrajectoryGuard(
|
|
705
1329
|
(0, import_agentevals.createTrajectoryMatchEvaluator)({ trajectoryMatchMode: "superset", toolArgsMatchMode: "ignore" }),
|
|
706
1330
|
"trajectory_match"
|
|
707
1331
|
)
|
|
708
|
-
)
|
|
709
|
-
|
|
710
|
-
|
|
1332
|
+
);
|
|
1333
|
+
if (validators.length === 0) {
|
|
1334
|
+
return {
|
|
1335
|
+
evaluator: trajectoryEvaluator,
|
|
1336
|
+
referenceOutputs: { referenceTrajectory: buildTrajectory(ctx.message, toolNames) }
|
|
1337
|
+
};
|
|
1338
|
+
}
|
|
1339
|
+
const inputEvaluator = ls2.wrapEvaluator(createToolInputEvaluator());
|
|
1340
|
+
const composedEvaluator = async ({ outputs, referenceOutputs }) => {
|
|
1341
|
+
const trajectoryResult = await trajectoryEvaluator({ outputs, referenceOutputs });
|
|
1342
|
+
const inputResult = await inputEvaluator({
|
|
1343
|
+
outputs,
|
|
1344
|
+
referenceOutputs: { ...referenceOutputs, toolInputExpectations: validators }
|
|
1345
|
+
});
|
|
1346
|
+
const trajectoryPassed = Boolean(trajectoryResult.score);
|
|
1347
|
+
const inputPassed = Boolean(inputResult.score);
|
|
1348
|
+
return {
|
|
1349
|
+
key: "tools_called",
|
|
1350
|
+
score: trajectoryPassed && inputPassed,
|
|
1351
|
+
comment: [trajectoryResult.comment, inputResult.comment].filter(Boolean).join("; ")
|
|
1352
|
+
};
|
|
1353
|
+
};
|
|
1354
|
+
return {
|
|
1355
|
+
evaluator: composedEvaluator,
|
|
1356
|
+
referenceOutputs: { referenceTrajectory: buildTrajectory(ctx.message, toolNames) }
|
|
1357
|
+
};
|
|
1358
|
+
};
|
|
711
1359
|
}
|
|
712
|
-
function llmJudge() {
|
|
1360
|
+
function llmJudge(criteria) {
|
|
713
1361
|
return () => {
|
|
714
1362
|
const config = getEvalConfig();
|
|
715
1363
|
const model = config.evaluatorModel;
|
|
716
1364
|
return {
|
|
717
|
-
evaluator: ls2.wrapEvaluator(
|
|
718
|
-
withTrajectoryGuard(
|
|
719
|
-
(0, import_agentevals.createTrajectoryLLMAsJudge)({ prompt: import_agentevals.TRAJECTORY_ACCURACY_PROMPT, model }),
|
|
720
|
-
"trajectory_llm_judge"
|
|
721
|
-
)
|
|
722
|
-
),
|
|
1365
|
+
evaluator: ls2.wrapEvaluator(createLlmJudgeEvaluator(config.modelConfig, model, criteria)),
|
|
723
1366
|
referenceOutputs: {}
|
|
724
1367
|
};
|
|
725
1368
|
};
|