@mastra/evals 1.2.2 → 1.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +18 -0
- package/dist/docs/SKILL.md +1 -1
- package/dist/docs/assets/SOURCE_MAP.json +1 -1
- package/dist/docs/references/docs-evals-overview.md +2 -2
- package/dist/docs/references/reference-evals-noise-sensitivity.md +1 -1
- package/dist/scorers/llm/answer-relevancy/index.d.ts.map +1 -1
- package/dist/scorers/llm/answer-similarity/index.d.ts.map +1 -1
- package/dist/scorers/llm/bias/index.d.ts.map +1 -1
- package/dist/scorers/llm/context-precision/index.d.ts.map +1 -1
- package/dist/scorers/llm/context-relevance/index.d.ts.map +1 -1
- package/dist/scorers/llm/faithfulness/index.d.ts.map +1 -1
- package/dist/scorers/llm/hallucination/index.d.ts.map +1 -1
- package/dist/scorers/llm/noise-sensitivity/index.d.ts.map +1 -1
- package/dist/scorers/llm/prompt-alignment/index.d.ts.map +1 -1
- package/dist/scorers/llm/tool-call-accuracy/index.d.ts.map +1 -1
- package/dist/scorers/llm/toxicity/index.d.ts.map +1 -1
- package/dist/scorers/llm/trajectory/index.d.ts.map +1 -1
- package/dist/scorers/prebuilt/index.cjs +656 -132
- package/dist/scorers/prebuilt/index.cjs.map +1 -1
- package/dist/scorers/prebuilt/index.js +656 -132
- package/dist/scorers/prebuilt/index.js.map +1 -1
- package/package.json +9 -10
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
import { getAssistantMessageFromRunOutput, getUserMessageFromRunInput, roundToTwoDecimals, extractToolCalls, getCombinedSystemPrompt, getTextContentFromMastraDBMessage, compareTrajectories, checkTrajectoryEfficiency, checkTrajectoryBlacklist, analyzeToolFailures, isScorerRunInputForAgent, isScorerRunOutputForAgent } from '../../chunk-ZRHCSFKL.js';
|
|
2
2
|
import { createScorer } from '@mastra/core/evals';
|
|
3
|
-
import { z } from 'zod';
|
|
4
3
|
import nlp from 'compromise';
|
|
5
4
|
import keyword_extractor from 'keyword-extractor';
|
|
6
5
|
import stringSimilarity from 'string-similarity';
|
|
@@ -210,9 +209,21 @@ var ANSWER_RELEVANCY_AGENT_INSTRUCTIONS = `
|
|
|
210
209
|
5. Empty inputs or error messages should always be marked as "no"
|
|
211
210
|
6. Responses that discuss the type of information being asked show partial relevance
|
|
212
211
|
`;
|
|
213
|
-
var extractOutputSchema =
|
|
214
|
-
|
|
215
|
-
|
|
212
|
+
var extractOutputSchema = {
|
|
213
|
+
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
214
|
+
"type": "object",
|
|
215
|
+
"properties": {
|
|
216
|
+
"statements": {
|
|
217
|
+
"type": "array",
|
|
218
|
+
"items": {
|
|
219
|
+
"type": "string"
|
|
220
|
+
}
|
|
221
|
+
}
|
|
222
|
+
},
|
|
223
|
+
"required": [
|
|
224
|
+
"statements"
|
|
225
|
+
]
|
|
226
|
+
};
|
|
216
227
|
function createAnswerRelevancyScorer({
|
|
217
228
|
model,
|
|
218
229
|
options = DEFAULT_OPTIONS
|
|
@@ -235,7 +246,33 @@ function createAnswerRelevancyScorer({
|
|
|
235
246
|
}
|
|
236
247
|
}).analyze({
|
|
237
248
|
description: "Score the relevance of the statements to the input",
|
|
238
|
-
outputSchema:
|
|
249
|
+
outputSchema: {
|
|
250
|
+
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
251
|
+
"type": "object",
|
|
252
|
+
"properties": {
|
|
253
|
+
"results": {
|
|
254
|
+
"type": "array",
|
|
255
|
+
"items": {
|
|
256
|
+
"type": "object",
|
|
257
|
+
"properties": {
|
|
258
|
+
"result": {
|
|
259
|
+
"type": "string"
|
|
260
|
+
},
|
|
261
|
+
"reason": {
|
|
262
|
+
"type": "string"
|
|
263
|
+
}
|
|
264
|
+
},
|
|
265
|
+
"required": [
|
|
266
|
+
"result",
|
|
267
|
+
"reason"
|
|
268
|
+
]
|
|
269
|
+
}
|
|
270
|
+
}
|
|
271
|
+
},
|
|
272
|
+
"required": [
|
|
273
|
+
"results"
|
|
274
|
+
]
|
|
275
|
+
},
|
|
239
276
|
createPrompt: ({ run, results }) => {
|
|
240
277
|
const input = getUserMessageFromRunInput(run.input) ?? "";
|
|
241
278
|
return createScorePrompt(JSON.stringify(input), results.preprocessStepResult?.statements || []);
|
|
@@ -408,28 +445,106 @@ Key Principles:
|
|
|
408
445
|
5. Provide actionable feedback for improving answer accuracy
|
|
409
446
|
6. Be strict but fair - partial credit for partial matches
|
|
410
447
|
`;
|
|
411
|
-
var extractOutputSchema2 =
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
}
|
|
448
|
+
var extractOutputSchema2 = {
|
|
449
|
+
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
450
|
+
"type": "object",
|
|
451
|
+
"properties": {
|
|
452
|
+
"outputUnits": {
|
|
453
|
+
"type": "array",
|
|
454
|
+
"items": {
|
|
455
|
+
"type": "string"
|
|
456
|
+
}
|
|
457
|
+
},
|
|
458
|
+
"groundTruthUnits": {
|
|
459
|
+
"type": "array",
|
|
460
|
+
"items": {
|
|
461
|
+
"type": "string"
|
|
462
|
+
}
|
|
463
|
+
}
|
|
464
|
+
},
|
|
465
|
+
"required": [
|
|
466
|
+
"outputUnits",
|
|
467
|
+
"groundTruthUnits"
|
|
468
|
+
]
|
|
469
|
+
};
|
|
470
|
+
var analyzeOutputSchema = {
|
|
471
|
+
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
472
|
+
"type": "object",
|
|
473
|
+
"properties": {
|
|
474
|
+
"matches": {
|
|
475
|
+
"type": "array",
|
|
476
|
+
"items": {
|
|
477
|
+
"type": "object",
|
|
478
|
+
"properties": {
|
|
479
|
+
"groundTruthUnit": {
|
|
480
|
+
"type": "string"
|
|
481
|
+
},
|
|
482
|
+
"outputUnit": {
|
|
483
|
+
"anyOf": [
|
|
484
|
+
{
|
|
485
|
+
"type": "string"
|
|
486
|
+
},
|
|
487
|
+
{
|
|
488
|
+
"type": "null"
|
|
489
|
+
}
|
|
490
|
+
]
|
|
491
|
+
},
|
|
492
|
+
"matchType": {
|
|
493
|
+
"type": "string",
|
|
494
|
+
"enum": [
|
|
495
|
+
"exact",
|
|
496
|
+
"semantic",
|
|
497
|
+
"partial",
|
|
498
|
+
"missing"
|
|
499
|
+
]
|
|
500
|
+
},
|
|
501
|
+
"explanation": {
|
|
502
|
+
"type": "string"
|
|
503
|
+
}
|
|
504
|
+
},
|
|
505
|
+
"required": [
|
|
506
|
+
"groundTruthUnit",
|
|
507
|
+
"outputUnit",
|
|
508
|
+
"matchType",
|
|
509
|
+
"explanation"
|
|
510
|
+
]
|
|
511
|
+
}
|
|
512
|
+
},
|
|
513
|
+
"extraInOutput": {
|
|
514
|
+
"type": "array",
|
|
515
|
+
"items": {
|
|
516
|
+
"type": "string"
|
|
517
|
+
}
|
|
518
|
+
},
|
|
519
|
+
"contradictions": {
|
|
520
|
+
"type": "array",
|
|
521
|
+
"items": {
|
|
522
|
+
"type": "object",
|
|
523
|
+
"properties": {
|
|
524
|
+
"outputUnit": {
|
|
525
|
+
"type": "string"
|
|
526
|
+
},
|
|
527
|
+
"groundTruthUnit": {
|
|
528
|
+
"type": "string"
|
|
529
|
+
},
|
|
530
|
+
"explanation": {
|
|
531
|
+
"type": "string"
|
|
532
|
+
}
|
|
533
|
+
},
|
|
534
|
+
"required": [
|
|
535
|
+
"outputUnit",
|
|
536
|
+
"groundTruthUnit",
|
|
537
|
+
"explanation"
|
|
538
|
+
]
|
|
539
|
+
}
|
|
540
|
+
}
|
|
541
|
+
},
|
|
542
|
+
"required": [
|
|
543
|
+
"matches",
|
|
544
|
+
"extraInOutput",
|
|
545
|
+
"contradictions"
|
|
546
|
+
]
|
|
547
|
+
};
|
|
433
548
|
function createAnswerSimilarityScorer({
|
|
434
549
|
model,
|
|
435
550
|
options = ANSWER_SIMILARITY_DEFAULT_OPTIONS
|
|
@@ -708,16 +823,54 @@ function createFaithfulnessScorer({
|
|
|
708
823
|
type: "agent"
|
|
709
824
|
}).preprocess({
|
|
710
825
|
description: "Extract relevant statements from the LLM output",
|
|
711
|
-
outputSchema:
|
|
712
|
-
|
|
713
|
-
|
|
826
|
+
outputSchema: {
|
|
827
|
+
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
828
|
+
"type": "object",
|
|
829
|
+
"properties": {
|
|
830
|
+
"claims": {
|
|
831
|
+
"type": "array",
|
|
832
|
+
"items": {
|
|
833
|
+
"type": "string"
|
|
834
|
+
}
|
|
835
|
+
}
|
|
836
|
+
},
|
|
837
|
+
"required": [
|
|
838
|
+
"claims"
|
|
839
|
+
]
|
|
840
|
+
},
|
|
714
841
|
createPrompt: ({ run }) => {
|
|
715
842
|
const prompt = createFaithfulnessExtractPrompt({ output: getAssistantMessageFromRunOutput(run.output) ?? "" });
|
|
716
843
|
return prompt;
|
|
717
844
|
}
|
|
718
845
|
}).analyze({
|
|
719
846
|
description: "Score the relevance of the statements to the input",
|
|
720
|
-
outputSchema:
|
|
847
|
+
outputSchema: {
|
|
848
|
+
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
849
|
+
"type": "object",
|
|
850
|
+
"properties": {
|
|
851
|
+
"verdicts": {
|
|
852
|
+
"type": "array",
|
|
853
|
+
"items": {
|
|
854
|
+
"type": "object",
|
|
855
|
+
"properties": {
|
|
856
|
+
"verdict": {
|
|
857
|
+
"type": "string"
|
|
858
|
+
},
|
|
859
|
+
"reason": {
|
|
860
|
+
"type": "string"
|
|
861
|
+
}
|
|
862
|
+
},
|
|
863
|
+
"required": [
|
|
864
|
+
"verdict",
|
|
865
|
+
"reason"
|
|
866
|
+
]
|
|
867
|
+
}
|
|
868
|
+
}
|
|
869
|
+
},
|
|
870
|
+
"required": [
|
|
871
|
+
"verdicts"
|
|
872
|
+
]
|
|
873
|
+
},
|
|
721
874
|
createPrompt: ({ results, run }) => {
|
|
722
875
|
const context = options?.context ?? getToolInvocationContext(run.output);
|
|
723
876
|
const prompt = createFaithfulnessAnalyzePrompt({
|
|
@@ -869,13 +1022,51 @@ function createBiasScorer({ model, options }) {
|
|
|
869
1022
|
type: "agent"
|
|
870
1023
|
}).preprocess({
|
|
871
1024
|
description: "Extract relevant statements from the LLM output",
|
|
872
|
-
outputSchema:
|
|
873
|
-
|
|
874
|
-
|
|
1025
|
+
outputSchema: {
|
|
1026
|
+
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
1027
|
+
"type": "object",
|
|
1028
|
+
"properties": {
|
|
1029
|
+
"opinions": {
|
|
1030
|
+
"type": "array",
|
|
1031
|
+
"items": {
|
|
1032
|
+
"type": "string"
|
|
1033
|
+
}
|
|
1034
|
+
}
|
|
1035
|
+
},
|
|
1036
|
+
"required": [
|
|
1037
|
+
"opinions"
|
|
1038
|
+
]
|
|
1039
|
+
},
|
|
875
1040
|
createPrompt: ({ run }) => createBiasExtractPrompt({ output: getAssistantMessageFromRunOutput(run.output) ?? "" })
|
|
876
1041
|
}).analyze({
|
|
877
1042
|
description: "Score the relevance of the statements to the input",
|
|
878
|
-
outputSchema:
|
|
1043
|
+
outputSchema: {
|
|
1044
|
+
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
1045
|
+
"type": "object",
|
|
1046
|
+
"properties": {
|
|
1047
|
+
"results": {
|
|
1048
|
+
"type": "array",
|
|
1049
|
+
"items": {
|
|
1050
|
+
"type": "object",
|
|
1051
|
+
"properties": {
|
|
1052
|
+
"result": {
|
|
1053
|
+
"type": "string"
|
|
1054
|
+
},
|
|
1055
|
+
"reason": {
|
|
1056
|
+
"type": "string"
|
|
1057
|
+
}
|
|
1058
|
+
},
|
|
1059
|
+
"required": [
|
|
1060
|
+
"result",
|
|
1061
|
+
"reason"
|
|
1062
|
+
]
|
|
1063
|
+
}
|
|
1064
|
+
}
|
|
1065
|
+
},
|
|
1066
|
+
"required": [
|
|
1067
|
+
"results"
|
|
1068
|
+
]
|
|
1069
|
+
},
|
|
879
1070
|
createPrompt: ({ run, results }) => {
|
|
880
1071
|
const prompt = createBiasAnalyzePrompt({
|
|
881
1072
|
output: getAssistantMessageFromRunOutput(run.output) ?? "",
|
|
@@ -1104,18 +1295,58 @@ function createHallucinationScorer({
|
|
|
1104
1295
|
type: "agent"
|
|
1105
1296
|
}).preprocess({
|
|
1106
1297
|
description: "Extract all claims from the given output",
|
|
1107
|
-
outputSchema:
|
|
1108
|
-
|
|
1109
|
-
|
|
1298
|
+
outputSchema: {
|
|
1299
|
+
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
1300
|
+
"type": "object",
|
|
1301
|
+
"properties": {
|
|
1302
|
+
"claims": {
|
|
1303
|
+
"type": "array",
|
|
1304
|
+
"items": {
|
|
1305
|
+
"type": "string"
|
|
1306
|
+
}
|
|
1307
|
+
}
|
|
1308
|
+
},
|
|
1309
|
+
"required": [
|
|
1310
|
+
"claims"
|
|
1311
|
+
]
|
|
1312
|
+
},
|
|
1110
1313
|
createPrompt: ({ run }) => {
|
|
1111
1314
|
const prompt = createHallucinationExtractPrompt({ output: getAssistantMessageFromRunOutput(run.output) ?? "" });
|
|
1112
1315
|
return prompt;
|
|
1113
1316
|
}
|
|
1114
1317
|
}).analyze({
|
|
1115
1318
|
description: "Score the relevance of the statements to the input",
|
|
1116
|
-
outputSchema:
|
|
1117
|
-
|
|
1118
|
-
|
|
1319
|
+
outputSchema: {
|
|
1320
|
+
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
1321
|
+
"type": "object",
|
|
1322
|
+
"properties": {
|
|
1323
|
+
"verdicts": {
|
|
1324
|
+
"type": "array",
|
|
1325
|
+
"items": {
|
|
1326
|
+
"type": "object",
|
|
1327
|
+
"properties": {
|
|
1328
|
+
"statement": {
|
|
1329
|
+
"type": "string"
|
|
1330
|
+
},
|
|
1331
|
+
"verdict": {
|
|
1332
|
+
"type": "string"
|
|
1333
|
+
},
|
|
1334
|
+
"reason": {
|
|
1335
|
+
"type": "string"
|
|
1336
|
+
}
|
|
1337
|
+
},
|
|
1338
|
+
"required": [
|
|
1339
|
+
"statement",
|
|
1340
|
+
"verdict",
|
|
1341
|
+
"reason"
|
|
1342
|
+
]
|
|
1343
|
+
}
|
|
1344
|
+
}
|
|
1345
|
+
},
|
|
1346
|
+
"required": [
|
|
1347
|
+
"verdicts"
|
|
1348
|
+
]
|
|
1349
|
+
},
|
|
1119
1350
|
createPrompt: async ({ run, results }) => {
|
|
1120
1351
|
let context;
|
|
1121
1352
|
if (options?.getContext) {
|
|
@@ -1259,7 +1490,33 @@ function createToxicityScorer({
|
|
|
1259
1490
|
type: "agent"
|
|
1260
1491
|
}).analyze({
|
|
1261
1492
|
description: "Score the relevance of the statements to the input",
|
|
1262
|
-
outputSchema:
|
|
1493
|
+
outputSchema: {
|
|
1494
|
+
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
1495
|
+
"type": "object",
|
|
1496
|
+
"properties": {
|
|
1497
|
+
"verdicts": {
|
|
1498
|
+
"type": "array",
|
|
1499
|
+
"items": {
|
|
1500
|
+
"type": "object",
|
|
1501
|
+
"properties": {
|
|
1502
|
+
"verdict": {
|
|
1503
|
+
"type": "string"
|
|
1504
|
+
},
|
|
1505
|
+
"reason": {
|
|
1506
|
+
"type": "string"
|
|
1507
|
+
}
|
|
1508
|
+
},
|
|
1509
|
+
"required": [
|
|
1510
|
+
"verdict",
|
|
1511
|
+
"reason"
|
|
1512
|
+
]
|
|
1513
|
+
}
|
|
1514
|
+
}
|
|
1515
|
+
},
|
|
1516
|
+
"required": [
|
|
1517
|
+
"verdicts"
|
|
1518
|
+
]
|
|
1519
|
+
},
|
|
1263
1520
|
createPrompt: ({ run }) => {
|
|
1264
1521
|
const prompt = createToxicityAnalyzePrompt({
|
|
1265
1522
|
input: getUserMessageFromRunInput(run.input) ?? "",
|
|
@@ -1386,16 +1643,43 @@ Provide a single, concise sentence explaining why this score was given.
|
|
|
1386
1643
|
};
|
|
1387
1644
|
|
|
1388
1645
|
// src/scorers/llm/tool-call-accuracy/index.ts
|
|
1389
|
-
var analyzeOutputSchema2 =
|
|
1390
|
-
|
|
1391
|
-
|
|
1392
|
-
|
|
1393
|
-
|
|
1394
|
-
|
|
1395
|
-
|
|
1396
|
-
|
|
1397
|
-
|
|
1398
|
-
|
|
1646
|
+
var analyzeOutputSchema2 = {
|
|
1647
|
+
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
1648
|
+
"type": "object",
|
|
1649
|
+
"properties": {
|
|
1650
|
+
"evaluations": {
|
|
1651
|
+
"type": "array",
|
|
1652
|
+
"items": {
|
|
1653
|
+
"type": "object",
|
|
1654
|
+
"properties": {
|
|
1655
|
+
"toolCalled": {
|
|
1656
|
+
"type": "string"
|
|
1657
|
+
},
|
|
1658
|
+
"wasAppropriate": {
|
|
1659
|
+
"type": "boolean"
|
|
1660
|
+
},
|
|
1661
|
+
"reasoning": {
|
|
1662
|
+
"type": "string"
|
|
1663
|
+
}
|
|
1664
|
+
},
|
|
1665
|
+
"required": [
|
|
1666
|
+
"toolCalled",
|
|
1667
|
+
"wasAppropriate",
|
|
1668
|
+
"reasoning"
|
|
1669
|
+
]
|
|
1670
|
+
}
|
|
1671
|
+
},
|
|
1672
|
+
"missingTools": {
|
|
1673
|
+
"type": "array",
|
|
1674
|
+
"items": {
|
|
1675
|
+
"type": "string"
|
|
1676
|
+
}
|
|
1677
|
+
}
|
|
1678
|
+
},
|
|
1679
|
+
"required": [
|
|
1680
|
+
"evaluations"
|
|
1681
|
+
]
|
|
1682
|
+
};
|
|
1399
1683
|
function createToolCallAccuracyScorerLLM({ model, availableTools }) {
|
|
1400
1684
|
const toolDefinitions = availableTools.map((tool) => `${tool.id}: ${tool.description}`).join("\n");
|
|
1401
1685
|
return createScorer({
|
|
@@ -1606,19 +1890,62 @@ Example responses:
|
|
|
1606
1890
|
}
|
|
1607
1891
|
|
|
1608
1892
|
// src/scorers/llm/context-relevance/index.ts
|
|
1609
|
-
var analyzeOutputSchema3 =
|
|
1610
|
-
|
|
1611
|
-
|
|
1612
|
-
|
|
1613
|
-
|
|
1614
|
-
|
|
1615
|
-
|
|
1616
|
-
|
|
1617
|
-
|
|
1618
|
-
|
|
1619
|
-
|
|
1620
|
-
|
|
1621
|
-
|
|
1893
|
+
var analyzeOutputSchema3 = {
|
|
1894
|
+
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
1895
|
+
"type": "object",
|
|
1896
|
+
"properties": {
|
|
1897
|
+
"evaluations": {
|
|
1898
|
+
"type": "array",
|
|
1899
|
+
"items": {
|
|
1900
|
+
"type": "object",
|
|
1901
|
+
"properties": {
|
|
1902
|
+
"context_index": {
|
|
1903
|
+
"type": "number"
|
|
1904
|
+
},
|
|
1905
|
+
"contextPiece": {
|
|
1906
|
+
"type": "string"
|
|
1907
|
+
},
|
|
1908
|
+
"relevanceLevel": {
|
|
1909
|
+
"type": "string",
|
|
1910
|
+
"enum": [
|
|
1911
|
+
"high",
|
|
1912
|
+
"medium",
|
|
1913
|
+
"low",
|
|
1914
|
+
"none"
|
|
1915
|
+
]
|
|
1916
|
+
},
|
|
1917
|
+
"wasUsed": {
|
|
1918
|
+
"type": "boolean"
|
|
1919
|
+
},
|
|
1920
|
+
"reasoning": {
|
|
1921
|
+
"type": "string"
|
|
1922
|
+
}
|
|
1923
|
+
},
|
|
1924
|
+
"required": [
|
|
1925
|
+
"context_index",
|
|
1926
|
+
"contextPiece",
|
|
1927
|
+
"relevanceLevel",
|
|
1928
|
+
"wasUsed",
|
|
1929
|
+
"reasoning"
|
|
1930
|
+
]
|
|
1931
|
+
}
|
|
1932
|
+
},
|
|
1933
|
+
"missingContext": {
|
|
1934
|
+
"default": [],
|
|
1935
|
+
"type": "array",
|
|
1936
|
+
"items": {
|
|
1937
|
+
"type": "string"
|
|
1938
|
+
}
|
|
1939
|
+
},
|
|
1940
|
+
"overallAssessment": {
|
|
1941
|
+
"type": "string"
|
|
1942
|
+
}
|
|
1943
|
+
},
|
|
1944
|
+
"required": [
|
|
1945
|
+
"evaluations",
|
|
1946
|
+
"overallAssessment"
|
|
1947
|
+
]
|
|
1948
|
+
};
|
|
1622
1949
|
var DEFAULT_PENALTIES = {
|
|
1623
1950
|
UNUSED_HIGH_RELEVANCE_CONTEXT: 0.1,
|
|
1624
1951
|
// 10% penalty per unused high-relevance context
|
|
@@ -1852,15 +2179,37 @@ Example responses:
|
|
|
1852
2179
|
}
|
|
1853
2180
|
|
|
1854
2181
|
// src/scorers/llm/context-precision/index.ts
|
|
1855
|
-
var contextRelevanceOutputSchema =
|
|
1856
|
-
|
|
1857
|
-
|
|
1858
|
-
|
|
1859
|
-
|
|
1860
|
-
|
|
1861
|
-
|
|
1862
|
-
|
|
1863
|
-
|
|
2182
|
+
var contextRelevanceOutputSchema = {
|
|
2183
|
+
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
2184
|
+
"type": "object",
|
|
2185
|
+
"properties": {
|
|
2186
|
+
"verdicts": {
|
|
2187
|
+
"type": "array",
|
|
2188
|
+
"items": {
|
|
2189
|
+
"type": "object",
|
|
2190
|
+
"properties": {
|
|
2191
|
+
"context_index": {
|
|
2192
|
+
"type": "number"
|
|
2193
|
+
},
|
|
2194
|
+
"verdict": {
|
|
2195
|
+
"type": "string"
|
|
2196
|
+
},
|
|
2197
|
+
"reason": {
|
|
2198
|
+
"type": "string"
|
|
2199
|
+
}
|
|
2200
|
+
},
|
|
2201
|
+
"required": [
|
|
2202
|
+
"context_index",
|
|
2203
|
+
"verdict",
|
|
2204
|
+
"reason"
|
|
2205
|
+
]
|
|
2206
|
+
}
|
|
2207
|
+
}
|
|
2208
|
+
},
|
|
2209
|
+
"required": [
|
|
2210
|
+
"verdicts"
|
|
2211
|
+
]
|
|
2212
|
+
};
|
|
1864
2213
|
var getContext2 = ({
|
|
1865
2214
|
input,
|
|
1866
2215
|
output,
|
|
@@ -2139,20 +2488,63 @@ Example responses:
|
|
|
2139
2488
|
}
|
|
2140
2489
|
|
|
2141
2490
|
// src/scorers/llm/noise-sensitivity/index.ts
|
|
2142
|
-
var
|
|
2143
|
-
|
|
2144
|
-
|
|
2145
|
-
|
|
2146
|
-
|
|
2147
|
-
|
|
2148
|
-
|
|
2149
|
-
|
|
2150
|
-
|
|
2151
|
-
|
|
2152
|
-
|
|
2153
|
-
|
|
2154
|
-
|
|
2155
|
-
|
|
2491
|
+
var analyzeOutputSchema4 = {
|
|
2492
|
+
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
2493
|
+
"type": "object",
|
|
2494
|
+
"properties": {
|
|
2495
|
+
"dimensions": {
|
|
2496
|
+
"type": "array",
|
|
2497
|
+
"items": {
|
|
2498
|
+
"type": "object",
|
|
2499
|
+
"properties": {
|
|
2500
|
+
"dimension": {
|
|
2501
|
+
"type": "string"
|
|
2502
|
+
},
|
|
2503
|
+
"impactLevel": {
|
|
2504
|
+
"type": "string",
|
|
2505
|
+
"enum": [
|
|
2506
|
+
"none",
|
|
2507
|
+
"minimal",
|
|
2508
|
+
"moderate",
|
|
2509
|
+
"significant",
|
|
2510
|
+
"severe"
|
|
2511
|
+
]
|
|
2512
|
+
},
|
|
2513
|
+
"specificChanges": {
|
|
2514
|
+
"type": "string"
|
|
2515
|
+
},
|
|
2516
|
+
"noiseInfluence": {
|
|
2517
|
+
"type": "string"
|
|
2518
|
+
}
|
|
2519
|
+
},
|
|
2520
|
+
"required": [
|
|
2521
|
+
"dimension",
|
|
2522
|
+
"impactLevel",
|
|
2523
|
+
"specificChanges",
|
|
2524
|
+
"noiseInfluence"
|
|
2525
|
+
]
|
|
2526
|
+
}
|
|
2527
|
+
},
|
|
2528
|
+
"overallAssessment": {
|
|
2529
|
+
"type": "string"
|
|
2530
|
+
},
|
|
2531
|
+
"majorIssues": {
|
|
2532
|
+
"default": [],
|
|
2533
|
+
"type": "array",
|
|
2534
|
+
"items": {
|
|
2535
|
+
"type": "string"
|
|
2536
|
+
}
|
|
2537
|
+
},
|
|
2538
|
+
"robustnessScore": {
|
|
2539
|
+
"type": "number"
|
|
2540
|
+
}
|
|
2541
|
+
},
|
|
2542
|
+
"required": [
|
|
2543
|
+
"dimensions",
|
|
2544
|
+
"overallAssessment",
|
|
2545
|
+
"robustnessScore"
|
|
2546
|
+
]
|
|
2547
|
+
};
|
|
2156
2548
|
var DEFAULT_IMPACT_WEIGHTS = {
|
|
2157
2549
|
none: 1,
|
|
2158
2550
|
minimal: 0.85,
|
|
@@ -2485,37 +2877,124 @@ Example responses:
|
|
|
2485
2877
|
}
|
|
2486
2878
|
|
|
2487
2879
|
// src/scorers/llm/prompt-alignment/index.ts
|
|
2488
|
-
var
|
|
2489
|
-
|
|
2490
|
-
|
|
2491
|
-
|
|
2492
|
-
|
|
2493
|
-
|
|
2494
|
-
|
|
2495
|
-
|
|
2496
|
-
|
|
2497
|
-
|
|
2498
|
-
|
|
2499
|
-
|
|
2500
|
-
|
|
2501
|
-
|
|
2502
|
-
|
|
2503
|
-
|
|
2504
|
-
|
|
2505
|
-
|
|
2506
|
-
|
|
2507
|
-
|
|
2508
|
-
|
|
2509
|
-
|
|
2510
|
-
|
|
2511
|
-
|
|
2512
|
-
|
|
2513
|
-
|
|
2514
|
-
|
|
2515
|
-
|
|
2516
|
-
|
|
2517
|
-
|
|
2518
|
-
|
|
2880
|
+
var analyzeOutputSchema5 = {
|
|
2881
|
+
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
2882
|
+
"type": "object",
|
|
2883
|
+
"properties": {
|
|
2884
|
+
"intentAlignment": {
|
|
2885
|
+
"type": "object",
|
|
2886
|
+
"properties": {
|
|
2887
|
+
"score": {
|
|
2888
|
+
"type": "number"
|
|
2889
|
+
},
|
|
2890
|
+
"primaryIntent": {
|
|
2891
|
+
"type": "string"
|
|
2892
|
+
},
|
|
2893
|
+
"isAddressed": {
|
|
2894
|
+
"type": "boolean"
|
|
2895
|
+
},
|
|
2896
|
+
"reasoning": {
|
|
2897
|
+
"type": "string"
|
|
2898
|
+
}
|
|
2899
|
+
},
|
|
2900
|
+
"required": [
|
|
2901
|
+
"score",
|
|
2902
|
+
"primaryIntent",
|
|
2903
|
+
"isAddressed",
|
|
2904
|
+
"reasoning"
|
|
2905
|
+
]
|
|
2906
|
+
},
|
|
2907
|
+
"requirementsFulfillment": {
|
|
2908
|
+
"type": "object",
|
|
2909
|
+
"properties": {
|
|
2910
|
+
"requirements": {
|
|
2911
|
+
"type": "array",
|
|
2912
|
+
"items": {
|
|
2913
|
+
"type": "object",
|
|
2914
|
+
"properties": {
|
|
2915
|
+
"requirement": {
|
|
2916
|
+
"type": "string"
|
|
2917
|
+
},
|
|
2918
|
+
"isFulfilled": {
|
|
2919
|
+
"type": "boolean"
|
|
2920
|
+
},
|
|
2921
|
+
"reasoning": {
|
|
2922
|
+
"type": "string"
|
|
2923
|
+
}
|
|
2924
|
+
},
|
|
2925
|
+
"required": [
|
|
2926
|
+
"requirement",
|
|
2927
|
+
"isFulfilled",
|
|
2928
|
+
"reasoning"
|
|
2929
|
+
]
|
|
2930
|
+
}
|
|
2931
|
+
},
|
|
2932
|
+
"overallScore": {
|
|
2933
|
+
"type": "number"
|
|
2934
|
+
}
|
|
2935
|
+
},
|
|
2936
|
+
"required": [
|
|
2937
|
+
"requirements",
|
|
2938
|
+
"overallScore"
|
|
2939
|
+
]
|
|
2940
|
+
},
|
|
2941
|
+
"completeness": {
|
|
2942
|
+
"type": "object",
|
|
2943
|
+
"properties": {
|
|
2944
|
+
"score": {
|
|
2945
|
+
"type": "number"
|
|
2946
|
+
},
|
|
2947
|
+
"missingElements": {
|
|
2948
|
+
"type": "array",
|
|
2949
|
+
"items": {
|
|
2950
|
+
"type": "string"
|
|
2951
|
+
}
|
|
2952
|
+
},
|
|
2953
|
+
"reasoning": {
|
|
2954
|
+
"type": "string"
|
|
2955
|
+
}
|
|
2956
|
+
},
|
|
2957
|
+
"required": [
|
|
2958
|
+
"score",
|
|
2959
|
+
"missingElements",
|
|
2960
|
+
"reasoning"
|
|
2961
|
+
]
|
|
2962
|
+
},
|
|
2963
|
+
"responseAppropriateness": {
|
|
2964
|
+
"type": "object",
|
|
2965
|
+
"properties": {
|
|
2966
|
+
"score": {
|
|
2967
|
+
"type": "number"
|
|
2968
|
+
},
|
|
2969
|
+
"formatAlignment": {
|
|
2970
|
+
"type": "boolean"
|
|
2971
|
+
},
|
|
2972
|
+
"toneAlignment": {
|
|
2973
|
+
"type": "boolean"
|
|
2974
|
+
},
|
|
2975
|
+
"reasoning": {
|
|
2976
|
+
"type": "string"
|
|
2977
|
+
}
|
|
2978
|
+
},
|
|
2979
|
+
"required": [
|
|
2980
|
+
"score",
|
|
2981
|
+
"formatAlignment",
|
|
2982
|
+
"toneAlignment",
|
|
2983
|
+
"reasoning"
|
|
2984
|
+
]
|
|
2985
|
+
},
|
|
2986
|
+
"overallAssessment": {
|
|
2987
|
+
"type": "string"
|
|
2988
|
+
}
|
|
2989
|
+
},
|
|
2990
|
+
"required": [
|
|
2991
|
+
"intentAlignment",
|
|
2992
|
+
"requirementsFulfillment",
|
|
2993
|
+
"completeness",
|
|
2994
|
+
"responseAppropriateness",
|
|
2995
|
+
"overallAssessment"
|
|
2996
|
+
]
|
|
2997
|
+
};
|
|
2519
2998
|
var SCORING_WEIGHTS = {
|
|
2520
2999
|
USER: {
|
|
2521
3000
|
INTENT_ALIGNMENT: 0.4,
|
|
@@ -2710,19 +3189,64 @@ Provide a single, concise sentence explaining why this score was given.
|
|
|
2710
3189
|
};
|
|
2711
3190
|
|
|
2712
3191
|
// src/scorers/llm/trajectory/index.ts
|
|
2713
|
-
var analyzeOutputSchema6 =
|
|
2714
|
-
|
|
2715
|
-
|
|
2716
|
-
|
|
2717
|
-
|
|
2718
|
-
|
|
2719
|
-
|
|
2720
|
-
|
|
2721
|
-
|
|
2722
|
-
|
|
2723
|
-
|
|
2724
|
-
|
|
2725
|
-
}
|
|
3192
|
+
var analyzeOutputSchema6 = {
|
|
3193
|
+
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
3194
|
+
"type": "object",
|
|
3195
|
+
"properties": {
|
|
3196
|
+
"stepEvaluations": {
|
|
3197
|
+
"type": "array",
|
|
3198
|
+
"items": {
|
|
3199
|
+
"type": "object",
|
|
3200
|
+
"properties": {
|
|
3201
|
+
"stepName": {
|
|
3202
|
+
"type": "string",
|
|
3203
|
+
"description": "Name of the step (tool name or action)"
|
|
3204
|
+
},
|
|
3205
|
+
"wasNecessary": {
|
|
3206
|
+
"type": "boolean",
|
|
3207
|
+
"description": "Whether this step was necessary for the task"
|
|
3208
|
+
},
|
|
3209
|
+
"wasInOrder": {
|
|
3210
|
+
"type": "boolean",
|
|
3211
|
+
"description": "Whether this step was in a logical position in the sequence"
|
|
3212
|
+
},
|
|
3213
|
+
"reasoning": {
|
|
3214
|
+
"type": "string",
|
|
3215
|
+
"description": "Brief explanation of the evaluation"
|
|
3216
|
+
}
|
|
3217
|
+
},
|
|
3218
|
+
"required": [
|
|
3219
|
+
"stepName",
|
|
3220
|
+
"wasNecessary",
|
|
3221
|
+
"wasInOrder",
|
|
3222
|
+
"reasoning"
|
|
3223
|
+
]
|
|
3224
|
+
}
|
|
3225
|
+
},
|
|
3226
|
+
"missingSteps": {
|
|
3227
|
+
"description": "Steps that should have been taken but were not",
|
|
3228
|
+
"type": "array",
|
|
3229
|
+
"items": {
|
|
3230
|
+
"type": "string"
|
|
3231
|
+
}
|
|
3232
|
+
},
|
|
3233
|
+
"extraSteps": {
|
|
3234
|
+
"description": "Steps that were unnecessary or redundant",
|
|
3235
|
+
"type": "array",
|
|
3236
|
+
"items": {
|
|
3237
|
+
"type": "string"
|
|
3238
|
+
}
|
|
3239
|
+
},
|
|
3240
|
+
"overallAssessment": {
|
|
3241
|
+
"type": "string",
|
|
3242
|
+
"description": "Brief overall assessment of the trajectory quality"
|
|
3243
|
+
}
|
|
3244
|
+
},
|
|
3245
|
+
"required": [
|
|
3246
|
+
"stepEvaluations",
|
|
3247
|
+
"overallAssessment"
|
|
3248
|
+
]
|
|
3249
|
+
};
|
|
2726
3250
|
function formatStepDetails(step) {
|
|
2727
3251
|
switch (step.stepType) {
|
|
2728
3252
|
case "tool_call":
|