@mastra/evals 1.2.2-alpha.0 → 1.2.3-alpha.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +28 -0
- package/dist/docs/SKILL.md +1 -1
- package/dist/docs/assets/SOURCE_MAP.json +1 -1
- package/dist/docs/references/docs-evals-overview.md +2 -2
- package/dist/docs/references/reference-evals-noise-sensitivity.md +1 -1
- package/dist/scorers/llm/answer-relevancy/index.d.ts.map +1 -1
- package/dist/scorers/llm/answer-similarity/index.d.ts.map +1 -1
- package/dist/scorers/llm/bias/index.d.ts.map +1 -1
- package/dist/scorers/llm/context-precision/index.d.ts.map +1 -1
- package/dist/scorers/llm/context-relevance/index.d.ts.map +1 -1
- package/dist/scorers/llm/faithfulness/index.d.ts.map +1 -1
- package/dist/scorers/llm/hallucination/index.d.ts.map +1 -1
- package/dist/scorers/llm/noise-sensitivity/index.d.ts.map +1 -1
- package/dist/scorers/llm/prompt-alignment/index.d.ts.map +1 -1
- package/dist/scorers/llm/tool-call-accuracy/index.d.ts.map +1 -1
- package/dist/scorers/llm/toxicity/index.d.ts.map +1 -1
- package/dist/scorers/llm/trajectory/index.d.ts.map +1 -1
- package/dist/scorers/prebuilt/index.cjs +656 -132
- package/dist/scorers/prebuilt/index.cjs.map +1 -1
- package/dist/scorers/prebuilt/index.js +656 -132
- package/dist/scorers/prebuilt/index.js.map +1 -1
- package/package.json +9 -10
|
@@ -2,7 +2,6 @@
|
|
|
2
2
|
|
|
3
3
|
var chunk33T2SZZ2_cjs = require('../../chunk-33T2SZZ2.cjs');
|
|
4
4
|
var evals = require('@mastra/core/evals');
|
|
5
|
-
var zod = require('zod');
|
|
6
5
|
var nlp = require('compromise');
|
|
7
6
|
var keyword_extractor = require('keyword-extractor');
|
|
8
7
|
var stringSimilarity = require('string-similarity');
|
|
@@ -219,9 +218,21 @@ var ANSWER_RELEVANCY_AGENT_INSTRUCTIONS = `
|
|
|
219
218
|
5. Empty inputs or error messages should always be marked as "no"
|
|
220
219
|
6. Responses that discuss the type of information being asked show partial relevance
|
|
221
220
|
`;
|
|
222
|
-
var extractOutputSchema =
|
|
223
|
-
|
|
224
|
-
|
|
221
|
+
var extractOutputSchema = {
|
|
222
|
+
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
223
|
+
"type": "object",
|
|
224
|
+
"properties": {
|
|
225
|
+
"statements": {
|
|
226
|
+
"type": "array",
|
|
227
|
+
"items": {
|
|
228
|
+
"type": "string"
|
|
229
|
+
}
|
|
230
|
+
}
|
|
231
|
+
},
|
|
232
|
+
"required": [
|
|
233
|
+
"statements"
|
|
234
|
+
]
|
|
235
|
+
};
|
|
225
236
|
function createAnswerRelevancyScorer({
|
|
226
237
|
model,
|
|
227
238
|
options = DEFAULT_OPTIONS
|
|
@@ -244,7 +255,33 @@ function createAnswerRelevancyScorer({
|
|
|
244
255
|
}
|
|
245
256
|
}).analyze({
|
|
246
257
|
description: "Score the relevance of the statements to the input",
|
|
247
|
-
outputSchema:
|
|
258
|
+
outputSchema: {
|
|
259
|
+
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
260
|
+
"type": "object",
|
|
261
|
+
"properties": {
|
|
262
|
+
"results": {
|
|
263
|
+
"type": "array",
|
|
264
|
+
"items": {
|
|
265
|
+
"type": "object",
|
|
266
|
+
"properties": {
|
|
267
|
+
"result": {
|
|
268
|
+
"type": "string"
|
|
269
|
+
},
|
|
270
|
+
"reason": {
|
|
271
|
+
"type": "string"
|
|
272
|
+
}
|
|
273
|
+
},
|
|
274
|
+
"required": [
|
|
275
|
+
"result",
|
|
276
|
+
"reason"
|
|
277
|
+
]
|
|
278
|
+
}
|
|
279
|
+
}
|
|
280
|
+
},
|
|
281
|
+
"required": [
|
|
282
|
+
"results"
|
|
283
|
+
]
|
|
284
|
+
},
|
|
248
285
|
createPrompt: ({ run, results }) => {
|
|
249
286
|
const input = chunk33T2SZZ2_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
250
287
|
return createScorePrompt(JSON.stringify(input), results.preprocessStepResult?.statements || []);
|
|
@@ -417,28 +454,106 @@ Key Principles:
|
|
|
417
454
|
5. Provide actionable feedback for improving answer accuracy
|
|
418
455
|
6. Be strict but fair - partial credit for partial matches
|
|
419
456
|
`;
|
|
420
|
-
var extractOutputSchema2 =
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
}
|
|
457
|
+
var extractOutputSchema2 = {
|
|
458
|
+
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
459
|
+
"type": "object",
|
|
460
|
+
"properties": {
|
|
461
|
+
"outputUnits": {
|
|
462
|
+
"type": "array",
|
|
463
|
+
"items": {
|
|
464
|
+
"type": "string"
|
|
465
|
+
}
|
|
466
|
+
},
|
|
467
|
+
"groundTruthUnits": {
|
|
468
|
+
"type": "array",
|
|
469
|
+
"items": {
|
|
470
|
+
"type": "string"
|
|
471
|
+
}
|
|
472
|
+
}
|
|
473
|
+
},
|
|
474
|
+
"required": [
|
|
475
|
+
"outputUnits",
|
|
476
|
+
"groundTruthUnits"
|
|
477
|
+
]
|
|
478
|
+
};
|
|
479
|
+
var analyzeOutputSchema = {
|
|
480
|
+
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
481
|
+
"type": "object",
|
|
482
|
+
"properties": {
|
|
483
|
+
"matches": {
|
|
484
|
+
"type": "array",
|
|
485
|
+
"items": {
|
|
486
|
+
"type": "object",
|
|
487
|
+
"properties": {
|
|
488
|
+
"groundTruthUnit": {
|
|
489
|
+
"type": "string"
|
|
490
|
+
},
|
|
491
|
+
"outputUnit": {
|
|
492
|
+
"anyOf": [
|
|
493
|
+
{
|
|
494
|
+
"type": "string"
|
|
495
|
+
},
|
|
496
|
+
{
|
|
497
|
+
"type": "null"
|
|
498
|
+
}
|
|
499
|
+
]
|
|
500
|
+
},
|
|
501
|
+
"matchType": {
|
|
502
|
+
"type": "string",
|
|
503
|
+
"enum": [
|
|
504
|
+
"exact",
|
|
505
|
+
"semantic",
|
|
506
|
+
"partial",
|
|
507
|
+
"missing"
|
|
508
|
+
]
|
|
509
|
+
},
|
|
510
|
+
"explanation": {
|
|
511
|
+
"type": "string"
|
|
512
|
+
}
|
|
513
|
+
},
|
|
514
|
+
"required": [
|
|
515
|
+
"groundTruthUnit",
|
|
516
|
+
"outputUnit",
|
|
517
|
+
"matchType",
|
|
518
|
+
"explanation"
|
|
519
|
+
]
|
|
520
|
+
}
|
|
521
|
+
},
|
|
522
|
+
"extraInOutput": {
|
|
523
|
+
"type": "array",
|
|
524
|
+
"items": {
|
|
525
|
+
"type": "string"
|
|
526
|
+
}
|
|
527
|
+
},
|
|
528
|
+
"contradictions": {
|
|
529
|
+
"type": "array",
|
|
530
|
+
"items": {
|
|
531
|
+
"type": "object",
|
|
532
|
+
"properties": {
|
|
533
|
+
"outputUnit": {
|
|
534
|
+
"type": "string"
|
|
535
|
+
},
|
|
536
|
+
"groundTruthUnit": {
|
|
537
|
+
"type": "string"
|
|
538
|
+
},
|
|
539
|
+
"explanation": {
|
|
540
|
+
"type": "string"
|
|
541
|
+
}
|
|
542
|
+
},
|
|
543
|
+
"required": [
|
|
544
|
+
"outputUnit",
|
|
545
|
+
"groundTruthUnit",
|
|
546
|
+
"explanation"
|
|
547
|
+
]
|
|
548
|
+
}
|
|
549
|
+
}
|
|
550
|
+
},
|
|
551
|
+
"required": [
|
|
552
|
+
"matches",
|
|
553
|
+
"extraInOutput",
|
|
554
|
+
"contradictions"
|
|
555
|
+
]
|
|
556
|
+
};
|
|
442
557
|
function createAnswerSimilarityScorer({
|
|
443
558
|
model,
|
|
444
559
|
options = ANSWER_SIMILARITY_DEFAULT_OPTIONS
|
|
@@ -717,16 +832,54 @@ function createFaithfulnessScorer({
|
|
|
717
832
|
type: "agent"
|
|
718
833
|
}).preprocess({
|
|
719
834
|
description: "Extract relevant statements from the LLM output",
|
|
720
|
-
outputSchema:
|
|
721
|
-
|
|
722
|
-
|
|
835
|
+
outputSchema: {
|
|
836
|
+
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
837
|
+
"type": "object",
|
|
838
|
+
"properties": {
|
|
839
|
+
"claims": {
|
|
840
|
+
"type": "array",
|
|
841
|
+
"items": {
|
|
842
|
+
"type": "string"
|
|
843
|
+
}
|
|
844
|
+
}
|
|
845
|
+
},
|
|
846
|
+
"required": [
|
|
847
|
+
"claims"
|
|
848
|
+
]
|
|
849
|
+
},
|
|
723
850
|
createPrompt: ({ run }) => {
|
|
724
851
|
const prompt = createFaithfulnessExtractPrompt({ output: chunk33T2SZZ2_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
|
|
725
852
|
return prompt;
|
|
726
853
|
}
|
|
727
854
|
}).analyze({
|
|
728
855
|
description: "Score the relevance of the statements to the input",
|
|
729
|
-
outputSchema:
|
|
856
|
+
outputSchema: {
|
|
857
|
+
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
858
|
+
"type": "object",
|
|
859
|
+
"properties": {
|
|
860
|
+
"verdicts": {
|
|
861
|
+
"type": "array",
|
|
862
|
+
"items": {
|
|
863
|
+
"type": "object",
|
|
864
|
+
"properties": {
|
|
865
|
+
"verdict": {
|
|
866
|
+
"type": "string"
|
|
867
|
+
},
|
|
868
|
+
"reason": {
|
|
869
|
+
"type": "string"
|
|
870
|
+
}
|
|
871
|
+
},
|
|
872
|
+
"required": [
|
|
873
|
+
"verdict",
|
|
874
|
+
"reason"
|
|
875
|
+
]
|
|
876
|
+
}
|
|
877
|
+
}
|
|
878
|
+
},
|
|
879
|
+
"required": [
|
|
880
|
+
"verdicts"
|
|
881
|
+
]
|
|
882
|
+
},
|
|
730
883
|
createPrompt: ({ results, run }) => {
|
|
731
884
|
const context = options?.context ?? getToolInvocationContext(run.output);
|
|
732
885
|
const prompt = createFaithfulnessAnalyzePrompt({
|
|
@@ -878,13 +1031,51 @@ function createBiasScorer({ model, options }) {
|
|
|
878
1031
|
type: "agent"
|
|
879
1032
|
}).preprocess({
|
|
880
1033
|
description: "Extract relevant statements from the LLM output",
|
|
881
|
-
outputSchema:
|
|
882
|
-
|
|
883
|
-
|
|
1034
|
+
outputSchema: {
|
|
1035
|
+
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
1036
|
+
"type": "object",
|
|
1037
|
+
"properties": {
|
|
1038
|
+
"opinions": {
|
|
1039
|
+
"type": "array",
|
|
1040
|
+
"items": {
|
|
1041
|
+
"type": "string"
|
|
1042
|
+
}
|
|
1043
|
+
}
|
|
1044
|
+
},
|
|
1045
|
+
"required": [
|
|
1046
|
+
"opinions"
|
|
1047
|
+
]
|
|
1048
|
+
},
|
|
884
1049
|
createPrompt: ({ run }) => createBiasExtractPrompt({ output: chunk33T2SZZ2_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" })
|
|
885
1050
|
}).analyze({
|
|
886
1051
|
description: "Score the relevance of the statements to the input",
|
|
887
|
-
outputSchema:
|
|
1052
|
+
outputSchema: {
|
|
1053
|
+
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
1054
|
+
"type": "object",
|
|
1055
|
+
"properties": {
|
|
1056
|
+
"results": {
|
|
1057
|
+
"type": "array",
|
|
1058
|
+
"items": {
|
|
1059
|
+
"type": "object",
|
|
1060
|
+
"properties": {
|
|
1061
|
+
"result": {
|
|
1062
|
+
"type": "string"
|
|
1063
|
+
},
|
|
1064
|
+
"reason": {
|
|
1065
|
+
"type": "string"
|
|
1066
|
+
}
|
|
1067
|
+
},
|
|
1068
|
+
"required": [
|
|
1069
|
+
"result",
|
|
1070
|
+
"reason"
|
|
1071
|
+
]
|
|
1072
|
+
}
|
|
1073
|
+
}
|
|
1074
|
+
},
|
|
1075
|
+
"required": [
|
|
1076
|
+
"results"
|
|
1077
|
+
]
|
|
1078
|
+
},
|
|
888
1079
|
createPrompt: ({ run, results }) => {
|
|
889
1080
|
const prompt = createBiasAnalyzePrompt({
|
|
890
1081
|
output: chunk33T2SZZ2_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
|
|
@@ -1113,18 +1304,58 @@ function createHallucinationScorer({
|
|
|
1113
1304
|
type: "agent"
|
|
1114
1305
|
}).preprocess({
|
|
1115
1306
|
description: "Extract all claims from the given output",
|
|
1116
|
-
outputSchema:
|
|
1117
|
-
|
|
1118
|
-
|
|
1307
|
+
outputSchema: {
|
|
1308
|
+
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
1309
|
+
"type": "object",
|
|
1310
|
+
"properties": {
|
|
1311
|
+
"claims": {
|
|
1312
|
+
"type": "array",
|
|
1313
|
+
"items": {
|
|
1314
|
+
"type": "string"
|
|
1315
|
+
}
|
|
1316
|
+
}
|
|
1317
|
+
},
|
|
1318
|
+
"required": [
|
|
1319
|
+
"claims"
|
|
1320
|
+
]
|
|
1321
|
+
},
|
|
1119
1322
|
createPrompt: ({ run }) => {
|
|
1120
1323
|
const prompt = createHallucinationExtractPrompt({ output: chunk33T2SZZ2_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
|
|
1121
1324
|
return prompt;
|
|
1122
1325
|
}
|
|
1123
1326
|
}).analyze({
|
|
1124
1327
|
description: "Score the relevance of the statements to the input",
|
|
1125
|
-
outputSchema:
|
|
1126
|
-
|
|
1127
|
-
|
|
1328
|
+
outputSchema: {
|
|
1329
|
+
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
1330
|
+
"type": "object",
|
|
1331
|
+
"properties": {
|
|
1332
|
+
"verdicts": {
|
|
1333
|
+
"type": "array",
|
|
1334
|
+
"items": {
|
|
1335
|
+
"type": "object",
|
|
1336
|
+
"properties": {
|
|
1337
|
+
"statement": {
|
|
1338
|
+
"type": "string"
|
|
1339
|
+
},
|
|
1340
|
+
"verdict": {
|
|
1341
|
+
"type": "string"
|
|
1342
|
+
},
|
|
1343
|
+
"reason": {
|
|
1344
|
+
"type": "string"
|
|
1345
|
+
}
|
|
1346
|
+
},
|
|
1347
|
+
"required": [
|
|
1348
|
+
"statement",
|
|
1349
|
+
"verdict",
|
|
1350
|
+
"reason"
|
|
1351
|
+
]
|
|
1352
|
+
}
|
|
1353
|
+
}
|
|
1354
|
+
},
|
|
1355
|
+
"required": [
|
|
1356
|
+
"verdicts"
|
|
1357
|
+
]
|
|
1358
|
+
},
|
|
1128
1359
|
createPrompt: async ({ run, results }) => {
|
|
1129
1360
|
let context;
|
|
1130
1361
|
if (options?.getContext) {
|
|
@@ -1268,7 +1499,33 @@ function createToxicityScorer({
|
|
|
1268
1499
|
type: "agent"
|
|
1269
1500
|
}).analyze({
|
|
1270
1501
|
description: "Score the relevance of the statements to the input",
|
|
1271
|
-
outputSchema:
|
|
1502
|
+
outputSchema: {
|
|
1503
|
+
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
1504
|
+
"type": "object",
|
|
1505
|
+
"properties": {
|
|
1506
|
+
"verdicts": {
|
|
1507
|
+
"type": "array",
|
|
1508
|
+
"items": {
|
|
1509
|
+
"type": "object",
|
|
1510
|
+
"properties": {
|
|
1511
|
+
"verdict": {
|
|
1512
|
+
"type": "string"
|
|
1513
|
+
},
|
|
1514
|
+
"reason": {
|
|
1515
|
+
"type": "string"
|
|
1516
|
+
}
|
|
1517
|
+
},
|
|
1518
|
+
"required": [
|
|
1519
|
+
"verdict",
|
|
1520
|
+
"reason"
|
|
1521
|
+
]
|
|
1522
|
+
}
|
|
1523
|
+
}
|
|
1524
|
+
},
|
|
1525
|
+
"required": [
|
|
1526
|
+
"verdicts"
|
|
1527
|
+
]
|
|
1528
|
+
},
|
|
1272
1529
|
createPrompt: ({ run }) => {
|
|
1273
1530
|
const prompt = createToxicityAnalyzePrompt({
|
|
1274
1531
|
input: chunk33T2SZZ2_cjs.getUserMessageFromRunInput(run.input) ?? "",
|
|
@@ -1395,16 +1652,43 @@ Provide a single, concise sentence explaining why this score was given.
|
|
|
1395
1652
|
};
|
|
1396
1653
|
|
|
1397
1654
|
// src/scorers/llm/tool-call-accuracy/index.ts
|
|
1398
|
-
var analyzeOutputSchema2 =
|
|
1399
|
-
|
|
1400
|
-
|
|
1401
|
-
|
|
1402
|
-
|
|
1403
|
-
|
|
1404
|
-
|
|
1405
|
-
|
|
1406
|
-
|
|
1407
|
-
|
|
1655
|
+
var analyzeOutputSchema2 = {
|
|
1656
|
+
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
1657
|
+
"type": "object",
|
|
1658
|
+
"properties": {
|
|
1659
|
+
"evaluations": {
|
|
1660
|
+
"type": "array",
|
|
1661
|
+
"items": {
|
|
1662
|
+
"type": "object",
|
|
1663
|
+
"properties": {
|
|
1664
|
+
"toolCalled": {
|
|
1665
|
+
"type": "string"
|
|
1666
|
+
},
|
|
1667
|
+
"wasAppropriate": {
|
|
1668
|
+
"type": "boolean"
|
|
1669
|
+
},
|
|
1670
|
+
"reasoning": {
|
|
1671
|
+
"type": "string"
|
|
1672
|
+
}
|
|
1673
|
+
},
|
|
1674
|
+
"required": [
|
|
1675
|
+
"toolCalled",
|
|
1676
|
+
"wasAppropriate",
|
|
1677
|
+
"reasoning"
|
|
1678
|
+
]
|
|
1679
|
+
}
|
|
1680
|
+
},
|
|
1681
|
+
"missingTools": {
|
|
1682
|
+
"type": "array",
|
|
1683
|
+
"items": {
|
|
1684
|
+
"type": "string"
|
|
1685
|
+
}
|
|
1686
|
+
}
|
|
1687
|
+
},
|
|
1688
|
+
"required": [
|
|
1689
|
+
"evaluations"
|
|
1690
|
+
]
|
|
1691
|
+
};
|
|
1408
1692
|
function createToolCallAccuracyScorerLLM({ model, availableTools }) {
|
|
1409
1693
|
const toolDefinitions = availableTools.map((tool) => `${tool.id}: ${tool.description}`).join("\n");
|
|
1410
1694
|
return evals.createScorer({
|
|
@@ -1615,19 +1899,62 @@ Example responses:
|
|
|
1615
1899
|
}
|
|
1616
1900
|
|
|
1617
1901
|
// src/scorers/llm/context-relevance/index.ts
|
|
1618
|
-
var analyzeOutputSchema3 =
|
|
1619
|
-
|
|
1620
|
-
|
|
1621
|
-
|
|
1622
|
-
|
|
1623
|
-
|
|
1624
|
-
|
|
1625
|
-
|
|
1626
|
-
|
|
1627
|
-
|
|
1628
|
-
|
|
1629
|
-
|
|
1630
|
-
|
|
1902
|
+
var analyzeOutputSchema3 = {
|
|
1903
|
+
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
1904
|
+
"type": "object",
|
|
1905
|
+
"properties": {
|
|
1906
|
+
"evaluations": {
|
|
1907
|
+
"type": "array",
|
|
1908
|
+
"items": {
|
|
1909
|
+
"type": "object",
|
|
1910
|
+
"properties": {
|
|
1911
|
+
"context_index": {
|
|
1912
|
+
"type": "number"
|
|
1913
|
+
},
|
|
1914
|
+
"contextPiece": {
|
|
1915
|
+
"type": "string"
|
|
1916
|
+
},
|
|
1917
|
+
"relevanceLevel": {
|
|
1918
|
+
"type": "string",
|
|
1919
|
+
"enum": [
|
|
1920
|
+
"high",
|
|
1921
|
+
"medium",
|
|
1922
|
+
"low",
|
|
1923
|
+
"none"
|
|
1924
|
+
]
|
|
1925
|
+
},
|
|
1926
|
+
"wasUsed": {
|
|
1927
|
+
"type": "boolean"
|
|
1928
|
+
},
|
|
1929
|
+
"reasoning": {
|
|
1930
|
+
"type": "string"
|
|
1931
|
+
}
|
|
1932
|
+
},
|
|
1933
|
+
"required": [
|
|
1934
|
+
"context_index",
|
|
1935
|
+
"contextPiece",
|
|
1936
|
+
"relevanceLevel",
|
|
1937
|
+
"wasUsed",
|
|
1938
|
+
"reasoning"
|
|
1939
|
+
]
|
|
1940
|
+
}
|
|
1941
|
+
},
|
|
1942
|
+
"missingContext": {
|
|
1943
|
+
"default": [],
|
|
1944
|
+
"type": "array",
|
|
1945
|
+
"items": {
|
|
1946
|
+
"type": "string"
|
|
1947
|
+
}
|
|
1948
|
+
},
|
|
1949
|
+
"overallAssessment": {
|
|
1950
|
+
"type": "string"
|
|
1951
|
+
}
|
|
1952
|
+
},
|
|
1953
|
+
"required": [
|
|
1954
|
+
"evaluations",
|
|
1955
|
+
"overallAssessment"
|
|
1956
|
+
]
|
|
1957
|
+
};
|
|
1631
1958
|
var DEFAULT_PENALTIES = {
|
|
1632
1959
|
UNUSED_HIGH_RELEVANCE_CONTEXT: 0.1,
|
|
1633
1960
|
// 10% penalty per unused high-relevance context
|
|
@@ -1861,15 +2188,37 @@ Example responses:
|
|
|
1861
2188
|
}
|
|
1862
2189
|
|
|
1863
2190
|
// src/scorers/llm/context-precision/index.ts
|
|
1864
|
-
var contextRelevanceOutputSchema =
|
|
1865
|
-
|
|
1866
|
-
|
|
1867
|
-
|
|
1868
|
-
|
|
1869
|
-
|
|
1870
|
-
|
|
1871
|
-
|
|
1872
|
-
|
|
2191
|
+
var contextRelevanceOutputSchema = {
|
|
2192
|
+
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
2193
|
+
"type": "object",
|
|
2194
|
+
"properties": {
|
|
2195
|
+
"verdicts": {
|
|
2196
|
+
"type": "array",
|
|
2197
|
+
"items": {
|
|
2198
|
+
"type": "object",
|
|
2199
|
+
"properties": {
|
|
2200
|
+
"context_index": {
|
|
2201
|
+
"type": "number"
|
|
2202
|
+
},
|
|
2203
|
+
"verdict": {
|
|
2204
|
+
"type": "string"
|
|
2205
|
+
},
|
|
2206
|
+
"reason": {
|
|
2207
|
+
"type": "string"
|
|
2208
|
+
}
|
|
2209
|
+
},
|
|
2210
|
+
"required": [
|
|
2211
|
+
"context_index",
|
|
2212
|
+
"verdict",
|
|
2213
|
+
"reason"
|
|
2214
|
+
]
|
|
2215
|
+
}
|
|
2216
|
+
}
|
|
2217
|
+
},
|
|
2218
|
+
"required": [
|
|
2219
|
+
"verdicts"
|
|
2220
|
+
]
|
|
2221
|
+
};
|
|
1873
2222
|
var getContext2 = ({
|
|
1874
2223
|
input,
|
|
1875
2224
|
output,
|
|
@@ -2148,20 +2497,63 @@ Example responses:
|
|
|
2148
2497
|
}
|
|
2149
2498
|
|
|
2150
2499
|
// src/scorers/llm/noise-sensitivity/index.ts
|
|
2151
|
-
var
|
|
2152
|
-
|
|
2153
|
-
|
|
2154
|
-
|
|
2155
|
-
|
|
2156
|
-
|
|
2157
|
-
|
|
2158
|
-
|
|
2159
|
-
|
|
2160
|
-
|
|
2161
|
-
|
|
2162
|
-
|
|
2163
|
-
|
|
2164
|
-
|
|
2500
|
+
var analyzeOutputSchema4 = {
|
|
2501
|
+
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
2502
|
+
"type": "object",
|
|
2503
|
+
"properties": {
|
|
2504
|
+
"dimensions": {
|
|
2505
|
+
"type": "array",
|
|
2506
|
+
"items": {
|
|
2507
|
+
"type": "object",
|
|
2508
|
+
"properties": {
|
|
2509
|
+
"dimension": {
|
|
2510
|
+
"type": "string"
|
|
2511
|
+
},
|
|
2512
|
+
"impactLevel": {
|
|
2513
|
+
"type": "string",
|
|
2514
|
+
"enum": [
|
|
2515
|
+
"none",
|
|
2516
|
+
"minimal",
|
|
2517
|
+
"moderate",
|
|
2518
|
+
"significant",
|
|
2519
|
+
"severe"
|
|
2520
|
+
]
|
|
2521
|
+
},
|
|
2522
|
+
"specificChanges": {
|
|
2523
|
+
"type": "string"
|
|
2524
|
+
},
|
|
2525
|
+
"noiseInfluence": {
|
|
2526
|
+
"type": "string"
|
|
2527
|
+
}
|
|
2528
|
+
},
|
|
2529
|
+
"required": [
|
|
2530
|
+
"dimension",
|
|
2531
|
+
"impactLevel",
|
|
2532
|
+
"specificChanges",
|
|
2533
|
+
"noiseInfluence"
|
|
2534
|
+
]
|
|
2535
|
+
}
|
|
2536
|
+
},
|
|
2537
|
+
"overallAssessment": {
|
|
2538
|
+
"type": "string"
|
|
2539
|
+
},
|
|
2540
|
+
"majorIssues": {
|
|
2541
|
+
"default": [],
|
|
2542
|
+
"type": "array",
|
|
2543
|
+
"items": {
|
|
2544
|
+
"type": "string"
|
|
2545
|
+
}
|
|
2546
|
+
},
|
|
2547
|
+
"robustnessScore": {
|
|
2548
|
+
"type": "number"
|
|
2549
|
+
}
|
|
2550
|
+
},
|
|
2551
|
+
"required": [
|
|
2552
|
+
"dimensions",
|
|
2553
|
+
"overallAssessment",
|
|
2554
|
+
"robustnessScore"
|
|
2555
|
+
]
|
|
2556
|
+
};
|
|
2165
2557
|
var DEFAULT_IMPACT_WEIGHTS = {
|
|
2166
2558
|
none: 1,
|
|
2167
2559
|
minimal: 0.85,
|
|
@@ -2494,37 +2886,124 @@ Example responses:
|
|
|
2494
2886
|
}
|
|
2495
2887
|
|
|
2496
2888
|
// src/scorers/llm/prompt-alignment/index.ts
|
|
2497
|
-
var
|
|
2498
|
-
|
|
2499
|
-
|
|
2500
|
-
|
|
2501
|
-
|
|
2502
|
-
|
|
2503
|
-
|
|
2504
|
-
|
|
2505
|
-
|
|
2506
|
-
|
|
2507
|
-
|
|
2508
|
-
|
|
2509
|
-
|
|
2510
|
-
|
|
2511
|
-
|
|
2512
|
-
|
|
2513
|
-
|
|
2514
|
-
|
|
2515
|
-
|
|
2516
|
-
|
|
2517
|
-
|
|
2518
|
-
|
|
2519
|
-
|
|
2520
|
-
|
|
2521
|
-
|
|
2522
|
-
|
|
2523
|
-
|
|
2524
|
-
|
|
2525
|
-
|
|
2526
|
-
|
|
2527
|
-
|
|
2889
|
+
var analyzeOutputSchema5 = {
|
|
2890
|
+
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
2891
|
+
"type": "object",
|
|
2892
|
+
"properties": {
|
|
2893
|
+
"intentAlignment": {
|
|
2894
|
+
"type": "object",
|
|
2895
|
+
"properties": {
|
|
2896
|
+
"score": {
|
|
2897
|
+
"type": "number"
|
|
2898
|
+
},
|
|
2899
|
+
"primaryIntent": {
|
|
2900
|
+
"type": "string"
|
|
2901
|
+
},
|
|
2902
|
+
"isAddressed": {
|
|
2903
|
+
"type": "boolean"
|
|
2904
|
+
},
|
|
2905
|
+
"reasoning": {
|
|
2906
|
+
"type": "string"
|
|
2907
|
+
}
|
|
2908
|
+
},
|
|
2909
|
+
"required": [
|
|
2910
|
+
"score",
|
|
2911
|
+
"primaryIntent",
|
|
2912
|
+
"isAddressed",
|
|
2913
|
+
"reasoning"
|
|
2914
|
+
]
|
|
2915
|
+
},
|
|
2916
|
+
"requirementsFulfillment": {
|
|
2917
|
+
"type": "object",
|
|
2918
|
+
"properties": {
|
|
2919
|
+
"requirements": {
|
|
2920
|
+
"type": "array",
|
|
2921
|
+
"items": {
|
|
2922
|
+
"type": "object",
|
|
2923
|
+
"properties": {
|
|
2924
|
+
"requirement": {
|
|
2925
|
+
"type": "string"
|
|
2926
|
+
},
|
|
2927
|
+
"isFulfilled": {
|
|
2928
|
+
"type": "boolean"
|
|
2929
|
+
},
|
|
2930
|
+
"reasoning": {
|
|
2931
|
+
"type": "string"
|
|
2932
|
+
}
|
|
2933
|
+
},
|
|
2934
|
+
"required": [
|
|
2935
|
+
"requirement",
|
|
2936
|
+
"isFulfilled",
|
|
2937
|
+
"reasoning"
|
|
2938
|
+
]
|
|
2939
|
+
}
|
|
2940
|
+
},
|
|
2941
|
+
"overallScore": {
|
|
2942
|
+
"type": "number"
|
|
2943
|
+
}
|
|
2944
|
+
},
|
|
2945
|
+
"required": [
|
|
2946
|
+
"requirements",
|
|
2947
|
+
"overallScore"
|
|
2948
|
+
]
|
|
2949
|
+
},
|
|
2950
|
+
"completeness": {
|
|
2951
|
+
"type": "object",
|
|
2952
|
+
"properties": {
|
|
2953
|
+
"score": {
|
|
2954
|
+
"type": "number"
|
|
2955
|
+
},
|
|
2956
|
+
"missingElements": {
|
|
2957
|
+
"type": "array",
|
|
2958
|
+
"items": {
|
|
2959
|
+
"type": "string"
|
|
2960
|
+
}
|
|
2961
|
+
},
|
|
2962
|
+
"reasoning": {
|
|
2963
|
+
"type": "string"
|
|
2964
|
+
}
|
|
2965
|
+
},
|
|
2966
|
+
"required": [
|
|
2967
|
+
"score",
|
|
2968
|
+
"missingElements",
|
|
2969
|
+
"reasoning"
|
|
2970
|
+
]
|
|
2971
|
+
},
|
|
2972
|
+
"responseAppropriateness": {
|
|
2973
|
+
"type": "object",
|
|
2974
|
+
"properties": {
|
|
2975
|
+
"score": {
|
|
2976
|
+
"type": "number"
|
|
2977
|
+
},
|
|
2978
|
+
"formatAlignment": {
|
|
2979
|
+
"type": "boolean"
|
|
2980
|
+
},
|
|
2981
|
+
"toneAlignment": {
|
|
2982
|
+
"type": "boolean"
|
|
2983
|
+
},
|
|
2984
|
+
"reasoning": {
|
|
2985
|
+
"type": "string"
|
|
2986
|
+
}
|
|
2987
|
+
},
|
|
2988
|
+
"required": [
|
|
2989
|
+
"score",
|
|
2990
|
+
"formatAlignment",
|
|
2991
|
+
"toneAlignment",
|
|
2992
|
+
"reasoning"
|
|
2993
|
+
]
|
|
2994
|
+
},
|
|
2995
|
+
"overallAssessment": {
|
|
2996
|
+
"type": "string"
|
|
2997
|
+
}
|
|
2998
|
+
},
|
|
2999
|
+
"required": [
|
|
3000
|
+
"intentAlignment",
|
|
3001
|
+
"requirementsFulfillment",
|
|
3002
|
+
"completeness",
|
|
3003
|
+
"responseAppropriateness",
|
|
3004
|
+
"overallAssessment"
|
|
3005
|
+
]
|
|
3006
|
+
};
|
|
2528
3007
|
var SCORING_WEIGHTS = {
|
|
2529
3008
|
USER: {
|
|
2530
3009
|
INTENT_ALIGNMENT: 0.4,
|
|
@@ -2719,19 +3198,64 @@ Provide a single, concise sentence explaining why this score was given.
|
|
|
2719
3198
|
};
|
|
2720
3199
|
|
|
2721
3200
|
// src/scorers/llm/trajectory/index.ts
|
|
2722
|
-
var analyzeOutputSchema6 =
|
|
2723
|
-
|
|
2724
|
-
|
|
2725
|
-
|
|
2726
|
-
|
|
2727
|
-
|
|
2728
|
-
|
|
2729
|
-
|
|
2730
|
-
|
|
2731
|
-
|
|
2732
|
-
|
|
2733
|
-
|
|
2734
|
-
}
|
|
3201
|
+
var analyzeOutputSchema6 = {
|
|
3202
|
+
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
3203
|
+
"type": "object",
|
|
3204
|
+
"properties": {
|
|
3205
|
+
"stepEvaluations": {
|
|
3206
|
+
"type": "array",
|
|
3207
|
+
"items": {
|
|
3208
|
+
"type": "object",
|
|
3209
|
+
"properties": {
|
|
3210
|
+
"stepName": {
|
|
3211
|
+
"type": "string",
|
|
3212
|
+
"description": "Name of the step (tool name or action)"
|
|
3213
|
+
},
|
|
3214
|
+
"wasNecessary": {
|
|
3215
|
+
"type": "boolean",
|
|
3216
|
+
"description": "Whether this step was necessary for the task"
|
|
3217
|
+
},
|
|
3218
|
+
"wasInOrder": {
|
|
3219
|
+
"type": "boolean",
|
|
3220
|
+
"description": "Whether this step was in a logical position in the sequence"
|
|
3221
|
+
},
|
|
3222
|
+
"reasoning": {
|
|
3223
|
+
"type": "string",
|
|
3224
|
+
"description": "Brief explanation of the evaluation"
|
|
3225
|
+
}
|
|
3226
|
+
},
|
|
3227
|
+
"required": [
|
|
3228
|
+
"stepName",
|
|
3229
|
+
"wasNecessary",
|
|
3230
|
+
"wasInOrder",
|
|
3231
|
+
"reasoning"
|
|
3232
|
+
]
|
|
3233
|
+
}
|
|
3234
|
+
},
|
|
3235
|
+
"missingSteps": {
|
|
3236
|
+
"description": "Steps that should have been taken but were not",
|
|
3237
|
+
"type": "array",
|
|
3238
|
+
"items": {
|
|
3239
|
+
"type": "string"
|
|
3240
|
+
}
|
|
3241
|
+
},
|
|
3242
|
+
"extraSteps": {
|
|
3243
|
+
"description": "Steps that were unnecessary or redundant",
|
|
3244
|
+
"type": "array",
|
|
3245
|
+
"items": {
|
|
3246
|
+
"type": "string"
|
|
3247
|
+
}
|
|
3248
|
+
},
|
|
3249
|
+
"overallAssessment": {
|
|
3250
|
+
"type": "string",
|
|
3251
|
+
"description": "Brief overall assessment of the trajectory quality"
|
|
3252
|
+
}
|
|
3253
|
+
},
|
|
3254
|
+
"required": [
|
|
3255
|
+
"stepEvaluations",
|
|
3256
|
+
"overallAssessment"
|
|
3257
|
+
]
|
|
3258
|
+
};
|
|
2735
3259
|
function formatStepDetails(step) {
|
|
2736
3260
|
switch (step.stepType) {
|
|
2737
3261
|
case "tool_call":
|