@mastra/evals 1.2.2 → 1.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,5 @@
1
1
  import { getAssistantMessageFromRunOutput, getUserMessageFromRunInput, roundToTwoDecimals, extractToolCalls, getCombinedSystemPrompt, getTextContentFromMastraDBMessage, compareTrajectories, checkTrajectoryEfficiency, checkTrajectoryBlacklist, analyzeToolFailures, isScorerRunInputForAgent, isScorerRunOutputForAgent } from '../../chunk-ZRHCSFKL.js';
2
2
  import { createScorer } from '@mastra/core/evals';
3
- import { z } from 'zod';
4
3
  import nlp from 'compromise';
5
4
  import keyword_extractor from 'keyword-extractor';
6
5
  import stringSimilarity from 'string-similarity';
@@ -210,9 +209,21 @@ var ANSWER_RELEVANCY_AGENT_INSTRUCTIONS = `
210
209
  5. Empty inputs or error messages should always be marked as "no"
211
210
  6. Responses that discuss the type of information being asked show partial relevance
212
211
  `;
213
- var extractOutputSchema = z.object({
214
- statements: z.array(z.string())
215
- });
212
+ var extractOutputSchema = {
213
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
214
+ "type": "object",
215
+ "properties": {
216
+ "statements": {
217
+ "type": "array",
218
+ "items": {
219
+ "type": "string"
220
+ }
221
+ }
222
+ },
223
+ "required": [
224
+ "statements"
225
+ ]
226
+ };
216
227
  function createAnswerRelevancyScorer({
217
228
  model,
218
229
  options = DEFAULT_OPTIONS
@@ -235,7 +246,33 @@ function createAnswerRelevancyScorer({
235
246
  }
236
247
  }).analyze({
237
248
  description: "Score the relevance of the statements to the input",
238
- outputSchema: z.object({ results: z.array(z.object({ result: z.string(), reason: z.string() })) }),
249
+ outputSchema: {
250
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
251
+ "type": "object",
252
+ "properties": {
253
+ "results": {
254
+ "type": "array",
255
+ "items": {
256
+ "type": "object",
257
+ "properties": {
258
+ "result": {
259
+ "type": "string"
260
+ },
261
+ "reason": {
262
+ "type": "string"
263
+ }
264
+ },
265
+ "required": [
266
+ "result",
267
+ "reason"
268
+ ]
269
+ }
270
+ }
271
+ },
272
+ "required": [
273
+ "results"
274
+ ]
275
+ },
239
276
  createPrompt: ({ run, results }) => {
240
277
  const input = getUserMessageFromRunInput(run.input) ?? "";
241
278
  return createScorePrompt(JSON.stringify(input), results.preprocessStepResult?.statements || []);
@@ -408,28 +445,106 @@ Key Principles:
408
445
  5. Provide actionable feedback for improving answer accuracy
409
446
  6. Be strict but fair - partial credit for partial matches
410
447
  `;
411
- var extractOutputSchema2 = z.object({
412
- outputUnits: z.array(z.string()),
413
- groundTruthUnits: z.array(z.string())
414
- });
415
- var analyzeOutputSchema = z.object({
416
- matches: z.array(
417
- z.object({
418
- groundTruthUnit: z.string(),
419
- outputUnit: z.string().nullable(),
420
- matchType: z.enum(["exact", "semantic", "partial", "missing"]),
421
- explanation: z.string()
422
- })
423
- ),
424
- extraInOutput: z.array(z.string()),
425
- contradictions: z.array(
426
- z.object({
427
- outputUnit: z.string(),
428
- groundTruthUnit: z.string(),
429
- explanation: z.string()
430
- })
431
- )
432
- });
448
+ var extractOutputSchema2 = {
449
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
450
+ "type": "object",
451
+ "properties": {
452
+ "outputUnits": {
453
+ "type": "array",
454
+ "items": {
455
+ "type": "string"
456
+ }
457
+ },
458
+ "groundTruthUnits": {
459
+ "type": "array",
460
+ "items": {
461
+ "type": "string"
462
+ }
463
+ }
464
+ },
465
+ "required": [
466
+ "outputUnits",
467
+ "groundTruthUnits"
468
+ ]
469
+ };
470
+ var analyzeOutputSchema = {
471
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
472
+ "type": "object",
473
+ "properties": {
474
+ "matches": {
475
+ "type": "array",
476
+ "items": {
477
+ "type": "object",
478
+ "properties": {
479
+ "groundTruthUnit": {
480
+ "type": "string"
481
+ },
482
+ "outputUnit": {
483
+ "anyOf": [
484
+ {
485
+ "type": "string"
486
+ },
487
+ {
488
+ "type": "null"
489
+ }
490
+ ]
491
+ },
492
+ "matchType": {
493
+ "type": "string",
494
+ "enum": [
495
+ "exact",
496
+ "semantic",
497
+ "partial",
498
+ "missing"
499
+ ]
500
+ },
501
+ "explanation": {
502
+ "type": "string"
503
+ }
504
+ },
505
+ "required": [
506
+ "groundTruthUnit",
507
+ "outputUnit",
508
+ "matchType",
509
+ "explanation"
510
+ ]
511
+ }
512
+ },
513
+ "extraInOutput": {
514
+ "type": "array",
515
+ "items": {
516
+ "type": "string"
517
+ }
518
+ },
519
+ "contradictions": {
520
+ "type": "array",
521
+ "items": {
522
+ "type": "object",
523
+ "properties": {
524
+ "outputUnit": {
525
+ "type": "string"
526
+ },
527
+ "groundTruthUnit": {
528
+ "type": "string"
529
+ },
530
+ "explanation": {
531
+ "type": "string"
532
+ }
533
+ },
534
+ "required": [
535
+ "outputUnit",
536
+ "groundTruthUnit",
537
+ "explanation"
538
+ ]
539
+ }
540
+ }
541
+ },
542
+ "required": [
543
+ "matches",
544
+ "extraInOutput",
545
+ "contradictions"
546
+ ]
547
+ };
433
548
  function createAnswerSimilarityScorer({
434
549
  model,
435
550
  options = ANSWER_SIMILARITY_DEFAULT_OPTIONS
@@ -708,16 +823,54 @@ function createFaithfulnessScorer({
708
823
  type: "agent"
709
824
  }).preprocess({
710
825
  description: "Extract relevant statements from the LLM output",
711
- outputSchema: z.object({
712
- claims: z.array(z.string())
713
- }),
826
+ outputSchema: {
827
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
828
+ "type": "object",
829
+ "properties": {
830
+ "claims": {
831
+ "type": "array",
832
+ "items": {
833
+ "type": "string"
834
+ }
835
+ }
836
+ },
837
+ "required": [
838
+ "claims"
839
+ ]
840
+ },
714
841
  createPrompt: ({ run }) => {
715
842
  const prompt = createFaithfulnessExtractPrompt({ output: getAssistantMessageFromRunOutput(run.output) ?? "" });
716
843
  return prompt;
717
844
  }
718
845
  }).analyze({
719
846
  description: "Score the relevance of the statements to the input",
720
- outputSchema: z.object({ verdicts: z.array(z.object({ verdict: z.string(), reason: z.string() })) }),
847
+ outputSchema: {
848
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
849
+ "type": "object",
850
+ "properties": {
851
+ "verdicts": {
852
+ "type": "array",
853
+ "items": {
854
+ "type": "object",
855
+ "properties": {
856
+ "verdict": {
857
+ "type": "string"
858
+ },
859
+ "reason": {
860
+ "type": "string"
861
+ }
862
+ },
863
+ "required": [
864
+ "verdict",
865
+ "reason"
866
+ ]
867
+ }
868
+ }
869
+ },
870
+ "required": [
871
+ "verdicts"
872
+ ]
873
+ },
721
874
  createPrompt: ({ results, run }) => {
722
875
  const context = options?.context ?? getToolInvocationContext(run.output);
723
876
  const prompt = createFaithfulnessAnalyzePrompt({
@@ -869,13 +1022,51 @@ function createBiasScorer({ model, options }) {
869
1022
  type: "agent"
870
1023
  }).preprocess({
871
1024
  description: "Extract relevant statements from the LLM output",
872
- outputSchema: z.object({
873
- opinions: z.array(z.string())
874
- }),
1025
+ outputSchema: {
1026
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
1027
+ "type": "object",
1028
+ "properties": {
1029
+ "opinions": {
1030
+ "type": "array",
1031
+ "items": {
1032
+ "type": "string"
1033
+ }
1034
+ }
1035
+ },
1036
+ "required": [
1037
+ "opinions"
1038
+ ]
1039
+ },
875
1040
  createPrompt: ({ run }) => createBiasExtractPrompt({ output: getAssistantMessageFromRunOutput(run.output) ?? "" })
876
1041
  }).analyze({
877
1042
  description: "Score the relevance of the statements to the input",
878
- outputSchema: z.object({ results: z.array(z.object({ result: z.string(), reason: z.string() })) }),
1043
+ outputSchema: {
1044
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
1045
+ "type": "object",
1046
+ "properties": {
1047
+ "results": {
1048
+ "type": "array",
1049
+ "items": {
1050
+ "type": "object",
1051
+ "properties": {
1052
+ "result": {
1053
+ "type": "string"
1054
+ },
1055
+ "reason": {
1056
+ "type": "string"
1057
+ }
1058
+ },
1059
+ "required": [
1060
+ "result",
1061
+ "reason"
1062
+ ]
1063
+ }
1064
+ }
1065
+ },
1066
+ "required": [
1067
+ "results"
1068
+ ]
1069
+ },
879
1070
  createPrompt: ({ run, results }) => {
880
1071
  const prompt = createBiasAnalyzePrompt({
881
1072
  output: getAssistantMessageFromRunOutput(run.output) ?? "",
@@ -1104,18 +1295,58 @@ function createHallucinationScorer({
1104
1295
  type: "agent"
1105
1296
  }).preprocess({
1106
1297
  description: "Extract all claims from the given output",
1107
- outputSchema: z.object({
1108
- claims: z.array(z.string())
1109
- }),
1298
+ outputSchema: {
1299
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
1300
+ "type": "object",
1301
+ "properties": {
1302
+ "claims": {
1303
+ "type": "array",
1304
+ "items": {
1305
+ "type": "string"
1306
+ }
1307
+ }
1308
+ },
1309
+ "required": [
1310
+ "claims"
1311
+ ]
1312
+ },
1110
1313
  createPrompt: ({ run }) => {
1111
1314
  const prompt = createHallucinationExtractPrompt({ output: getAssistantMessageFromRunOutput(run.output) ?? "" });
1112
1315
  return prompt;
1113
1316
  }
1114
1317
  }).analyze({
1115
1318
  description: "Score the relevance of the statements to the input",
1116
- outputSchema: z.object({
1117
- verdicts: z.array(z.object({ statement: z.string(), verdict: z.string(), reason: z.string() }))
1118
- }),
1319
+ outputSchema: {
1320
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
1321
+ "type": "object",
1322
+ "properties": {
1323
+ "verdicts": {
1324
+ "type": "array",
1325
+ "items": {
1326
+ "type": "object",
1327
+ "properties": {
1328
+ "statement": {
1329
+ "type": "string"
1330
+ },
1331
+ "verdict": {
1332
+ "type": "string"
1333
+ },
1334
+ "reason": {
1335
+ "type": "string"
1336
+ }
1337
+ },
1338
+ "required": [
1339
+ "statement",
1340
+ "verdict",
1341
+ "reason"
1342
+ ]
1343
+ }
1344
+ }
1345
+ },
1346
+ "required": [
1347
+ "verdicts"
1348
+ ]
1349
+ },
1119
1350
  createPrompt: async ({ run, results }) => {
1120
1351
  let context;
1121
1352
  if (options?.getContext) {
@@ -1259,7 +1490,33 @@ function createToxicityScorer({
1259
1490
  type: "agent"
1260
1491
  }).analyze({
1261
1492
  description: "Score the relevance of the statements to the input",
1262
- outputSchema: z.object({ verdicts: z.array(z.object({ verdict: z.string(), reason: z.string() })) }),
1493
+ outputSchema: {
1494
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
1495
+ "type": "object",
1496
+ "properties": {
1497
+ "verdicts": {
1498
+ "type": "array",
1499
+ "items": {
1500
+ "type": "object",
1501
+ "properties": {
1502
+ "verdict": {
1503
+ "type": "string"
1504
+ },
1505
+ "reason": {
1506
+ "type": "string"
1507
+ }
1508
+ },
1509
+ "required": [
1510
+ "verdict",
1511
+ "reason"
1512
+ ]
1513
+ }
1514
+ }
1515
+ },
1516
+ "required": [
1517
+ "verdicts"
1518
+ ]
1519
+ },
1263
1520
  createPrompt: ({ run }) => {
1264
1521
  const prompt = createToxicityAnalyzePrompt({
1265
1522
  input: getUserMessageFromRunInput(run.input) ?? "",
@@ -1386,16 +1643,43 @@ Provide a single, concise sentence explaining why this score was given.
1386
1643
  };
1387
1644
 
1388
1645
  // src/scorers/llm/tool-call-accuracy/index.ts
1389
- var analyzeOutputSchema2 = z.object({
1390
- evaluations: z.array(
1391
- z.object({
1392
- toolCalled: z.string(),
1393
- wasAppropriate: z.boolean(),
1394
- reasoning: z.string()
1395
- })
1396
- ),
1397
- missingTools: z.array(z.string()).optional()
1398
- });
1646
+ var analyzeOutputSchema2 = {
1647
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
1648
+ "type": "object",
1649
+ "properties": {
1650
+ "evaluations": {
1651
+ "type": "array",
1652
+ "items": {
1653
+ "type": "object",
1654
+ "properties": {
1655
+ "toolCalled": {
1656
+ "type": "string"
1657
+ },
1658
+ "wasAppropriate": {
1659
+ "type": "boolean"
1660
+ },
1661
+ "reasoning": {
1662
+ "type": "string"
1663
+ }
1664
+ },
1665
+ "required": [
1666
+ "toolCalled",
1667
+ "wasAppropriate",
1668
+ "reasoning"
1669
+ ]
1670
+ }
1671
+ },
1672
+ "missingTools": {
1673
+ "type": "array",
1674
+ "items": {
1675
+ "type": "string"
1676
+ }
1677
+ }
1678
+ },
1679
+ "required": [
1680
+ "evaluations"
1681
+ ]
1682
+ };
1399
1683
  function createToolCallAccuracyScorerLLM({ model, availableTools }) {
1400
1684
  const toolDefinitions = availableTools.map((tool) => `${tool.id}: ${tool.description}`).join("\n");
1401
1685
  return createScorer({
@@ -1606,19 +1890,62 @@ Example responses:
1606
1890
  }
1607
1891
 
1608
1892
  // src/scorers/llm/context-relevance/index.ts
1609
- var analyzeOutputSchema3 = z.object({
1610
- evaluations: z.array(
1611
- z.object({
1612
- context_index: z.number(),
1613
- contextPiece: z.string(),
1614
- relevanceLevel: z.enum(["high", "medium", "low", "none"]),
1615
- wasUsed: z.boolean(),
1616
- reasoning: z.string()
1617
- })
1618
- ),
1619
- missingContext: z.array(z.string()).optional().default([]),
1620
- overallAssessment: z.string()
1621
- });
1893
+ var analyzeOutputSchema3 = {
1894
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
1895
+ "type": "object",
1896
+ "properties": {
1897
+ "evaluations": {
1898
+ "type": "array",
1899
+ "items": {
1900
+ "type": "object",
1901
+ "properties": {
1902
+ "context_index": {
1903
+ "type": "number"
1904
+ },
1905
+ "contextPiece": {
1906
+ "type": "string"
1907
+ },
1908
+ "relevanceLevel": {
1909
+ "type": "string",
1910
+ "enum": [
1911
+ "high",
1912
+ "medium",
1913
+ "low",
1914
+ "none"
1915
+ ]
1916
+ },
1917
+ "wasUsed": {
1918
+ "type": "boolean"
1919
+ },
1920
+ "reasoning": {
1921
+ "type": "string"
1922
+ }
1923
+ },
1924
+ "required": [
1925
+ "context_index",
1926
+ "contextPiece",
1927
+ "relevanceLevel",
1928
+ "wasUsed",
1929
+ "reasoning"
1930
+ ]
1931
+ }
1932
+ },
1933
+ "missingContext": {
1934
+ "default": [],
1935
+ "type": "array",
1936
+ "items": {
1937
+ "type": "string"
1938
+ }
1939
+ },
1940
+ "overallAssessment": {
1941
+ "type": "string"
1942
+ }
1943
+ },
1944
+ "required": [
1945
+ "evaluations",
1946
+ "overallAssessment"
1947
+ ]
1948
+ };
1622
1949
  var DEFAULT_PENALTIES = {
1623
1950
  UNUSED_HIGH_RELEVANCE_CONTEXT: 0.1,
1624
1951
  // 10% penalty per unused high-relevance context
@@ -1852,15 +2179,37 @@ Example responses:
1852
2179
  }
1853
2180
 
1854
2181
  // src/scorers/llm/context-precision/index.ts
1855
- var contextRelevanceOutputSchema = z.object({
1856
- verdicts: z.array(
1857
- z.object({
1858
- context_index: z.number(),
1859
- verdict: z.string(),
1860
- reason: z.string()
1861
- })
1862
- )
1863
- });
2182
+ var contextRelevanceOutputSchema = {
2183
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
2184
+ "type": "object",
2185
+ "properties": {
2186
+ "verdicts": {
2187
+ "type": "array",
2188
+ "items": {
2189
+ "type": "object",
2190
+ "properties": {
2191
+ "context_index": {
2192
+ "type": "number"
2193
+ },
2194
+ "verdict": {
2195
+ "type": "string"
2196
+ },
2197
+ "reason": {
2198
+ "type": "string"
2199
+ }
2200
+ },
2201
+ "required": [
2202
+ "context_index",
2203
+ "verdict",
2204
+ "reason"
2205
+ ]
2206
+ }
2207
+ }
2208
+ },
2209
+ "required": [
2210
+ "verdicts"
2211
+ ]
2212
+ };
1864
2213
  var getContext2 = ({
1865
2214
  input,
1866
2215
  output,
@@ -2139,20 +2488,63 @@ Example responses:
2139
2488
  }
2140
2489
 
2141
2490
  // src/scorers/llm/noise-sensitivity/index.ts
2142
- var scoreSchema = z.number().refine((n) => n >= 0 && n <= 1, { message: "Score must be between 0 and 1" });
2143
- var analyzeOutputSchema4 = z.object({
2144
- dimensions: z.array(
2145
- z.object({
2146
- dimension: z.string(),
2147
- impactLevel: z.enum(["none", "minimal", "moderate", "significant", "severe"]),
2148
- specificChanges: z.string(),
2149
- noiseInfluence: z.string()
2150
- })
2151
- ),
2152
- overallAssessment: z.string(),
2153
- majorIssues: z.array(z.string()).optional().default([]),
2154
- robustnessScore: scoreSchema
2155
- });
2491
+ var analyzeOutputSchema4 = {
2492
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
2493
+ "type": "object",
2494
+ "properties": {
2495
+ "dimensions": {
2496
+ "type": "array",
2497
+ "items": {
2498
+ "type": "object",
2499
+ "properties": {
2500
+ "dimension": {
2501
+ "type": "string"
2502
+ },
2503
+ "impactLevel": {
2504
+ "type": "string",
2505
+ "enum": [
2506
+ "none",
2507
+ "minimal",
2508
+ "moderate",
2509
+ "significant",
2510
+ "severe"
2511
+ ]
2512
+ },
2513
+ "specificChanges": {
2514
+ "type": "string"
2515
+ },
2516
+ "noiseInfluence": {
2517
+ "type": "string"
2518
+ }
2519
+ },
2520
+ "required": [
2521
+ "dimension",
2522
+ "impactLevel",
2523
+ "specificChanges",
2524
+ "noiseInfluence"
2525
+ ]
2526
+ }
2527
+ },
2528
+ "overallAssessment": {
2529
+ "type": "string"
2530
+ },
2531
+ "majorIssues": {
2532
+ "default": [],
2533
+ "type": "array",
2534
+ "items": {
2535
+ "type": "string"
2536
+ }
2537
+ },
2538
+ "robustnessScore": {
2539
+ "type": "number"
2540
+ }
2541
+ },
2542
+ "required": [
2543
+ "dimensions",
2544
+ "overallAssessment",
2545
+ "robustnessScore"
2546
+ ]
2547
+ };
2156
2548
  var DEFAULT_IMPACT_WEIGHTS = {
2157
2549
  none: 1,
2158
2550
  minimal: 0.85,
@@ -2485,37 +2877,124 @@ Example responses:
2485
2877
  }
2486
2878
 
2487
2879
  // src/scorers/llm/prompt-alignment/index.ts
2488
- var scoreSchema2 = z.number().refine((n) => n >= 0 && n <= 1, { message: "Score must be between 0 and 1" });
2489
- var analyzeOutputSchema5 = z.object({
2490
- intentAlignment: z.object({
2491
- score: scoreSchema2,
2492
- primaryIntent: z.string(),
2493
- isAddressed: z.boolean(),
2494
- reasoning: z.string()
2495
- }),
2496
- requirementsFulfillment: z.object({
2497
- requirements: z.array(
2498
- z.object({
2499
- requirement: z.string(),
2500
- isFulfilled: z.boolean(),
2501
- reasoning: z.string()
2502
- })
2503
- ),
2504
- overallScore: scoreSchema2
2505
- }),
2506
- completeness: z.object({
2507
- score: scoreSchema2,
2508
- missingElements: z.array(z.string()),
2509
- reasoning: z.string()
2510
- }),
2511
- responseAppropriateness: z.object({
2512
- score: scoreSchema2,
2513
- formatAlignment: z.boolean(),
2514
- toneAlignment: z.boolean(),
2515
- reasoning: z.string()
2516
- }),
2517
- overallAssessment: z.string()
2518
- });
2880
+ var analyzeOutputSchema5 = {
2881
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
2882
+ "type": "object",
2883
+ "properties": {
2884
+ "intentAlignment": {
2885
+ "type": "object",
2886
+ "properties": {
2887
+ "score": {
2888
+ "type": "number"
2889
+ },
2890
+ "primaryIntent": {
2891
+ "type": "string"
2892
+ },
2893
+ "isAddressed": {
2894
+ "type": "boolean"
2895
+ },
2896
+ "reasoning": {
2897
+ "type": "string"
2898
+ }
2899
+ },
2900
+ "required": [
2901
+ "score",
2902
+ "primaryIntent",
2903
+ "isAddressed",
2904
+ "reasoning"
2905
+ ]
2906
+ },
2907
+ "requirementsFulfillment": {
2908
+ "type": "object",
2909
+ "properties": {
2910
+ "requirements": {
2911
+ "type": "array",
2912
+ "items": {
2913
+ "type": "object",
2914
+ "properties": {
2915
+ "requirement": {
2916
+ "type": "string"
2917
+ },
2918
+ "isFulfilled": {
2919
+ "type": "boolean"
2920
+ },
2921
+ "reasoning": {
2922
+ "type": "string"
2923
+ }
2924
+ },
2925
+ "required": [
2926
+ "requirement",
2927
+ "isFulfilled",
2928
+ "reasoning"
2929
+ ]
2930
+ }
2931
+ },
2932
+ "overallScore": {
2933
+ "type": "number"
2934
+ }
2935
+ },
2936
+ "required": [
2937
+ "requirements",
2938
+ "overallScore"
2939
+ ]
2940
+ },
2941
+ "completeness": {
2942
+ "type": "object",
2943
+ "properties": {
2944
+ "score": {
2945
+ "type": "number"
2946
+ },
2947
+ "missingElements": {
2948
+ "type": "array",
2949
+ "items": {
2950
+ "type": "string"
2951
+ }
2952
+ },
2953
+ "reasoning": {
2954
+ "type": "string"
2955
+ }
2956
+ },
2957
+ "required": [
2958
+ "score",
2959
+ "missingElements",
2960
+ "reasoning"
2961
+ ]
2962
+ },
2963
+ "responseAppropriateness": {
2964
+ "type": "object",
2965
+ "properties": {
2966
+ "score": {
2967
+ "type": "number"
2968
+ },
2969
+ "formatAlignment": {
2970
+ "type": "boolean"
2971
+ },
2972
+ "toneAlignment": {
2973
+ "type": "boolean"
2974
+ },
2975
+ "reasoning": {
2976
+ "type": "string"
2977
+ }
2978
+ },
2979
+ "required": [
2980
+ "score",
2981
+ "formatAlignment",
2982
+ "toneAlignment",
2983
+ "reasoning"
2984
+ ]
2985
+ },
2986
+ "overallAssessment": {
2987
+ "type": "string"
2988
+ }
2989
+ },
2990
+ "required": [
2991
+ "intentAlignment",
2992
+ "requirementsFulfillment",
2993
+ "completeness",
2994
+ "responseAppropriateness",
2995
+ "overallAssessment"
2996
+ ]
2997
+ };
2519
2998
  var SCORING_WEIGHTS = {
2520
2999
  USER: {
2521
3000
  INTENT_ALIGNMENT: 0.4,
@@ -2710,19 +3189,64 @@ Provide a single, concise sentence explaining why this score was given.
2710
3189
  };
2711
3190
 
2712
3191
  // src/scorers/llm/trajectory/index.ts
2713
- var analyzeOutputSchema6 = z.object({
2714
- stepEvaluations: z.array(
2715
- z.object({
2716
- stepName: z.string().describe("Name of the step (tool name or action)"),
2717
- wasNecessary: z.boolean().describe("Whether this step was necessary for the task"),
2718
- wasInOrder: z.boolean().describe("Whether this step was in a logical position in the sequence"),
2719
- reasoning: z.string().describe("Brief explanation of the evaluation")
2720
- })
2721
- ),
2722
- missingSteps: z.array(z.string()).optional().describe("Steps that should have been taken but were not"),
2723
- extraSteps: z.array(z.string()).optional().describe("Steps that were unnecessary or redundant"),
2724
- overallAssessment: z.string().describe("Brief overall assessment of the trajectory quality")
2725
- });
3192
+ var analyzeOutputSchema6 = {
3193
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
3194
+ "type": "object",
3195
+ "properties": {
3196
+ "stepEvaluations": {
3197
+ "type": "array",
3198
+ "items": {
3199
+ "type": "object",
3200
+ "properties": {
3201
+ "stepName": {
3202
+ "type": "string",
3203
+ "description": "Name of the step (tool name or action)"
3204
+ },
3205
+ "wasNecessary": {
3206
+ "type": "boolean",
3207
+ "description": "Whether this step was necessary for the task"
3208
+ },
3209
+ "wasInOrder": {
3210
+ "type": "boolean",
3211
+ "description": "Whether this step was in a logical position in the sequence"
3212
+ },
3213
+ "reasoning": {
3214
+ "type": "string",
3215
+ "description": "Brief explanation of the evaluation"
3216
+ }
3217
+ },
3218
+ "required": [
3219
+ "stepName",
3220
+ "wasNecessary",
3221
+ "wasInOrder",
3222
+ "reasoning"
3223
+ ]
3224
+ }
3225
+ },
3226
+ "missingSteps": {
3227
+ "description": "Steps that should have been taken but were not",
3228
+ "type": "array",
3229
+ "items": {
3230
+ "type": "string"
3231
+ }
3232
+ },
3233
+ "extraSteps": {
3234
+ "description": "Steps that were unnecessary or redundant",
3235
+ "type": "array",
3236
+ "items": {
3237
+ "type": "string"
3238
+ }
3239
+ },
3240
+ "overallAssessment": {
3241
+ "type": "string",
3242
+ "description": "Brief overall assessment of the trajectory quality"
3243
+ }
3244
+ },
3245
+ "required": [
3246
+ "stepEvaluations",
3247
+ "overallAssessment"
3248
+ ]
3249
+ };
2726
3250
  function formatStepDetails(step) {
2727
3251
  switch (step.stepType) {
2728
3252
  case "tool_call":