@mastra/evals 1.2.2 → 1.2.3-alpha.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,7 +2,6 @@
2
2
 
3
3
  var chunk33T2SZZ2_cjs = require('../../chunk-33T2SZZ2.cjs');
4
4
  var evals = require('@mastra/core/evals');
5
- var zod = require('zod');
6
5
  var nlp = require('compromise');
7
6
  var keyword_extractor = require('keyword-extractor');
8
7
  var stringSimilarity = require('string-similarity');
@@ -219,9 +218,21 @@ var ANSWER_RELEVANCY_AGENT_INSTRUCTIONS = `
219
218
  5. Empty inputs or error messages should always be marked as "no"
220
219
  6. Responses that discuss the type of information being asked show partial relevance
221
220
  `;
222
- var extractOutputSchema = zod.z.object({
223
- statements: zod.z.array(zod.z.string())
224
- });
221
+ var extractOutputSchema = {
222
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
223
+ "type": "object",
224
+ "properties": {
225
+ "statements": {
226
+ "type": "array",
227
+ "items": {
228
+ "type": "string"
229
+ }
230
+ }
231
+ },
232
+ "required": [
233
+ "statements"
234
+ ]
235
+ };
225
236
  function createAnswerRelevancyScorer({
226
237
  model,
227
238
  options = DEFAULT_OPTIONS
@@ -244,7 +255,33 @@ function createAnswerRelevancyScorer({
244
255
  }
245
256
  }).analyze({
246
257
  description: "Score the relevance of the statements to the input",
247
- outputSchema: zod.z.object({ results: zod.z.array(zod.z.object({ result: zod.z.string(), reason: zod.z.string() })) }),
258
+ outputSchema: {
259
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
260
+ "type": "object",
261
+ "properties": {
262
+ "results": {
263
+ "type": "array",
264
+ "items": {
265
+ "type": "object",
266
+ "properties": {
267
+ "result": {
268
+ "type": "string"
269
+ },
270
+ "reason": {
271
+ "type": "string"
272
+ }
273
+ },
274
+ "required": [
275
+ "result",
276
+ "reason"
277
+ ]
278
+ }
279
+ }
280
+ },
281
+ "required": [
282
+ "results"
283
+ ]
284
+ },
248
285
  createPrompt: ({ run, results }) => {
249
286
  const input = chunk33T2SZZ2_cjs.getUserMessageFromRunInput(run.input) ?? "";
250
287
  return createScorePrompt(JSON.stringify(input), results.preprocessStepResult?.statements || []);
@@ -417,28 +454,106 @@ Key Principles:
417
454
  5. Provide actionable feedback for improving answer accuracy
418
455
  6. Be strict but fair - partial credit for partial matches
419
456
  `;
420
- var extractOutputSchema2 = zod.z.object({
421
- outputUnits: zod.z.array(zod.z.string()),
422
- groundTruthUnits: zod.z.array(zod.z.string())
423
- });
424
- var analyzeOutputSchema = zod.z.object({
425
- matches: zod.z.array(
426
- zod.z.object({
427
- groundTruthUnit: zod.z.string(),
428
- outputUnit: zod.z.string().nullable(),
429
- matchType: zod.z.enum(["exact", "semantic", "partial", "missing"]),
430
- explanation: zod.z.string()
431
- })
432
- ),
433
- extraInOutput: zod.z.array(zod.z.string()),
434
- contradictions: zod.z.array(
435
- zod.z.object({
436
- outputUnit: zod.z.string(),
437
- groundTruthUnit: zod.z.string(),
438
- explanation: zod.z.string()
439
- })
440
- )
441
- });
457
+ var extractOutputSchema2 = {
458
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
459
+ "type": "object",
460
+ "properties": {
461
+ "outputUnits": {
462
+ "type": "array",
463
+ "items": {
464
+ "type": "string"
465
+ }
466
+ },
467
+ "groundTruthUnits": {
468
+ "type": "array",
469
+ "items": {
470
+ "type": "string"
471
+ }
472
+ }
473
+ },
474
+ "required": [
475
+ "outputUnits",
476
+ "groundTruthUnits"
477
+ ]
478
+ };
479
+ var analyzeOutputSchema = {
480
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
481
+ "type": "object",
482
+ "properties": {
483
+ "matches": {
484
+ "type": "array",
485
+ "items": {
486
+ "type": "object",
487
+ "properties": {
488
+ "groundTruthUnit": {
489
+ "type": "string"
490
+ },
491
+ "outputUnit": {
492
+ "anyOf": [
493
+ {
494
+ "type": "string"
495
+ },
496
+ {
497
+ "type": "null"
498
+ }
499
+ ]
500
+ },
501
+ "matchType": {
502
+ "type": "string",
503
+ "enum": [
504
+ "exact",
505
+ "semantic",
506
+ "partial",
507
+ "missing"
508
+ ]
509
+ },
510
+ "explanation": {
511
+ "type": "string"
512
+ }
513
+ },
514
+ "required": [
515
+ "groundTruthUnit",
516
+ "outputUnit",
517
+ "matchType",
518
+ "explanation"
519
+ ]
520
+ }
521
+ },
522
+ "extraInOutput": {
523
+ "type": "array",
524
+ "items": {
525
+ "type": "string"
526
+ }
527
+ },
528
+ "contradictions": {
529
+ "type": "array",
530
+ "items": {
531
+ "type": "object",
532
+ "properties": {
533
+ "outputUnit": {
534
+ "type": "string"
535
+ },
536
+ "groundTruthUnit": {
537
+ "type": "string"
538
+ },
539
+ "explanation": {
540
+ "type": "string"
541
+ }
542
+ },
543
+ "required": [
544
+ "outputUnit",
545
+ "groundTruthUnit",
546
+ "explanation"
547
+ ]
548
+ }
549
+ }
550
+ },
551
+ "required": [
552
+ "matches",
553
+ "extraInOutput",
554
+ "contradictions"
555
+ ]
556
+ };
442
557
  function createAnswerSimilarityScorer({
443
558
  model,
444
559
  options = ANSWER_SIMILARITY_DEFAULT_OPTIONS
@@ -717,16 +832,54 @@ function createFaithfulnessScorer({
717
832
  type: "agent"
718
833
  }).preprocess({
719
834
  description: "Extract relevant statements from the LLM output",
720
- outputSchema: zod.z.object({
721
- claims: zod.z.array(zod.z.string())
722
- }),
835
+ outputSchema: {
836
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
837
+ "type": "object",
838
+ "properties": {
839
+ "claims": {
840
+ "type": "array",
841
+ "items": {
842
+ "type": "string"
843
+ }
844
+ }
845
+ },
846
+ "required": [
847
+ "claims"
848
+ ]
849
+ },
723
850
  createPrompt: ({ run }) => {
724
851
  const prompt = createFaithfulnessExtractPrompt({ output: chunk33T2SZZ2_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
725
852
  return prompt;
726
853
  }
727
854
  }).analyze({
728
855
  description: "Score the relevance of the statements to the input",
729
- outputSchema: zod.z.object({ verdicts: zod.z.array(zod.z.object({ verdict: zod.z.string(), reason: zod.z.string() })) }),
856
+ outputSchema: {
857
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
858
+ "type": "object",
859
+ "properties": {
860
+ "verdicts": {
861
+ "type": "array",
862
+ "items": {
863
+ "type": "object",
864
+ "properties": {
865
+ "verdict": {
866
+ "type": "string"
867
+ },
868
+ "reason": {
869
+ "type": "string"
870
+ }
871
+ },
872
+ "required": [
873
+ "verdict",
874
+ "reason"
875
+ ]
876
+ }
877
+ }
878
+ },
879
+ "required": [
880
+ "verdicts"
881
+ ]
882
+ },
730
883
  createPrompt: ({ results, run }) => {
731
884
  const context = options?.context ?? getToolInvocationContext(run.output);
732
885
  const prompt = createFaithfulnessAnalyzePrompt({
@@ -878,13 +1031,51 @@ function createBiasScorer({ model, options }) {
878
1031
  type: "agent"
879
1032
  }).preprocess({
880
1033
  description: "Extract relevant statements from the LLM output",
881
- outputSchema: zod.z.object({
882
- opinions: zod.z.array(zod.z.string())
883
- }),
1034
+ outputSchema: {
1035
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
1036
+ "type": "object",
1037
+ "properties": {
1038
+ "opinions": {
1039
+ "type": "array",
1040
+ "items": {
1041
+ "type": "string"
1042
+ }
1043
+ }
1044
+ },
1045
+ "required": [
1046
+ "opinions"
1047
+ ]
1048
+ },
884
1049
  createPrompt: ({ run }) => createBiasExtractPrompt({ output: chunk33T2SZZ2_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" })
885
1050
  }).analyze({
886
1051
  description: "Score the relevance of the statements to the input",
887
- outputSchema: zod.z.object({ results: zod.z.array(zod.z.object({ result: zod.z.string(), reason: zod.z.string() })) }),
1052
+ outputSchema: {
1053
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
1054
+ "type": "object",
1055
+ "properties": {
1056
+ "results": {
1057
+ "type": "array",
1058
+ "items": {
1059
+ "type": "object",
1060
+ "properties": {
1061
+ "result": {
1062
+ "type": "string"
1063
+ },
1064
+ "reason": {
1065
+ "type": "string"
1066
+ }
1067
+ },
1068
+ "required": [
1069
+ "result",
1070
+ "reason"
1071
+ ]
1072
+ }
1073
+ }
1074
+ },
1075
+ "required": [
1076
+ "results"
1077
+ ]
1078
+ },
888
1079
  createPrompt: ({ run, results }) => {
889
1080
  const prompt = createBiasAnalyzePrompt({
890
1081
  output: chunk33T2SZZ2_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
@@ -1113,18 +1304,58 @@ function createHallucinationScorer({
1113
1304
  type: "agent"
1114
1305
  }).preprocess({
1115
1306
  description: "Extract all claims from the given output",
1116
- outputSchema: zod.z.object({
1117
- claims: zod.z.array(zod.z.string())
1118
- }),
1307
+ outputSchema: {
1308
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
1309
+ "type": "object",
1310
+ "properties": {
1311
+ "claims": {
1312
+ "type": "array",
1313
+ "items": {
1314
+ "type": "string"
1315
+ }
1316
+ }
1317
+ },
1318
+ "required": [
1319
+ "claims"
1320
+ ]
1321
+ },
1119
1322
  createPrompt: ({ run }) => {
1120
1323
  const prompt = createHallucinationExtractPrompt({ output: chunk33T2SZZ2_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
1121
1324
  return prompt;
1122
1325
  }
1123
1326
  }).analyze({
1124
1327
  description: "Score the relevance of the statements to the input",
1125
- outputSchema: zod.z.object({
1126
- verdicts: zod.z.array(zod.z.object({ statement: zod.z.string(), verdict: zod.z.string(), reason: zod.z.string() }))
1127
- }),
1328
+ outputSchema: {
1329
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
1330
+ "type": "object",
1331
+ "properties": {
1332
+ "verdicts": {
1333
+ "type": "array",
1334
+ "items": {
1335
+ "type": "object",
1336
+ "properties": {
1337
+ "statement": {
1338
+ "type": "string"
1339
+ },
1340
+ "verdict": {
1341
+ "type": "string"
1342
+ },
1343
+ "reason": {
1344
+ "type": "string"
1345
+ }
1346
+ },
1347
+ "required": [
1348
+ "statement",
1349
+ "verdict",
1350
+ "reason"
1351
+ ]
1352
+ }
1353
+ }
1354
+ },
1355
+ "required": [
1356
+ "verdicts"
1357
+ ]
1358
+ },
1128
1359
  createPrompt: async ({ run, results }) => {
1129
1360
  let context;
1130
1361
  if (options?.getContext) {
@@ -1268,7 +1499,33 @@ function createToxicityScorer({
1268
1499
  type: "agent"
1269
1500
  }).analyze({
1270
1501
  description: "Score the relevance of the statements to the input",
1271
- outputSchema: zod.z.object({ verdicts: zod.z.array(zod.z.object({ verdict: zod.z.string(), reason: zod.z.string() })) }),
1502
+ outputSchema: {
1503
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
1504
+ "type": "object",
1505
+ "properties": {
1506
+ "verdicts": {
1507
+ "type": "array",
1508
+ "items": {
1509
+ "type": "object",
1510
+ "properties": {
1511
+ "verdict": {
1512
+ "type": "string"
1513
+ },
1514
+ "reason": {
1515
+ "type": "string"
1516
+ }
1517
+ },
1518
+ "required": [
1519
+ "verdict",
1520
+ "reason"
1521
+ ]
1522
+ }
1523
+ }
1524
+ },
1525
+ "required": [
1526
+ "verdicts"
1527
+ ]
1528
+ },
1272
1529
  createPrompt: ({ run }) => {
1273
1530
  const prompt = createToxicityAnalyzePrompt({
1274
1531
  input: chunk33T2SZZ2_cjs.getUserMessageFromRunInput(run.input) ?? "",
@@ -1395,16 +1652,43 @@ Provide a single, concise sentence explaining why this score was given.
1395
1652
  };
1396
1653
 
1397
1654
  // src/scorers/llm/tool-call-accuracy/index.ts
1398
- var analyzeOutputSchema2 = zod.z.object({
1399
- evaluations: zod.z.array(
1400
- zod.z.object({
1401
- toolCalled: zod.z.string(),
1402
- wasAppropriate: zod.z.boolean(),
1403
- reasoning: zod.z.string()
1404
- })
1405
- ),
1406
- missingTools: zod.z.array(zod.z.string()).optional()
1407
- });
1655
+ var analyzeOutputSchema2 = {
1656
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
1657
+ "type": "object",
1658
+ "properties": {
1659
+ "evaluations": {
1660
+ "type": "array",
1661
+ "items": {
1662
+ "type": "object",
1663
+ "properties": {
1664
+ "toolCalled": {
1665
+ "type": "string"
1666
+ },
1667
+ "wasAppropriate": {
1668
+ "type": "boolean"
1669
+ },
1670
+ "reasoning": {
1671
+ "type": "string"
1672
+ }
1673
+ },
1674
+ "required": [
1675
+ "toolCalled",
1676
+ "wasAppropriate",
1677
+ "reasoning"
1678
+ ]
1679
+ }
1680
+ },
1681
+ "missingTools": {
1682
+ "type": "array",
1683
+ "items": {
1684
+ "type": "string"
1685
+ }
1686
+ }
1687
+ },
1688
+ "required": [
1689
+ "evaluations"
1690
+ ]
1691
+ };
1408
1692
  function createToolCallAccuracyScorerLLM({ model, availableTools }) {
1409
1693
  const toolDefinitions = availableTools.map((tool) => `${tool.id}: ${tool.description}`).join("\n");
1410
1694
  return evals.createScorer({
@@ -1615,19 +1899,62 @@ Example responses:
1615
1899
  }
1616
1900
 
1617
1901
  // src/scorers/llm/context-relevance/index.ts
1618
- var analyzeOutputSchema3 = zod.z.object({
1619
- evaluations: zod.z.array(
1620
- zod.z.object({
1621
- context_index: zod.z.number(),
1622
- contextPiece: zod.z.string(),
1623
- relevanceLevel: zod.z.enum(["high", "medium", "low", "none"]),
1624
- wasUsed: zod.z.boolean(),
1625
- reasoning: zod.z.string()
1626
- })
1627
- ),
1628
- missingContext: zod.z.array(zod.z.string()).optional().default([]),
1629
- overallAssessment: zod.z.string()
1630
- });
1902
+ var analyzeOutputSchema3 = {
1903
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
1904
+ "type": "object",
1905
+ "properties": {
1906
+ "evaluations": {
1907
+ "type": "array",
1908
+ "items": {
1909
+ "type": "object",
1910
+ "properties": {
1911
+ "context_index": {
1912
+ "type": "number"
1913
+ },
1914
+ "contextPiece": {
1915
+ "type": "string"
1916
+ },
1917
+ "relevanceLevel": {
1918
+ "type": "string",
1919
+ "enum": [
1920
+ "high",
1921
+ "medium",
1922
+ "low",
1923
+ "none"
1924
+ ]
1925
+ },
1926
+ "wasUsed": {
1927
+ "type": "boolean"
1928
+ },
1929
+ "reasoning": {
1930
+ "type": "string"
1931
+ }
1932
+ },
1933
+ "required": [
1934
+ "context_index",
1935
+ "contextPiece",
1936
+ "relevanceLevel",
1937
+ "wasUsed",
1938
+ "reasoning"
1939
+ ]
1940
+ }
1941
+ },
1942
+ "missingContext": {
1943
+ "default": [],
1944
+ "type": "array",
1945
+ "items": {
1946
+ "type": "string"
1947
+ }
1948
+ },
1949
+ "overallAssessment": {
1950
+ "type": "string"
1951
+ }
1952
+ },
1953
+ "required": [
1954
+ "evaluations",
1955
+ "overallAssessment"
1956
+ ]
1957
+ };
1631
1958
  var DEFAULT_PENALTIES = {
1632
1959
  UNUSED_HIGH_RELEVANCE_CONTEXT: 0.1,
1633
1960
  // 10% penalty per unused high-relevance context
@@ -1861,15 +2188,37 @@ Example responses:
1861
2188
  }
1862
2189
 
1863
2190
  // src/scorers/llm/context-precision/index.ts
1864
- var contextRelevanceOutputSchema = zod.z.object({
1865
- verdicts: zod.z.array(
1866
- zod.z.object({
1867
- context_index: zod.z.number(),
1868
- verdict: zod.z.string(),
1869
- reason: zod.z.string()
1870
- })
1871
- )
1872
- });
2191
+ var contextRelevanceOutputSchema = {
2192
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
2193
+ "type": "object",
2194
+ "properties": {
2195
+ "verdicts": {
2196
+ "type": "array",
2197
+ "items": {
2198
+ "type": "object",
2199
+ "properties": {
2200
+ "context_index": {
2201
+ "type": "number"
2202
+ },
2203
+ "verdict": {
2204
+ "type": "string"
2205
+ },
2206
+ "reason": {
2207
+ "type": "string"
2208
+ }
2209
+ },
2210
+ "required": [
2211
+ "context_index",
2212
+ "verdict",
2213
+ "reason"
2214
+ ]
2215
+ }
2216
+ }
2217
+ },
2218
+ "required": [
2219
+ "verdicts"
2220
+ ]
2221
+ };
1873
2222
  var getContext2 = ({
1874
2223
  input,
1875
2224
  output,
@@ -2148,20 +2497,63 @@ Example responses:
2148
2497
  }
2149
2498
 
2150
2499
  // src/scorers/llm/noise-sensitivity/index.ts
2151
- var scoreSchema = zod.z.number().refine((n) => n >= 0 && n <= 1, { message: "Score must be between 0 and 1" });
2152
- var analyzeOutputSchema4 = zod.z.object({
2153
- dimensions: zod.z.array(
2154
- zod.z.object({
2155
- dimension: zod.z.string(),
2156
- impactLevel: zod.z.enum(["none", "minimal", "moderate", "significant", "severe"]),
2157
- specificChanges: zod.z.string(),
2158
- noiseInfluence: zod.z.string()
2159
- })
2160
- ),
2161
- overallAssessment: zod.z.string(),
2162
- majorIssues: zod.z.array(zod.z.string()).optional().default([]),
2163
- robustnessScore: scoreSchema
2164
- });
2500
+ var analyzeOutputSchema4 = {
2501
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
2502
+ "type": "object",
2503
+ "properties": {
2504
+ "dimensions": {
2505
+ "type": "array",
2506
+ "items": {
2507
+ "type": "object",
2508
+ "properties": {
2509
+ "dimension": {
2510
+ "type": "string"
2511
+ },
2512
+ "impactLevel": {
2513
+ "type": "string",
2514
+ "enum": [
2515
+ "none",
2516
+ "minimal",
2517
+ "moderate",
2518
+ "significant",
2519
+ "severe"
2520
+ ]
2521
+ },
2522
+ "specificChanges": {
2523
+ "type": "string"
2524
+ },
2525
+ "noiseInfluence": {
2526
+ "type": "string"
2527
+ }
2528
+ },
2529
+ "required": [
2530
+ "dimension",
2531
+ "impactLevel",
2532
+ "specificChanges",
2533
+ "noiseInfluence"
2534
+ ]
2535
+ }
2536
+ },
2537
+ "overallAssessment": {
2538
+ "type": "string"
2539
+ },
2540
+ "majorIssues": {
2541
+ "default": [],
2542
+ "type": "array",
2543
+ "items": {
2544
+ "type": "string"
2545
+ }
2546
+ },
2547
+ "robustnessScore": {
2548
+ "type": "number"
2549
+ }
2550
+ },
2551
+ "required": [
2552
+ "dimensions",
2553
+ "overallAssessment",
2554
+ "robustnessScore"
2555
+ ]
2556
+ };
2165
2557
  var DEFAULT_IMPACT_WEIGHTS = {
2166
2558
  none: 1,
2167
2559
  minimal: 0.85,
@@ -2494,37 +2886,124 @@ Example responses:
2494
2886
  }
2495
2887
 
2496
2888
  // src/scorers/llm/prompt-alignment/index.ts
2497
- var scoreSchema2 = zod.z.number().refine((n) => n >= 0 && n <= 1, { message: "Score must be between 0 and 1" });
2498
- var analyzeOutputSchema5 = zod.z.object({
2499
- intentAlignment: zod.z.object({
2500
- score: scoreSchema2,
2501
- primaryIntent: zod.z.string(),
2502
- isAddressed: zod.z.boolean(),
2503
- reasoning: zod.z.string()
2504
- }),
2505
- requirementsFulfillment: zod.z.object({
2506
- requirements: zod.z.array(
2507
- zod.z.object({
2508
- requirement: zod.z.string(),
2509
- isFulfilled: zod.z.boolean(),
2510
- reasoning: zod.z.string()
2511
- })
2512
- ),
2513
- overallScore: scoreSchema2
2514
- }),
2515
- completeness: zod.z.object({
2516
- score: scoreSchema2,
2517
- missingElements: zod.z.array(zod.z.string()),
2518
- reasoning: zod.z.string()
2519
- }),
2520
- responseAppropriateness: zod.z.object({
2521
- score: scoreSchema2,
2522
- formatAlignment: zod.z.boolean(),
2523
- toneAlignment: zod.z.boolean(),
2524
- reasoning: zod.z.string()
2525
- }),
2526
- overallAssessment: zod.z.string()
2527
- });
2889
+ var analyzeOutputSchema5 = {
2890
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
2891
+ "type": "object",
2892
+ "properties": {
2893
+ "intentAlignment": {
2894
+ "type": "object",
2895
+ "properties": {
2896
+ "score": {
2897
+ "type": "number"
2898
+ },
2899
+ "primaryIntent": {
2900
+ "type": "string"
2901
+ },
2902
+ "isAddressed": {
2903
+ "type": "boolean"
2904
+ },
2905
+ "reasoning": {
2906
+ "type": "string"
2907
+ }
2908
+ },
2909
+ "required": [
2910
+ "score",
2911
+ "primaryIntent",
2912
+ "isAddressed",
2913
+ "reasoning"
2914
+ ]
2915
+ },
2916
+ "requirementsFulfillment": {
2917
+ "type": "object",
2918
+ "properties": {
2919
+ "requirements": {
2920
+ "type": "array",
2921
+ "items": {
2922
+ "type": "object",
2923
+ "properties": {
2924
+ "requirement": {
2925
+ "type": "string"
2926
+ },
2927
+ "isFulfilled": {
2928
+ "type": "boolean"
2929
+ },
2930
+ "reasoning": {
2931
+ "type": "string"
2932
+ }
2933
+ },
2934
+ "required": [
2935
+ "requirement",
2936
+ "isFulfilled",
2937
+ "reasoning"
2938
+ ]
2939
+ }
2940
+ },
2941
+ "overallScore": {
2942
+ "type": "number"
2943
+ }
2944
+ },
2945
+ "required": [
2946
+ "requirements",
2947
+ "overallScore"
2948
+ ]
2949
+ },
2950
+ "completeness": {
2951
+ "type": "object",
2952
+ "properties": {
2953
+ "score": {
2954
+ "type": "number"
2955
+ },
2956
+ "missingElements": {
2957
+ "type": "array",
2958
+ "items": {
2959
+ "type": "string"
2960
+ }
2961
+ },
2962
+ "reasoning": {
2963
+ "type": "string"
2964
+ }
2965
+ },
2966
+ "required": [
2967
+ "score",
2968
+ "missingElements",
2969
+ "reasoning"
2970
+ ]
2971
+ },
2972
+ "responseAppropriateness": {
2973
+ "type": "object",
2974
+ "properties": {
2975
+ "score": {
2976
+ "type": "number"
2977
+ },
2978
+ "formatAlignment": {
2979
+ "type": "boolean"
2980
+ },
2981
+ "toneAlignment": {
2982
+ "type": "boolean"
2983
+ },
2984
+ "reasoning": {
2985
+ "type": "string"
2986
+ }
2987
+ },
2988
+ "required": [
2989
+ "score",
2990
+ "formatAlignment",
2991
+ "toneAlignment",
2992
+ "reasoning"
2993
+ ]
2994
+ },
2995
+ "overallAssessment": {
2996
+ "type": "string"
2997
+ }
2998
+ },
2999
+ "required": [
3000
+ "intentAlignment",
3001
+ "requirementsFulfillment",
3002
+ "completeness",
3003
+ "responseAppropriateness",
3004
+ "overallAssessment"
3005
+ ]
3006
+ };
2528
3007
  var SCORING_WEIGHTS = {
2529
3008
  USER: {
2530
3009
  INTENT_ALIGNMENT: 0.4,
@@ -2719,19 +3198,64 @@ Provide a single, concise sentence explaining why this score was given.
2719
3198
  };
2720
3199
 
2721
3200
  // src/scorers/llm/trajectory/index.ts
2722
- var analyzeOutputSchema6 = zod.z.object({
2723
- stepEvaluations: zod.z.array(
2724
- zod.z.object({
2725
- stepName: zod.z.string().describe("Name of the step (tool name or action)"),
2726
- wasNecessary: zod.z.boolean().describe("Whether this step was necessary for the task"),
2727
- wasInOrder: zod.z.boolean().describe("Whether this step was in a logical position in the sequence"),
2728
- reasoning: zod.z.string().describe("Brief explanation of the evaluation")
2729
- })
2730
- ),
2731
- missingSteps: zod.z.array(zod.z.string()).optional().describe("Steps that should have been taken but were not"),
2732
- extraSteps: zod.z.array(zod.z.string()).optional().describe("Steps that were unnecessary or redundant"),
2733
- overallAssessment: zod.z.string().describe("Brief overall assessment of the trajectory quality")
2734
- });
3201
+ var analyzeOutputSchema6 = {
3202
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
3203
+ "type": "object",
3204
+ "properties": {
3205
+ "stepEvaluations": {
3206
+ "type": "array",
3207
+ "items": {
3208
+ "type": "object",
3209
+ "properties": {
3210
+ "stepName": {
3211
+ "type": "string",
3212
+ "description": "Name of the step (tool name or action)"
3213
+ },
3214
+ "wasNecessary": {
3215
+ "type": "boolean",
3216
+ "description": "Whether this step was necessary for the task"
3217
+ },
3218
+ "wasInOrder": {
3219
+ "type": "boolean",
3220
+ "description": "Whether this step was in a logical position in the sequence"
3221
+ },
3222
+ "reasoning": {
3223
+ "type": "string",
3224
+ "description": "Brief explanation of the evaluation"
3225
+ }
3226
+ },
3227
+ "required": [
3228
+ "stepName",
3229
+ "wasNecessary",
3230
+ "wasInOrder",
3231
+ "reasoning"
3232
+ ]
3233
+ }
3234
+ },
3235
+ "missingSteps": {
3236
+ "description": "Steps that should have been taken but were not",
3237
+ "type": "array",
3238
+ "items": {
3239
+ "type": "string"
3240
+ }
3241
+ },
3242
+ "extraSteps": {
3243
+ "description": "Steps that were unnecessary or redundant",
3244
+ "type": "array",
3245
+ "items": {
3246
+ "type": "string"
3247
+ }
3248
+ },
3249
+ "overallAssessment": {
3250
+ "type": "string",
3251
+ "description": "Brief overall assessment of the trajectory quality"
3252
+ }
3253
+ },
3254
+ "required": [
3255
+ "stepEvaluations",
3256
+ "overallAssessment"
3257
+ ]
3258
+ };
2735
3259
  function formatStepDetails(step) {
2736
3260
  switch (step.stepType) {
2737
3261
  case "tool_call":