@agentv/eval 3.4.0 → 3.6.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -153,8 +153,6 @@ declare const MessageSchema: z.ZodObject<{
153
153
  * strings. Structured fields (`input`, `output`, `expectedOutput`) are always `Message[]`.
154
154
  */
155
155
  declare const CodeGraderInputSchema: z.ZodObject<{
156
- /** @deprecated Use `inputText` instead. First user message content as string. */
157
- question: z.ZodString;
158
156
  criteria: z.ZodString;
159
157
  expectedOutput: z.ZodArray<z.ZodObject<{
160
158
  role: z.ZodEnum<["assistant", "user", "system", "tool"]>;
@@ -224,10 +222,8 @@ declare const CodeGraderInputSchema: z.ZodObject<{
224
222
  name?: string | undefined;
225
223
  metadata?: Record<string, unknown> | undefined;
226
224
  }>, "many">;
227
- /** @deprecated Use `expectedOutputText` instead. Expected output content as string. */
228
- referenceAnswer: z.ZodOptional<z.ZodString>;
229
- /** @deprecated Use `outputText` instead. Last assistant message content as string. */
230
- answer: z.ZodString;
225
+ /** Last assistant message content as string. */
226
+ outputText: z.ZodString;
231
227
  output: z.ZodOptional<z.ZodNullable<z.ZodArray<z.ZodObject<{
232
228
  role: z.ZodEnum<["assistant", "user", "system", "tool"]>;
233
229
  content: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodRecord<z.ZodString, z.ZodUnknown>, z.ZodArray<z.ZodRecord<z.ZodString, z.ZodUnknown>, "many">]>>;
@@ -410,11 +406,9 @@ declare const CodeGraderInputSchema: z.ZodObject<{
410
406
  fileChanges: z.ZodOptional<z.ZodNullable<z.ZodString>>;
411
407
  workspacePath: z.ZodOptional<z.ZodNullable<z.ZodString>>;
412
408
  config: z.ZodOptional<z.ZodNullable<z.ZodRecord<z.ZodString, z.ZodUnknown>>>;
413
- /** First user message content as string. Replaces `question`. */
414
- inputText: z.ZodOptional<z.ZodString>;
415
- /** Last assistant message content as string. Replaces `answer`. */
416
- outputText: z.ZodOptional<z.ZodString>;
417
- /** Expected output content as string. Replaces `referenceAnswer`. */
409
+ /** First user message content as string. */
410
+ inputText: z.ZodString;
411
+ /** Expected output content as string. */
418
412
  expectedOutputText: z.ZodOptional<z.ZodString>;
419
413
  }, "strip", z.ZodTypeAny, {
420
414
  input: {
@@ -435,7 +429,6 @@ declare const CodeGraderInputSchema: z.ZodObject<{
435
429
  name?: string | undefined;
436
430
  metadata?: Record<string, unknown> | undefined;
437
431
  }[];
438
- question: string;
439
432
  criteria: string;
440
433
  expectedOutput: {
441
434
  role: "tool" | "assistant" | "user" | "system";
@@ -455,9 +448,10 @@ declare const CodeGraderInputSchema: z.ZodObject<{
455
448
  name?: string | undefined;
456
449
  metadata?: Record<string, unknown> | undefined;
457
450
  }[];
458
- answer: string;
451
+ outputText: string;
459
452
  guidelineFiles: string[];
460
453
  inputFiles: string[];
454
+ inputText: string;
461
455
  output?: {
462
456
  role: "tool" | "assistant" | "user" | "system";
463
457
  startTime?: string | undefined;
@@ -479,7 +473,6 @@ declare const CodeGraderInputSchema: z.ZodObject<{
479
473
  startTime?: string | null | undefined;
480
474
  endTime?: string | null | undefined;
481
475
  durationMs?: number | null | undefined;
482
- referenceAnswer?: string | undefined;
483
476
  outputPath?: string | undefined;
484
477
  trace?: {
485
478
  eventCount: number;
@@ -498,8 +491,6 @@ declare const CodeGraderInputSchema: z.ZodObject<{
498
491
  fileChanges?: string | null | undefined;
499
492
  workspacePath?: string | null | undefined;
500
493
  config?: Record<string, unknown> | null | undefined;
501
- inputText?: string | undefined;
502
- outputText?: string | undefined;
503
494
  expectedOutputText?: string | undefined;
504
495
  }, {
505
496
  input: {
@@ -520,7 +511,6 @@ declare const CodeGraderInputSchema: z.ZodObject<{
520
511
  name?: string | undefined;
521
512
  metadata?: Record<string, unknown> | undefined;
522
513
  }[];
523
- question: string;
524
514
  criteria: string;
525
515
  expectedOutput: {
526
516
  role: "tool" | "assistant" | "user" | "system";
@@ -540,9 +530,10 @@ declare const CodeGraderInputSchema: z.ZodObject<{
540
530
  name?: string | undefined;
541
531
  metadata?: Record<string, unknown> | undefined;
542
532
  }[];
543
- answer: string;
533
+ outputText: string;
544
534
  guidelineFiles: string[];
545
535
  inputFiles: string[];
536
+ inputText: string;
546
537
  output?: {
547
538
  role: "tool" | "assistant" | "user" | "system";
548
539
  startTime?: string | undefined;
@@ -564,7 +555,6 @@ declare const CodeGraderInputSchema: z.ZodObject<{
564
555
  startTime?: string | null | undefined;
565
556
  endTime?: string | null | undefined;
566
557
  durationMs?: number | null | undefined;
567
- referenceAnswer?: string | undefined;
568
558
  outputPath?: string | undefined;
569
559
  trace?: {
570
560
  eventCount: number;
@@ -583,8 +573,6 @@ declare const CodeGraderInputSchema: z.ZodObject<{
583
573
  fileChanges?: string | null | undefined;
584
574
  workspacePath?: string | null | undefined;
585
575
  config?: Record<string, unknown> | null | undefined;
586
- inputText?: string | undefined;
587
- outputText?: string | undefined;
588
576
  expectedOutputText?: string | undefined;
589
577
  }>;
590
578
  /**
@@ -592,22 +580,36 @@ declare const CodeGraderInputSchema: z.ZodObject<{
592
580
  */
593
581
  declare const CodeGraderResultSchema: z.ZodObject<{
594
582
  score: z.ZodNumber;
595
- hits: z.ZodDefault<z.ZodOptional<z.ZodArray<z.ZodString, "many">>>;
596
- misses: z.ZodDefault<z.ZodOptional<z.ZodArray<z.ZodString, "many">>>;
597
- reasoning: z.ZodOptional<z.ZodString>;
583
+ assertions: z.ZodDefault<z.ZodOptional<z.ZodArray<z.ZodObject<{
584
+ text: z.ZodString;
585
+ passed: z.ZodBoolean;
586
+ evidence: z.ZodOptional<z.ZodString>;
587
+ }, "strip", z.ZodTypeAny, {
588
+ text: string;
589
+ passed: boolean;
590
+ evidence?: string | undefined;
591
+ }, {
592
+ text: string;
593
+ passed: boolean;
594
+ evidence?: string | undefined;
595
+ }>, "many">>>;
598
596
  /** Optional structured details for domain-specific metrics (e.g., TP/TN/FP/FN counts, alignments). */
599
597
  details: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
600
598
  }, "strip", z.ZodTypeAny, {
601
599
  score: number;
602
- hits: string[];
603
- misses: string[];
604
- reasoning?: string | undefined;
600
+ assertions: {
601
+ text: string;
602
+ passed: boolean;
603
+ evidence?: string | undefined;
604
+ }[];
605
605
  details?: Record<string, unknown> | undefined;
606
606
  }, {
607
607
  score: number;
608
- hits?: string[] | undefined;
609
- misses?: string[] | undefined;
610
- reasoning?: string | undefined;
608
+ assertions?: {
609
+ text: string;
610
+ passed: boolean;
611
+ evidence?: string | undefined;
612
+ }[] | undefined;
611
613
  details?: Record<string, unknown> | undefined;
612
614
  }>;
613
615
  /**
@@ -618,19 +620,15 @@ type CodeGraderResult = z.infer<typeof CodeGraderResultSchema>;
618
620
  /**
619
621
  * CodeGraderInput after `enrichInput()` has run.
620
622
  *
621
- * The text convenience accessors (`inputText`, `outputText`, `expectedOutputText`)
623
+ * The text accessors (`inputText`, `outputText`, `expectedOutputText`)
622
624
  * are always populated by the runtime before the handler is called, so they are
623
625
  * guaranteed to be `string` (never `undefined`).
624
626
  *
625
627
  * Handler function signatures (`CodeGraderHandler`, `AssertionHandler`) use this
626
628
  * type so that user code can destructure `{ outputText }` without null-checks.
627
629
  */
628
- type EnrichedCodeGraderInput = Omit<CodeGraderInput, 'inputText' | 'outputText' | 'expectedOutputText'> & {
629
- /** First user message content as string. Replaces `question`. */
630
- readonly inputText: string;
631
- /** Last assistant message content as string. Replaces `answer`. */
632
- readonly outputText: string;
633
- /** Expected output content as string. Replaces `referenceAnswer`. */
630
+ type EnrichedCodeGraderInput = Omit<CodeGraderInput, 'expectedOutputText'> & {
631
+ /** Expected output content as string. */
634
632
  readonly expectedOutputText: string;
635
633
  };
636
634
  type TraceSummary = z.infer<typeof TraceSummarySchema>;
@@ -642,8 +640,6 @@ type TokenUsage = z.infer<typeof TokenUsageSchema>;
642
640
  * Uses the same schema as CodeGraderInput since the orchestrator sends identical payloads.
643
641
  */
644
642
  declare const PromptTemplateInputSchema: z.ZodObject<{
645
- /** @deprecated Use `inputText` instead. First user message content as string. */
646
- question: z.ZodString;
647
643
  criteria: z.ZodString;
648
644
  expectedOutput: z.ZodArray<z.ZodObject<{
649
645
  role: z.ZodEnum<["assistant", "user", "system", "tool"]>;
@@ -713,10 +709,8 @@ declare const PromptTemplateInputSchema: z.ZodObject<{
713
709
  name?: string | undefined;
714
710
  metadata?: Record<string, unknown> | undefined;
715
711
  }>, "many">;
716
- /** @deprecated Use `expectedOutputText` instead. Expected output content as string. */
717
- referenceAnswer: z.ZodOptional<z.ZodString>;
718
- /** @deprecated Use `outputText` instead. Last assistant message content as string. */
719
- answer: z.ZodString;
712
+ /** Last assistant message content as string. */
713
+ outputText: z.ZodString;
720
714
  output: z.ZodOptional<z.ZodNullable<z.ZodArray<z.ZodObject<{
721
715
  role: z.ZodEnum<["assistant", "user", "system", "tool"]>;
722
716
  content: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodRecord<z.ZodString, z.ZodUnknown>, z.ZodArray<z.ZodRecord<z.ZodString, z.ZodUnknown>, "many">]>>;
@@ -899,11 +893,9 @@ declare const PromptTemplateInputSchema: z.ZodObject<{
899
893
  fileChanges: z.ZodOptional<z.ZodNullable<z.ZodString>>;
900
894
  workspacePath: z.ZodOptional<z.ZodNullable<z.ZodString>>;
901
895
  config: z.ZodOptional<z.ZodNullable<z.ZodRecord<z.ZodString, z.ZodUnknown>>>;
902
- /** First user message content as string. Replaces `question`. */
903
- inputText: z.ZodOptional<z.ZodString>;
904
- /** Last assistant message content as string. Replaces `answer`. */
905
- outputText: z.ZodOptional<z.ZodString>;
906
- /** Expected output content as string. Replaces `referenceAnswer`. */
896
+ /** First user message content as string. */
897
+ inputText: z.ZodString;
898
+ /** Expected output content as string. */
907
899
  expectedOutputText: z.ZodOptional<z.ZodString>;
908
900
  }, "strip", z.ZodTypeAny, {
909
901
  input: {
@@ -924,7 +916,6 @@ declare const PromptTemplateInputSchema: z.ZodObject<{
924
916
  name?: string | undefined;
925
917
  metadata?: Record<string, unknown> | undefined;
926
918
  }[];
927
- question: string;
928
919
  criteria: string;
929
920
  expectedOutput: {
930
921
  role: "tool" | "assistant" | "user" | "system";
@@ -944,9 +935,10 @@ declare const PromptTemplateInputSchema: z.ZodObject<{
944
935
  name?: string | undefined;
945
936
  metadata?: Record<string, unknown> | undefined;
946
937
  }[];
947
- answer: string;
938
+ outputText: string;
948
939
  guidelineFiles: string[];
949
940
  inputFiles: string[];
941
+ inputText: string;
950
942
  output?: {
951
943
  role: "tool" | "assistant" | "user" | "system";
952
944
  startTime?: string | undefined;
@@ -968,7 +960,6 @@ declare const PromptTemplateInputSchema: z.ZodObject<{
968
960
  startTime?: string | null | undefined;
969
961
  endTime?: string | null | undefined;
970
962
  durationMs?: number | null | undefined;
971
- referenceAnswer?: string | undefined;
972
963
  outputPath?: string | undefined;
973
964
  trace?: {
974
965
  eventCount: number;
@@ -987,8 +978,6 @@ declare const PromptTemplateInputSchema: z.ZodObject<{
987
978
  fileChanges?: string | null | undefined;
988
979
  workspacePath?: string | null | undefined;
989
980
  config?: Record<string, unknown> | null | undefined;
990
- inputText?: string | undefined;
991
- outputText?: string | undefined;
992
981
  expectedOutputText?: string | undefined;
993
982
  }, {
994
983
  input: {
@@ -1009,7 +998,6 @@ declare const PromptTemplateInputSchema: z.ZodObject<{
1009
998
  name?: string | undefined;
1010
999
  metadata?: Record<string, unknown> | undefined;
1011
1000
  }[];
1012
- question: string;
1013
1001
  criteria: string;
1014
1002
  expectedOutput: {
1015
1003
  role: "tool" | "assistant" | "user" | "system";
@@ -1029,9 +1017,10 @@ declare const PromptTemplateInputSchema: z.ZodObject<{
1029
1017
  name?: string | undefined;
1030
1018
  metadata?: Record<string, unknown> | undefined;
1031
1019
  }[];
1032
- answer: string;
1020
+ outputText: string;
1033
1021
  guidelineFiles: string[];
1034
1022
  inputFiles: string[];
1023
+ inputText: string;
1035
1024
  output?: {
1036
1025
  role: "tool" | "assistant" | "user" | "system";
1037
1026
  startTime?: string | undefined;
@@ -1053,7 +1042,6 @@ declare const PromptTemplateInputSchema: z.ZodObject<{
1053
1042
  startTime?: string | null | undefined;
1054
1043
  endTime?: string | null | undefined;
1055
1044
  durationMs?: number | null | undefined;
1056
- referenceAnswer?: string | undefined;
1057
1045
  outputPath?: string | undefined;
1058
1046
  trace?: {
1059
1047
  eventCount: number;
@@ -1072,15 +1060,11 @@ declare const PromptTemplateInputSchema: z.ZodObject<{
1072
1060
  fileChanges?: string | null | undefined;
1073
1061
  workspacePath?: string | null | undefined;
1074
1062
  config?: Record<string, unknown> | null | undefined;
1075
- inputText?: string | undefined;
1076
- outputText?: string | undefined;
1077
1063
  expectedOutputText?: string | undefined;
1078
1064
  }>;
1079
1065
  type PromptTemplateInput = CodeGraderInput;
1080
1066
  /** @deprecated Use CodeGraderInputSchema */
1081
1067
  declare const CodeJudgeInputSchema: z.ZodObject<{
1082
- /** @deprecated Use `inputText` instead. First user message content as string. */
1083
- question: z.ZodString;
1084
1068
  criteria: z.ZodString;
1085
1069
  expectedOutput: z.ZodArray<z.ZodObject<{
1086
1070
  role: z.ZodEnum<["assistant", "user", "system", "tool"]>;
@@ -1150,10 +1134,8 @@ declare const CodeJudgeInputSchema: z.ZodObject<{
1150
1134
  name?: string | undefined;
1151
1135
  metadata?: Record<string, unknown> | undefined;
1152
1136
  }>, "many">;
1153
- /** @deprecated Use `expectedOutputText` instead. Expected output content as string. */
1154
- referenceAnswer: z.ZodOptional<z.ZodString>;
1155
- /** @deprecated Use `outputText` instead. Last assistant message content as string. */
1156
- answer: z.ZodString;
1137
+ /** Last assistant message content as string. */
1138
+ outputText: z.ZodString;
1157
1139
  output: z.ZodOptional<z.ZodNullable<z.ZodArray<z.ZodObject<{
1158
1140
  role: z.ZodEnum<["assistant", "user", "system", "tool"]>;
1159
1141
  content: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodRecord<z.ZodString, z.ZodUnknown>, z.ZodArray<z.ZodRecord<z.ZodString, z.ZodUnknown>, "many">]>>;
@@ -1336,11 +1318,9 @@ declare const CodeJudgeInputSchema: z.ZodObject<{
1336
1318
  fileChanges: z.ZodOptional<z.ZodNullable<z.ZodString>>;
1337
1319
  workspacePath: z.ZodOptional<z.ZodNullable<z.ZodString>>;
1338
1320
  config: z.ZodOptional<z.ZodNullable<z.ZodRecord<z.ZodString, z.ZodUnknown>>>;
1339
- /** First user message content as string. Replaces `question`. */
1340
- inputText: z.ZodOptional<z.ZodString>;
1341
- /** Last assistant message content as string. Replaces `answer`. */
1342
- outputText: z.ZodOptional<z.ZodString>;
1343
- /** Expected output content as string. Replaces `referenceAnswer`. */
1321
+ /** First user message content as string. */
1322
+ inputText: z.ZodString;
1323
+ /** Expected output content as string. */
1344
1324
  expectedOutputText: z.ZodOptional<z.ZodString>;
1345
1325
  }, "strip", z.ZodTypeAny, {
1346
1326
  input: {
@@ -1361,7 +1341,6 @@ declare const CodeJudgeInputSchema: z.ZodObject<{
1361
1341
  name?: string | undefined;
1362
1342
  metadata?: Record<string, unknown> | undefined;
1363
1343
  }[];
1364
- question: string;
1365
1344
  criteria: string;
1366
1345
  expectedOutput: {
1367
1346
  role: "tool" | "assistant" | "user" | "system";
@@ -1381,9 +1360,10 @@ declare const CodeJudgeInputSchema: z.ZodObject<{
1381
1360
  name?: string | undefined;
1382
1361
  metadata?: Record<string, unknown> | undefined;
1383
1362
  }[];
1384
- answer: string;
1363
+ outputText: string;
1385
1364
  guidelineFiles: string[];
1386
1365
  inputFiles: string[];
1366
+ inputText: string;
1387
1367
  output?: {
1388
1368
  role: "tool" | "assistant" | "user" | "system";
1389
1369
  startTime?: string | undefined;
@@ -1405,7 +1385,6 @@ declare const CodeJudgeInputSchema: z.ZodObject<{
1405
1385
  startTime?: string | null | undefined;
1406
1386
  endTime?: string | null | undefined;
1407
1387
  durationMs?: number | null | undefined;
1408
- referenceAnswer?: string | undefined;
1409
1388
  outputPath?: string | undefined;
1410
1389
  trace?: {
1411
1390
  eventCount: number;
@@ -1424,8 +1403,6 @@ declare const CodeJudgeInputSchema: z.ZodObject<{
1424
1403
  fileChanges?: string | null | undefined;
1425
1404
  workspacePath?: string | null | undefined;
1426
1405
  config?: Record<string, unknown> | null | undefined;
1427
- inputText?: string | undefined;
1428
- outputText?: string | undefined;
1429
1406
  expectedOutputText?: string | undefined;
1430
1407
  }, {
1431
1408
  input: {
@@ -1446,7 +1423,6 @@ declare const CodeJudgeInputSchema: z.ZodObject<{
1446
1423
  name?: string | undefined;
1447
1424
  metadata?: Record<string, unknown> | undefined;
1448
1425
  }[];
1449
- question: string;
1450
1426
  criteria: string;
1451
1427
  expectedOutput: {
1452
1428
  role: "tool" | "assistant" | "user" | "system";
@@ -1466,9 +1442,10 @@ declare const CodeJudgeInputSchema: z.ZodObject<{
1466
1442
  name?: string | undefined;
1467
1443
  metadata?: Record<string, unknown> | undefined;
1468
1444
  }[];
1469
- answer: string;
1445
+ outputText: string;
1470
1446
  guidelineFiles: string[];
1471
1447
  inputFiles: string[];
1448
+ inputText: string;
1472
1449
  output?: {
1473
1450
  role: "tool" | "assistant" | "user" | "system";
1474
1451
  startTime?: string | undefined;
@@ -1490,7 +1467,6 @@ declare const CodeJudgeInputSchema: z.ZodObject<{
1490
1467
  startTime?: string | null | undefined;
1491
1468
  endTime?: string | null | undefined;
1492
1469
  durationMs?: number | null | undefined;
1493
- referenceAnswer?: string | undefined;
1494
1470
  outputPath?: string | undefined;
1495
1471
  trace?: {
1496
1472
  eventCount: number;
@@ -1509,29 +1485,41 @@ declare const CodeJudgeInputSchema: z.ZodObject<{
1509
1485
  fileChanges?: string | null | undefined;
1510
1486
  workspacePath?: string | null | undefined;
1511
1487
  config?: Record<string, unknown> | null | undefined;
1512
- inputText?: string | undefined;
1513
- outputText?: string | undefined;
1514
1488
  expectedOutputText?: string | undefined;
1515
1489
  }>;
1516
1490
  /** @deprecated Use CodeGraderResultSchema */
1517
1491
  declare const CodeJudgeResultSchema: z.ZodObject<{
1518
1492
  score: z.ZodNumber;
1519
- hits: z.ZodDefault<z.ZodOptional<z.ZodArray<z.ZodString, "many">>>;
1520
- misses: z.ZodDefault<z.ZodOptional<z.ZodArray<z.ZodString, "many">>>;
1521
- reasoning: z.ZodOptional<z.ZodString>;
1493
+ assertions: z.ZodDefault<z.ZodOptional<z.ZodArray<z.ZodObject<{
1494
+ text: z.ZodString;
1495
+ passed: z.ZodBoolean;
1496
+ evidence: z.ZodOptional<z.ZodString>;
1497
+ }, "strip", z.ZodTypeAny, {
1498
+ text: string;
1499
+ passed: boolean;
1500
+ evidence?: string | undefined;
1501
+ }, {
1502
+ text: string;
1503
+ passed: boolean;
1504
+ evidence?: string | undefined;
1505
+ }>, "many">>>;
1522
1506
  /** Optional structured details for domain-specific metrics (e.g., TP/TN/FP/FN counts, alignments). */
1523
1507
  details: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
1524
1508
  }, "strip", z.ZodTypeAny, {
1525
1509
  score: number;
1526
- hits: string[];
1527
- misses: string[];
1528
- reasoning?: string | undefined;
1510
+ assertions: {
1511
+ text: string;
1512
+ passed: boolean;
1513
+ evidence?: string | undefined;
1514
+ }[];
1529
1515
  details?: Record<string, unknown> | undefined;
1530
1516
  }, {
1531
1517
  score: number;
1532
- hits?: string[] | undefined;
1533
- misses?: string[] | undefined;
1534
- reasoning?: string | undefined;
1518
+ assertions?: {
1519
+ text: string;
1520
+ passed: boolean;
1521
+ evidence?: string | undefined;
1522
+ }[] | undefined;
1535
1523
  details?: Record<string, unknown> | undefined;
1536
1524
  }>;
1537
1525
  /** @deprecated Use CodeGraderInput */
@@ -1634,7 +1622,7 @@ declare class TargetInvocationError extends Error {
1634
1622
  *
1635
1623
  * if (!target) {
1636
1624
  * // Target not available - no target config on this evaluator
1637
- * return { score: 0.5, reasoning: 'Target not available' };
1625
+ * return { score: 0.5, assertions: [{ text: 'Target not available', passed: false }] };
1638
1626
  * }
1639
1627
  *
1640
1628
  * const response = await target.invoke({
@@ -1675,19 +1663,23 @@ type AssertionType = 'llm-grader' | 'code-grader' | 'rubrics' | 'composite' | 't
1675
1663
  /**
1676
1664
  * Result returned from an assertion handler.
1677
1665
  *
1678
- * @example Pass with reasoning
1666
+ * @example Pass with score
1679
1667
  * ```ts
1680
- * { pass: true, reasoning: 'Output contains expected keywords' }
1668
+ * { pass: true, assertions: [{ text: 'Output contains expected keywords', passed: true }] }
1681
1669
  * ```
1682
1670
  *
1683
- * @example Fail with misses
1671
+ * @example Fail with evidence
1684
1672
  * ```ts
1685
- * { pass: false, misses: ['Missing required header'], score: 0.3 }
1673
+ * { pass: false, score: 0.3, assertions: [{ text: 'Missing required header', passed: false }] }
1686
1674
  * ```
1687
1675
  *
1688
1676
  * @example Granular score (0-1)
1689
1677
  * ```ts
1690
- * { score: 0.75, hits: ['Format correct', 'Content relevant'], misses: ['Missing citation'] }
1678
+ * { score: 0.75, assertions: [
1679
+ * { text: 'Format correct', passed: true },
1680
+ * { text: 'Content relevant', passed: true },
1681
+ * { text: 'Missing citation', passed: false },
1682
+ * ] }
1691
1683
  * ```
1692
1684
  */
1693
1685
  interface AssertionScore {
@@ -1695,12 +1687,12 @@ interface AssertionScore {
1695
1687
  readonly pass?: boolean;
1696
1688
  /** Numeric score between 0 and 1. Defaults to 1 if pass=true, 0 if pass=false. */
1697
1689
  readonly score?: number;
1698
- /** Aspects that passed. */
1699
- readonly hits?: readonly string[];
1700
- /** Aspects that failed. */
1701
- readonly misses?: readonly string[];
1702
- /** Human-readable explanation. */
1703
- readonly reasoning?: string;
1690
+ /** Per-assertion verdicts with optional evidence. */
1691
+ readonly assertions?: readonly {
1692
+ readonly text: string;
1693
+ readonly passed: boolean;
1694
+ readonly evidence?: string;
1695
+ }[];
1704
1696
  /** Optional structured details for domain-specific metrics. */
1705
1697
  readonly details?: Record<string, unknown>;
1706
1698
  }
@@ -1740,7 +1732,7 @@ type CodeJudgeHandler = CodeGraderHandler;
1740
1732
  *
1741
1733
  * export default defineAssertion(({ outputText }) => ({
1742
1734
  * pass: outputText.includes('hello'),
1743
- * reasoning: 'Checks greeting',
1735
+ * assertions: [{ text: 'Checks greeting', passed: outputText.includes('hello') }],
1744
1736
  * }));
1745
1737
  * ```
1746
1738
  *
@@ -1751,8 +1743,7 @@ type CodeJudgeHandler = CodeGraderHandler;
1751
1743
  *
1752
1744
  * export default defineCodeGrader(({ trace, outputText }) => ({
1753
1745
  * score: trace?.eventCount <= 5 ? 1.0 : 0.5,
1754
- * hits: ['Efficient tool usage'],
1755
- * misses: [],
1746
+ * assertions: [{ text: 'Efficient tool usage', passed: trace?.eventCount <= 5 }],
1756
1747
  * }));
1757
1748
  * ```
1758
1749
  *
@@ -1764,7 +1755,7 @@ type CodeJudgeHandler = CodeGraderHandler;
1764
1755
  * export default defineCodeGrader(async ({ inputText }) => {
1765
1756
  * const target = createTargetClient();
1766
1757
  * if (!target) {
1767
- * return { score: 0, misses: ['Target not available'] };
1758
+ * return { score: 0, assertions: [{ text: 'Target not available', passed: false }] };
1768
1759
  * }
1769
1760
  *
1770
1761
  * const response = await target.invoke({
@@ -1798,14 +1789,13 @@ type CodeJudgeHandler = CodeGraderHandler;
1798
1789
  *
1799
1790
  * export default defineCodeGrader(({ trace }) => {
1800
1791
  * if (!trace) {
1801
- * return { score: 0.5, reasoning: 'No trace available' };
1792
+ * return { score: 0.5, assertions: [{ text: 'No trace available', passed: false }] };
1802
1793
  * }
1803
1794
  *
1804
1795
  * const efficient = trace.eventCount <= 10;
1805
1796
  * return {
1806
1797
  * score: efficient ? 1.0 : 0.5,
1807
- * hits: efficient ? ['Efficient execution'] : [],
1808
- * misses: efficient ? [] : ['Too many tool calls'],
1798
+ * assertions: [{ text: efficient ? 'Efficient execution' : 'Too many tool calls', passed: efficient }],
1809
1799
  * };
1810
1800
  * });
1811
1801
  * ```
@@ -1889,7 +1879,7 @@ declare function definePromptTemplate(handler: PromptTemplateHandler): void;
1889
1879
  *
1890
1880
  * export default defineAssertion(({ outputText }) => ({
1891
1881
  * pass: outputText.toLowerCase().includes('hello'),
1892
- * reasoning: 'Checks for greeting',
1882
+ * assertions: [{ text: 'Checks for greeting', passed: outputText.toLowerCase().includes('hello') }],
1893
1883
  * }));
1894
1884
  * ```
1895
1885
  *
@@ -1902,9 +1892,9 @@ declare function definePromptTemplate(handler: PromptTemplateHandler): void;
1902
1892
  * const isEfficient = (trace?.eventCount ?? 0) <= 5 ? 0.5 : 0;
1903
1893
  * return {
1904
1894
  * score: hasContent + isEfficient,
1905
- * hits: [
1906
- * ...(hasContent ? ['Has content'] : []),
1907
- * ...(isEfficient ? ['Efficient'] : []),
1895
+ * assertions: [
1896
+ * { text: 'Has content', passed: !!hasContent },
1897
+ * { text: 'Efficient', passed: !!isEfficient },
1908
1898
  * ],
1909
1899
  * };
1910
1900
  * });