@agentv/eval 3.4.0 → 3.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/index.cjs +23 -81
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +101 -111
- package/dist/index.d.ts +101 -111
- package/dist/index.js +23 -81
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.d.ts
CHANGED
|
@@ -153,8 +153,6 @@ declare const MessageSchema: z.ZodObject<{
|
|
|
153
153
|
* strings. Structured fields (`input`, `output`, `expectedOutput`) are always `Message[]`.
|
|
154
154
|
*/
|
|
155
155
|
declare const CodeGraderInputSchema: z.ZodObject<{
|
|
156
|
-
/** @deprecated Use `inputText` instead. First user message content as string. */
|
|
157
|
-
question: z.ZodString;
|
|
158
156
|
criteria: z.ZodString;
|
|
159
157
|
expectedOutput: z.ZodArray<z.ZodObject<{
|
|
160
158
|
role: z.ZodEnum<["assistant", "user", "system", "tool"]>;
|
|
@@ -224,10 +222,8 @@ declare const CodeGraderInputSchema: z.ZodObject<{
|
|
|
224
222
|
name?: string | undefined;
|
|
225
223
|
metadata?: Record<string, unknown> | undefined;
|
|
226
224
|
}>, "many">;
|
|
227
|
-
/**
|
|
228
|
-
|
|
229
|
-
/** @deprecated Use `outputText` instead. Last assistant message content as string. */
|
|
230
|
-
answer: z.ZodString;
|
|
225
|
+
/** Last assistant message content as string. */
|
|
226
|
+
outputText: z.ZodString;
|
|
231
227
|
output: z.ZodOptional<z.ZodNullable<z.ZodArray<z.ZodObject<{
|
|
232
228
|
role: z.ZodEnum<["assistant", "user", "system", "tool"]>;
|
|
233
229
|
content: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodRecord<z.ZodString, z.ZodUnknown>, z.ZodArray<z.ZodRecord<z.ZodString, z.ZodUnknown>, "many">]>>;
|
|
@@ -410,11 +406,9 @@ declare const CodeGraderInputSchema: z.ZodObject<{
|
|
|
410
406
|
fileChanges: z.ZodOptional<z.ZodNullable<z.ZodString>>;
|
|
411
407
|
workspacePath: z.ZodOptional<z.ZodNullable<z.ZodString>>;
|
|
412
408
|
config: z.ZodOptional<z.ZodNullable<z.ZodRecord<z.ZodString, z.ZodUnknown>>>;
|
|
413
|
-
/** First user message content as string.
|
|
414
|
-
inputText: z.
|
|
415
|
-
/**
|
|
416
|
-
outputText: z.ZodOptional<z.ZodString>;
|
|
417
|
-
/** Expected output content as string. Replaces `referenceAnswer`. */
|
|
409
|
+
/** First user message content as string. */
|
|
410
|
+
inputText: z.ZodString;
|
|
411
|
+
/** Expected output content as string. */
|
|
418
412
|
expectedOutputText: z.ZodOptional<z.ZodString>;
|
|
419
413
|
}, "strip", z.ZodTypeAny, {
|
|
420
414
|
input: {
|
|
@@ -435,7 +429,6 @@ declare const CodeGraderInputSchema: z.ZodObject<{
|
|
|
435
429
|
name?: string | undefined;
|
|
436
430
|
metadata?: Record<string, unknown> | undefined;
|
|
437
431
|
}[];
|
|
438
|
-
question: string;
|
|
439
432
|
criteria: string;
|
|
440
433
|
expectedOutput: {
|
|
441
434
|
role: "tool" | "assistant" | "user" | "system";
|
|
@@ -455,9 +448,10 @@ declare const CodeGraderInputSchema: z.ZodObject<{
|
|
|
455
448
|
name?: string | undefined;
|
|
456
449
|
metadata?: Record<string, unknown> | undefined;
|
|
457
450
|
}[];
|
|
458
|
-
|
|
451
|
+
outputText: string;
|
|
459
452
|
guidelineFiles: string[];
|
|
460
453
|
inputFiles: string[];
|
|
454
|
+
inputText: string;
|
|
461
455
|
output?: {
|
|
462
456
|
role: "tool" | "assistant" | "user" | "system";
|
|
463
457
|
startTime?: string | undefined;
|
|
@@ -479,7 +473,6 @@ declare const CodeGraderInputSchema: z.ZodObject<{
|
|
|
479
473
|
startTime?: string | null | undefined;
|
|
480
474
|
endTime?: string | null | undefined;
|
|
481
475
|
durationMs?: number | null | undefined;
|
|
482
|
-
referenceAnswer?: string | undefined;
|
|
483
476
|
outputPath?: string | undefined;
|
|
484
477
|
trace?: {
|
|
485
478
|
eventCount: number;
|
|
@@ -498,8 +491,6 @@ declare const CodeGraderInputSchema: z.ZodObject<{
|
|
|
498
491
|
fileChanges?: string | null | undefined;
|
|
499
492
|
workspacePath?: string | null | undefined;
|
|
500
493
|
config?: Record<string, unknown> | null | undefined;
|
|
501
|
-
inputText?: string | undefined;
|
|
502
|
-
outputText?: string | undefined;
|
|
503
494
|
expectedOutputText?: string | undefined;
|
|
504
495
|
}, {
|
|
505
496
|
input: {
|
|
@@ -520,7 +511,6 @@ declare const CodeGraderInputSchema: z.ZodObject<{
|
|
|
520
511
|
name?: string | undefined;
|
|
521
512
|
metadata?: Record<string, unknown> | undefined;
|
|
522
513
|
}[];
|
|
523
|
-
question: string;
|
|
524
514
|
criteria: string;
|
|
525
515
|
expectedOutput: {
|
|
526
516
|
role: "tool" | "assistant" | "user" | "system";
|
|
@@ -540,9 +530,10 @@ declare const CodeGraderInputSchema: z.ZodObject<{
|
|
|
540
530
|
name?: string | undefined;
|
|
541
531
|
metadata?: Record<string, unknown> | undefined;
|
|
542
532
|
}[];
|
|
543
|
-
|
|
533
|
+
outputText: string;
|
|
544
534
|
guidelineFiles: string[];
|
|
545
535
|
inputFiles: string[];
|
|
536
|
+
inputText: string;
|
|
546
537
|
output?: {
|
|
547
538
|
role: "tool" | "assistant" | "user" | "system";
|
|
548
539
|
startTime?: string | undefined;
|
|
@@ -564,7 +555,6 @@ declare const CodeGraderInputSchema: z.ZodObject<{
|
|
|
564
555
|
startTime?: string | null | undefined;
|
|
565
556
|
endTime?: string | null | undefined;
|
|
566
557
|
durationMs?: number | null | undefined;
|
|
567
|
-
referenceAnswer?: string | undefined;
|
|
568
558
|
outputPath?: string | undefined;
|
|
569
559
|
trace?: {
|
|
570
560
|
eventCount: number;
|
|
@@ -583,8 +573,6 @@ declare const CodeGraderInputSchema: z.ZodObject<{
|
|
|
583
573
|
fileChanges?: string | null | undefined;
|
|
584
574
|
workspacePath?: string | null | undefined;
|
|
585
575
|
config?: Record<string, unknown> | null | undefined;
|
|
586
|
-
inputText?: string | undefined;
|
|
587
|
-
outputText?: string | undefined;
|
|
588
576
|
expectedOutputText?: string | undefined;
|
|
589
577
|
}>;
|
|
590
578
|
/**
|
|
@@ -592,22 +580,36 @@ declare const CodeGraderInputSchema: z.ZodObject<{
|
|
|
592
580
|
*/
|
|
593
581
|
declare const CodeGraderResultSchema: z.ZodObject<{
|
|
594
582
|
score: z.ZodNumber;
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
583
|
+
assertions: z.ZodDefault<z.ZodOptional<z.ZodArray<z.ZodObject<{
|
|
584
|
+
text: z.ZodString;
|
|
585
|
+
passed: z.ZodBoolean;
|
|
586
|
+
evidence: z.ZodOptional<z.ZodString>;
|
|
587
|
+
}, "strip", z.ZodTypeAny, {
|
|
588
|
+
text: string;
|
|
589
|
+
passed: boolean;
|
|
590
|
+
evidence?: string | undefined;
|
|
591
|
+
}, {
|
|
592
|
+
text: string;
|
|
593
|
+
passed: boolean;
|
|
594
|
+
evidence?: string | undefined;
|
|
595
|
+
}>, "many">>>;
|
|
598
596
|
/** Optional structured details for domain-specific metrics (e.g., TP/TN/FP/FN counts, alignments). */
|
|
599
597
|
details: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
600
598
|
}, "strip", z.ZodTypeAny, {
|
|
601
599
|
score: number;
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
600
|
+
assertions: {
|
|
601
|
+
text: string;
|
|
602
|
+
passed: boolean;
|
|
603
|
+
evidence?: string | undefined;
|
|
604
|
+
}[];
|
|
605
605
|
details?: Record<string, unknown> | undefined;
|
|
606
606
|
}, {
|
|
607
607
|
score: number;
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
608
|
+
assertions?: {
|
|
609
|
+
text: string;
|
|
610
|
+
passed: boolean;
|
|
611
|
+
evidence?: string | undefined;
|
|
612
|
+
}[] | undefined;
|
|
611
613
|
details?: Record<string, unknown> | undefined;
|
|
612
614
|
}>;
|
|
613
615
|
/**
|
|
@@ -618,19 +620,15 @@ type CodeGraderResult = z.infer<typeof CodeGraderResultSchema>;
|
|
|
618
620
|
/**
|
|
619
621
|
* CodeGraderInput after `enrichInput()` has run.
|
|
620
622
|
*
|
|
621
|
-
* The text
|
|
623
|
+
* The text accessors (`inputText`, `outputText`, `expectedOutputText`)
|
|
622
624
|
* are always populated by the runtime before the handler is called, so they are
|
|
623
625
|
* guaranteed to be `string` (never `undefined`).
|
|
624
626
|
*
|
|
625
627
|
* Handler function signatures (`CodeGraderHandler`, `AssertionHandler`) use this
|
|
626
628
|
* type so that user code can destructure `{ outputText }` without null-checks.
|
|
627
629
|
*/
|
|
628
|
-
type EnrichedCodeGraderInput = Omit<CodeGraderInput, '
|
|
629
|
-
/**
|
|
630
|
-
readonly inputText: string;
|
|
631
|
-
/** Last assistant message content as string. Replaces `answer`. */
|
|
632
|
-
readonly outputText: string;
|
|
633
|
-
/** Expected output content as string. Replaces `referenceAnswer`. */
|
|
630
|
+
type EnrichedCodeGraderInput = Omit<CodeGraderInput, 'expectedOutputText'> & {
|
|
631
|
+
/** Expected output content as string. */
|
|
634
632
|
readonly expectedOutputText: string;
|
|
635
633
|
};
|
|
636
634
|
type TraceSummary = z.infer<typeof TraceSummarySchema>;
|
|
@@ -642,8 +640,6 @@ type TokenUsage = z.infer<typeof TokenUsageSchema>;
|
|
|
642
640
|
* Uses the same schema as CodeGraderInput since the orchestrator sends identical payloads.
|
|
643
641
|
*/
|
|
644
642
|
declare const PromptTemplateInputSchema: z.ZodObject<{
|
|
645
|
-
/** @deprecated Use `inputText` instead. First user message content as string. */
|
|
646
|
-
question: z.ZodString;
|
|
647
643
|
criteria: z.ZodString;
|
|
648
644
|
expectedOutput: z.ZodArray<z.ZodObject<{
|
|
649
645
|
role: z.ZodEnum<["assistant", "user", "system", "tool"]>;
|
|
@@ -713,10 +709,8 @@ declare const PromptTemplateInputSchema: z.ZodObject<{
|
|
|
713
709
|
name?: string | undefined;
|
|
714
710
|
metadata?: Record<string, unknown> | undefined;
|
|
715
711
|
}>, "many">;
|
|
716
|
-
/**
|
|
717
|
-
|
|
718
|
-
/** @deprecated Use `outputText` instead. Last assistant message content as string. */
|
|
719
|
-
answer: z.ZodString;
|
|
712
|
+
/** Last assistant message content as string. */
|
|
713
|
+
outputText: z.ZodString;
|
|
720
714
|
output: z.ZodOptional<z.ZodNullable<z.ZodArray<z.ZodObject<{
|
|
721
715
|
role: z.ZodEnum<["assistant", "user", "system", "tool"]>;
|
|
722
716
|
content: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodRecord<z.ZodString, z.ZodUnknown>, z.ZodArray<z.ZodRecord<z.ZodString, z.ZodUnknown>, "many">]>>;
|
|
@@ -899,11 +893,9 @@ declare const PromptTemplateInputSchema: z.ZodObject<{
|
|
|
899
893
|
fileChanges: z.ZodOptional<z.ZodNullable<z.ZodString>>;
|
|
900
894
|
workspacePath: z.ZodOptional<z.ZodNullable<z.ZodString>>;
|
|
901
895
|
config: z.ZodOptional<z.ZodNullable<z.ZodRecord<z.ZodString, z.ZodUnknown>>>;
|
|
902
|
-
/** First user message content as string.
|
|
903
|
-
inputText: z.
|
|
904
|
-
/**
|
|
905
|
-
outputText: z.ZodOptional<z.ZodString>;
|
|
906
|
-
/** Expected output content as string. Replaces `referenceAnswer`. */
|
|
896
|
+
/** First user message content as string. */
|
|
897
|
+
inputText: z.ZodString;
|
|
898
|
+
/** Expected output content as string. */
|
|
907
899
|
expectedOutputText: z.ZodOptional<z.ZodString>;
|
|
908
900
|
}, "strip", z.ZodTypeAny, {
|
|
909
901
|
input: {
|
|
@@ -924,7 +916,6 @@ declare const PromptTemplateInputSchema: z.ZodObject<{
|
|
|
924
916
|
name?: string | undefined;
|
|
925
917
|
metadata?: Record<string, unknown> | undefined;
|
|
926
918
|
}[];
|
|
927
|
-
question: string;
|
|
928
919
|
criteria: string;
|
|
929
920
|
expectedOutput: {
|
|
930
921
|
role: "tool" | "assistant" | "user" | "system";
|
|
@@ -944,9 +935,10 @@ declare const PromptTemplateInputSchema: z.ZodObject<{
|
|
|
944
935
|
name?: string | undefined;
|
|
945
936
|
metadata?: Record<string, unknown> | undefined;
|
|
946
937
|
}[];
|
|
947
|
-
|
|
938
|
+
outputText: string;
|
|
948
939
|
guidelineFiles: string[];
|
|
949
940
|
inputFiles: string[];
|
|
941
|
+
inputText: string;
|
|
950
942
|
output?: {
|
|
951
943
|
role: "tool" | "assistant" | "user" | "system";
|
|
952
944
|
startTime?: string | undefined;
|
|
@@ -968,7 +960,6 @@ declare const PromptTemplateInputSchema: z.ZodObject<{
|
|
|
968
960
|
startTime?: string | null | undefined;
|
|
969
961
|
endTime?: string | null | undefined;
|
|
970
962
|
durationMs?: number | null | undefined;
|
|
971
|
-
referenceAnswer?: string | undefined;
|
|
972
963
|
outputPath?: string | undefined;
|
|
973
964
|
trace?: {
|
|
974
965
|
eventCount: number;
|
|
@@ -987,8 +978,6 @@ declare const PromptTemplateInputSchema: z.ZodObject<{
|
|
|
987
978
|
fileChanges?: string | null | undefined;
|
|
988
979
|
workspacePath?: string | null | undefined;
|
|
989
980
|
config?: Record<string, unknown> | null | undefined;
|
|
990
|
-
inputText?: string | undefined;
|
|
991
|
-
outputText?: string | undefined;
|
|
992
981
|
expectedOutputText?: string | undefined;
|
|
993
982
|
}, {
|
|
994
983
|
input: {
|
|
@@ -1009,7 +998,6 @@ declare const PromptTemplateInputSchema: z.ZodObject<{
|
|
|
1009
998
|
name?: string | undefined;
|
|
1010
999
|
metadata?: Record<string, unknown> | undefined;
|
|
1011
1000
|
}[];
|
|
1012
|
-
question: string;
|
|
1013
1001
|
criteria: string;
|
|
1014
1002
|
expectedOutput: {
|
|
1015
1003
|
role: "tool" | "assistant" | "user" | "system";
|
|
@@ -1029,9 +1017,10 @@ declare const PromptTemplateInputSchema: z.ZodObject<{
|
|
|
1029
1017
|
name?: string | undefined;
|
|
1030
1018
|
metadata?: Record<string, unknown> | undefined;
|
|
1031
1019
|
}[];
|
|
1032
|
-
|
|
1020
|
+
outputText: string;
|
|
1033
1021
|
guidelineFiles: string[];
|
|
1034
1022
|
inputFiles: string[];
|
|
1023
|
+
inputText: string;
|
|
1035
1024
|
output?: {
|
|
1036
1025
|
role: "tool" | "assistant" | "user" | "system";
|
|
1037
1026
|
startTime?: string | undefined;
|
|
@@ -1053,7 +1042,6 @@ declare const PromptTemplateInputSchema: z.ZodObject<{
|
|
|
1053
1042
|
startTime?: string | null | undefined;
|
|
1054
1043
|
endTime?: string | null | undefined;
|
|
1055
1044
|
durationMs?: number | null | undefined;
|
|
1056
|
-
referenceAnswer?: string | undefined;
|
|
1057
1045
|
outputPath?: string | undefined;
|
|
1058
1046
|
trace?: {
|
|
1059
1047
|
eventCount: number;
|
|
@@ -1072,15 +1060,11 @@ declare const PromptTemplateInputSchema: z.ZodObject<{
|
|
|
1072
1060
|
fileChanges?: string | null | undefined;
|
|
1073
1061
|
workspacePath?: string | null | undefined;
|
|
1074
1062
|
config?: Record<string, unknown> | null | undefined;
|
|
1075
|
-
inputText?: string | undefined;
|
|
1076
|
-
outputText?: string | undefined;
|
|
1077
1063
|
expectedOutputText?: string | undefined;
|
|
1078
1064
|
}>;
|
|
1079
1065
|
type PromptTemplateInput = CodeGraderInput;
|
|
1080
1066
|
/** @deprecated Use CodeGraderInputSchema */
|
|
1081
1067
|
declare const CodeJudgeInputSchema: z.ZodObject<{
|
|
1082
|
-
/** @deprecated Use `inputText` instead. First user message content as string. */
|
|
1083
|
-
question: z.ZodString;
|
|
1084
1068
|
criteria: z.ZodString;
|
|
1085
1069
|
expectedOutput: z.ZodArray<z.ZodObject<{
|
|
1086
1070
|
role: z.ZodEnum<["assistant", "user", "system", "tool"]>;
|
|
@@ -1150,10 +1134,8 @@ declare const CodeJudgeInputSchema: z.ZodObject<{
|
|
|
1150
1134
|
name?: string | undefined;
|
|
1151
1135
|
metadata?: Record<string, unknown> | undefined;
|
|
1152
1136
|
}>, "many">;
|
|
1153
|
-
/**
|
|
1154
|
-
|
|
1155
|
-
/** @deprecated Use `outputText` instead. Last assistant message content as string. */
|
|
1156
|
-
answer: z.ZodString;
|
|
1137
|
+
/** Last assistant message content as string. */
|
|
1138
|
+
outputText: z.ZodString;
|
|
1157
1139
|
output: z.ZodOptional<z.ZodNullable<z.ZodArray<z.ZodObject<{
|
|
1158
1140
|
role: z.ZodEnum<["assistant", "user", "system", "tool"]>;
|
|
1159
1141
|
content: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodRecord<z.ZodString, z.ZodUnknown>, z.ZodArray<z.ZodRecord<z.ZodString, z.ZodUnknown>, "many">]>>;
|
|
@@ -1336,11 +1318,9 @@ declare const CodeJudgeInputSchema: z.ZodObject<{
|
|
|
1336
1318
|
fileChanges: z.ZodOptional<z.ZodNullable<z.ZodString>>;
|
|
1337
1319
|
workspacePath: z.ZodOptional<z.ZodNullable<z.ZodString>>;
|
|
1338
1320
|
config: z.ZodOptional<z.ZodNullable<z.ZodRecord<z.ZodString, z.ZodUnknown>>>;
|
|
1339
|
-
/** First user message content as string.
|
|
1340
|
-
inputText: z.
|
|
1341
|
-
/**
|
|
1342
|
-
outputText: z.ZodOptional<z.ZodString>;
|
|
1343
|
-
/** Expected output content as string. Replaces `referenceAnswer`. */
|
|
1321
|
+
/** First user message content as string. */
|
|
1322
|
+
inputText: z.ZodString;
|
|
1323
|
+
/** Expected output content as string. */
|
|
1344
1324
|
expectedOutputText: z.ZodOptional<z.ZodString>;
|
|
1345
1325
|
}, "strip", z.ZodTypeAny, {
|
|
1346
1326
|
input: {
|
|
@@ -1361,7 +1341,6 @@ declare const CodeJudgeInputSchema: z.ZodObject<{
|
|
|
1361
1341
|
name?: string | undefined;
|
|
1362
1342
|
metadata?: Record<string, unknown> | undefined;
|
|
1363
1343
|
}[];
|
|
1364
|
-
question: string;
|
|
1365
1344
|
criteria: string;
|
|
1366
1345
|
expectedOutput: {
|
|
1367
1346
|
role: "tool" | "assistant" | "user" | "system";
|
|
@@ -1381,9 +1360,10 @@ declare const CodeJudgeInputSchema: z.ZodObject<{
|
|
|
1381
1360
|
name?: string | undefined;
|
|
1382
1361
|
metadata?: Record<string, unknown> | undefined;
|
|
1383
1362
|
}[];
|
|
1384
|
-
|
|
1363
|
+
outputText: string;
|
|
1385
1364
|
guidelineFiles: string[];
|
|
1386
1365
|
inputFiles: string[];
|
|
1366
|
+
inputText: string;
|
|
1387
1367
|
output?: {
|
|
1388
1368
|
role: "tool" | "assistant" | "user" | "system";
|
|
1389
1369
|
startTime?: string | undefined;
|
|
@@ -1405,7 +1385,6 @@ declare const CodeJudgeInputSchema: z.ZodObject<{
|
|
|
1405
1385
|
startTime?: string | null | undefined;
|
|
1406
1386
|
endTime?: string | null | undefined;
|
|
1407
1387
|
durationMs?: number | null | undefined;
|
|
1408
|
-
referenceAnswer?: string | undefined;
|
|
1409
1388
|
outputPath?: string | undefined;
|
|
1410
1389
|
trace?: {
|
|
1411
1390
|
eventCount: number;
|
|
@@ -1424,8 +1403,6 @@ declare const CodeJudgeInputSchema: z.ZodObject<{
|
|
|
1424
1403
|
fileChanges?: string | null | undefined;
|
|
1425
1404
|
workspacePath?: string | null | undefined;
|
|
1426
1405
|
config?: Record<string, unknown> | null | undefined;
|
|
1427
|
-
inputText?: string | undefined;
|
|
1428
|
-
outputText?: string | undefined;
|
|
1429
1406
|
expectedOutputText?: string | undefined;
|
|
1430
1407
|
}, {
|
|
1431
1408
|
input: {
|
|
@@ -1446,7 +1423,6 @@ declare const CodeJudgeInputSchema: z.ZodObject<{
|
|
|
1446
1423
|
name?: string | undefined;
|
|
1447
1424
|
metadata?: Record<string, unknown> | undefined;
|
|
1448
1425
|
}[];
|
|
1449
|
-
question: string;
|
|
1450
1426
|
criteria: string;
|
|
1451
1427
|
expectedOutput: {
|
|
1452
1428
|
role: "tool" | "assistant" | "user" | "system";
|
|
@@ -1466,9 +1442,10 @@ declare const CodeJudgeInputSchema: z.ZodObject<{
|
|
|
1466
1442
|
name?: string | undefined;
|
|
1467
1443
|
metadata?: Record<string, unknown> | undefined;
|
|
1468
1444
|
}[];
|
|
1469
|
-
|
|
1445
|
+
outputText: string;
|
|
1470
1446
|
guidelineFiles: string[];
|
|
1471
1447
|
inputFiles: string[];
|
|
1448
|
+
inputText: string;
|
|
1472
1449
|
output?: {
|
|
1473
1450
|
role: "tool" | "assistant" | "user" | "system";
|
|
1474
1451
|
startTime?: string | undefined;
|
|
@@ -1490,7 +1467,6 @@ declare const CodeJudgeInputSchema: z.ZodObject<{
|
|
|
1490
1467
|
startTime?: string | null | undefined;
|
|
1491
1468
|
endTime?: string | null | undefined;
|
|
1492
1469
|
durationMs?: number | null | undefined;
|
|
1493
|
-
referenceAnswer?: string | undefined;
|
|
1494
1470
|
outputPath?: string | undefined;
|
|
1495
1471
|
trace?: {
|
|
1496
1472
|
eventCount: number;
|
|
@@ -1509,29 +1485,41 @@ declare const CodeJudgeInputSchema: z.ZodObject<{
|
|
|
1509
1485
|
fileChanges?: string | null | undefined;
|
|
1510
1486
|
workspacePath?: string | null | undefined;
|
|
1511
1487
|
config?: Record<string, unknown> | null | undefined;
|
|
1512
|
-
inputText?: string | undefined;
|
|
1513
|
-
outputText?: string | undefined;
|
|
1514
1488
|
expectedOutputText?: string | undefined;
|
|
1515
1489
|
}>;
|
|
1516
1490
|
/** @deprecated Use CodeGraderResultSchema */
|
|
1517
1491
|
declare const CodeJudgeResultSchema: z.ZodObject<{
|
|
1518
1492
|
score: z.ZodNumber;
|
|
1519
|
-
|
|
1520
|
-
|
|
1521
|
-
|
|
1493
|
+
assertions: z.ZodDefault<z.ZodOptional<z.ZodArray<z.ZodObject<{
|
|
1494
|
+
text: z.ZodString;
|
|
1495
|
+
passed: z.ZodBoolean;
|
|
1496
|
+
evidence: z.ZodOptional<z.ZodString>;
|
|
1497
|
+
}, "strip", z.ZodTypeAny, {
|
|
1498
|
+
text: string;
|
|
1499
|
+
passed: boolean;
|
|
1500
|
+
evidence?: string | undefined;
|
|
1501
|
+
}, {
|
|
1502
|
+
text: string;
|
|
1503
|
+
passed: boolean;
|
|
1504
|
+
evidence?: string | undefined;
|
|
1505
|
+
}>, "many">>>;
|
|
1522
1506
|
/** Optional structured details for domain-specific metrics (e.g., TP/TN/FP/FN counts, alignments). */
|
|
1523
1507
|
details: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
1524
1508
|
}, "strip", z.ZodTypeAny, {
|
|
1525
1509
|
score: number;
|
|
1526
|
-
|
|
1527
|
-
|
|
1528
|
-
|
|
1510
|
+
assertions: {
|
|
1511
|
+
text: string;
|
|
1512
|
+
passed: boolean;
|
|
1513
|
+
evidence?: string | undefined;
|
|
1514
|
+
}[];
|
|
1529
1515
|
details?: Record<string, unknown> | undefined;
|
|
1530
1516
|
}, {
|
|
1531
1517
|
score: number;
|
|
1532
|
-
|
|
1533
|
-
|
|
1534
|
-
|
|
1518
|
+
assertions?: {
|
|
1519
|
+
text: string;
|
|
1520
|
+
passed: boolean;
|
|
1521
|
+
evidence?: string | undefined;
|
|
1522
|
+
}[] | undefined;
|
|
1535
1523
|
details?: Record<string, unknown> | undefined;
|
|
1536
1524
|
}>;
|
|
1537
1525
|
/** @deprecated Use CodeGraderInput */
|
|
@@ -1634,7 +1622,7 @@ declare class TargetInvocationError extends Error {
|
|
|
1634
1622
|
*
|
|
1635
1623
|
* if (!target) {
|
|
1636
1624
|
* // Target not available - no target config on this evaluator
|
|
1637
|
-
* return { score: 0.5,
|
|
1625
|
+
* return { score: 0.5, assertions: [{ text: 'Target not available', passed: false }] };
|
|
1638
1626
|
* }
|
|
1639
1627
|
*
|
|
1640
1628
|
* const response = await target.invoke({
|
|
@@ -1675,19 +1663,23 @@ type AssertionType = 'llm-grader' | 'code-grader' | 'rubrics' | 'composite' | 't
|
|
|
1675
1663
|
/**
|
|
1676
1664
|
* Result returned from an assertion handler.
|
|
1677
1665
|
*
|
|
1678
|
-
* @example Pass with
|
|
1666
|
+
* @example Pass with score
|
|
1679
1667
|
* ```ts
|
|
1680
|
-
* { pass: true,
|
|
1668
|
+
* { pass: true, assertions: [{ text: 'Output contains expected keywords', passed: true }] }
|
|
1681
1669
|
* ```
|
|
1682
1670
|
*
|
|
1683
|
-
* @example Fail with
|
|
1671
|
+
* @example Fail with evidence
|
|
1684
1672
|
* ```ts
|
|
1685
|
-
* { pass: false,
|
|
1673
|
+
* { pass: false, score: 0.3, assertions: [{ text: 'Missing required header', passed: false }] }
|
|
1686
1674
|
* ```
|
|
1687
1675
|
*
|
|
1688
1676
|
* @example Granular score (0-1)
|
|
1689
1677
|
* ```ts
|
|
1690
|
-
* { score: 0.75,
|
|
1678
|
+
* { score: 0.75, assertions: [
|
|
1679
|
+
* { text: 'Format correct', passed: true },
|
|
1680
|
+
* { text: 'Content relevant', passed: true },
|
|
1681
|
+
* { text: 'Missing citation', passed: false },
|
|
1682
|
+
* ] }
|
|
1691
1683
|
* ```
|
|
1692
1684
|
*/
|
|
1693
1685
|
interface AssertionScore {
|
|
@@ -1695,12 +1687,12 @@ interface AssertionScore {
|
|
|
1695
1687
|
readonly pass?: boolean;
|
|
1696
1688
|
/** Numeric score between 0 and 1. Defaults to 1 if pass=true, 0 if pass=false. */
|
|
1697
1689
|
readonly score?: number;
|
|
1698
|
-
/**
|
|
1699
|
-
readonly
|
|
1700
|
-
|
|
1701
|
-
|
|
1702
|
-
|
|
1703
|
-
|
|
1690
|
+
/** Per-assertion verdicts with optional evidence. */
|
|
1691
|
+
readonly assertions?: readonly {
|
|
1692
|
+
readonly text: string;
|
|
1693
|
+
readonly passed: boolean;
|
|
1694
|
+
readonly evidence?: string;
|
|
1695
|
+
}[];
|
|
1704
1696
|
/** Optional structured details for domain-specific metrics. */
|
|
1705
1697
|
readonly details?: Record<string, unknown>;
|
|
1706
1698
|
}
|
|
@@ -1740,7 +1732,7 @@ type CodeJudgeHandler = CodeGraderHandler;
|
|
|
1740
1732
|
*
|
|
1741
1733
|
* export default defineAssertion(({ outputText }) => ({
|
|
1742
1734
|
* pass: outputText.includes('hello'),
|
|
1743
|
-
*
|
|
1735
|
+
* assertions: [{ text: 'Checks greeting', passed: outputText.includes('hello') }],
|
|
1744
1736
|
* }));
|
|
1745
1737
|
* ```
|
|
1746
1738
|
*
|
|
@@ -1751,8 +1743,7 @@ type CodeJudgeHandler = CodeGraderHandler;
|
|
|
1751
1743
|
*
|
|
1752
1744
|
* export default defineCodeGrader(({ trace, outputText }) => ({
|
|
1753
1745
|
* score: trace?.eventCount <= 5 ? 1.0 : 0.5,
|
|
1754
|
-
*
|
|
1755
|
-
* misses: [],
|
|
1746
|
+
* assertions: [{ text: 'Efficient tool usage', passed: trace?.eventCount <= 5 }],
|
|
1756
1747
|
* }));
|
|
1757
1748
|
* ```
|
|
1758
1749
|
*
|
|
@@ -1764,7 +1755,7 @@ type CodeJudgeHandler = CodeGraderHandler;
|
|
|
1764
1755
|
* export default defineCodeGrader(async ({ inputText }) => {
|
|
1765
1756
|
* const target = createTargetClient();
|
|
1766
1757
|
* if (!target) {
|
|
1767
|
-
* return { score: 0,
|
|
1758
|
+
* return { score: 0, assertions: [{ text: 'Target not available', passed: false }] };
|
|
1768
1759
|
* }
|
|
1769
1760
|
*
|
|
1770
1761
|
* const response = await target.invoke({
|
|
@@ -1798,14 +1789,13 @@ type CodeJudgeHandler = CodeGraderHandler;
|
|
|
1798
1789
|
*
|
|
1799
1790
|
* export default defineCodeGrader(({ trace }) => {
|
|
1800
1791
|
* if (!trace) {
|
|
1801
|
-
* return { score: 0.5,
|
|
1792
|
+
* return { score: 0.5, assertions: [{ text: 'No trace available', passed: false }] };
|
|
1802
1793
|
* }
|
|
1803
1794
|
*
|
|
1804
1795
|
* const efficient = trace.eventCount <= 10;
|
|
1805
1796
|
* return {
|
|
1806
1797
|
* score: efficient ? 1.0 : 0.5,
|
|
1807
|
-
*
|
|
1808
|
-
* misses: efficient ? [] : ['Too many tool calls'],
|
|
1798
|
+
* assertions: [{ text: efficient ? 'Efficient execution' : 'Too many tool calls', passed: efficient }],
|
|
1809
1799
|
* };
|
|
1810
1800
|
* });
|
|
1811
1801
|
* ```
|
|
@@ -1889,7 +1879,7 @@ declare function definePromptTemplate(handler: PromptTemplateHandler): void;
|
|
|
1889
1879
|
*
|
|
1890
1880
|
* export default defineAssertion(({ outputText }) => ({
|
|
1891
1881
|
* pass: outputText.toLowerCase().includes('hello'),
|
|
1892
|
-
*
|
|
1882
|
+
* assertions: [{ text: 'Checks for greeting', passed: outputText.toLowerCase().includes('hello') }],
|
|
1893
1883
|
* }));
|
|
1894
1884
|
* ```
|
|
1895
1885
|
*
|
|
@@ -1902,9 +1892,9 @@ declare function definePromptTemplate(handler: PromptTemplateHandler): void;
|
|
|
1902
1892
|
* const isEfficient = (trace?.eventCount ?? 0) <= 5 ? 0.5 : 0;
|
|
1903
1893
|
* return {
|
|
1904
1894
|
* score: hasContent + isEfficient,
|
|
1905
|
-
*
|
|
1906
|
-
*
|
|
1907
|
-
*
|
|
1895
|
+
* assertions: [
|
|
1896
|
+
* { text: 'Has content', passed: !!hasContent },
|
|
1897
|
+
* { text: 'Efficient', passed: !!isEfficient },
|
|
1908
1898
|
* ],
|
|
1909
1899
|
* };
|
|
1910
1900
|
* });
|