@agentv/eval 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/index.cjs +124 -11
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +295 -15
- package/dist/index.d.ts +295 -15
- package/dist/index.js +120 -10
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.d.cts
CHANGED
|
@@ -74,7 +74,7 @@ declare const TraceSummarySchema: z.ZodObject<{
|
|
|
74
74
|
toolDurations?: Record<string, number[]> | undefined;
|
|
75
75
|
}>;
|
|
76
76
|
/**
|
|
77
|
-
* Tool call schema
|
|
77
|
+
* Tool call schema.
|
|
78
78
|
*/
|
|
79
79
|
declare const ToolCallSchema: z.ZodObject<{
|
|
80
80
|
tool: z.ZodString;
|
|
@@ -96,9 +96,9 @@ declare const ToolCallSchema: z.ZodObject<{
|
|
|
96
96
|
timestamp?: string | undefined;
|
|
97
97
|
}>;
|
|
98
98
|
/**
|
|
99
|
-
*
|
|
99
|
+
* Unified message schema for input, expected, and output messages.
|
|
100
100
|
*/
|
|
101
|
-
declare const OutputMessageSchema: z.ZodObject<{
|
|
101
|
+
declare const MessageSchema: z.ZodObject<{
|
|
102
102
|
role: z.ZodEnum<["assistant", "user", "system", "tool"]>;
|
|
103
103
|
content: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodRecord<z.ZodString, z.ZodUnknown>, z.ZodArray<z.ZodRecord<z.ZodString, z.ZodUnknown>, "many">]>>;
|
|
104
104
|
toolCalls: z.ZodOptional<z.ZodArray<z.ZodObject<{
|
|
@@ -120,6 +120,7 @@ declare const OutputMessageSchema: z.ZodObject<{
|
|
|
120
120
|
id?: string | undefined;
|
|
121
121
|
timestamp?: string | undefined;
|
|
122
122
|
}>, "many">>;
|
|
123
|
+
name: z.ZodOptional<z.ZodString>;
|
|
123
124
|
timestamp: z.ZodOptional<z.ZodString>;
|
|
124
125
|
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
125
126
|
}, "strip", z.ZodTypeAny, {
|
|
@@ -133,6 +134,7 @@ declare const OutputMessageSchema: z.ZodObject<{
|
|
|
133
134
|
id?: string | undefined;
|
|
134
135
|
timestamp?: string | undefined;
|
|
135
136
|
}[] | undefined;
|
|
137
|
+
name?: string | undefined;
|
|
136
138
|
metadata?: Record<string, unknown> | undefined;
|
|
137
139
|
}, {
|
|
138
140
|
role: "tool" | "assistant" | "user" | "system";
|
|
@@ -145,6 +147,7 @@ declare const OutputMessageSchema: z.ZodObject<{
|
|
|
145
147
|
id?: string | undefined;
|
|
146
148
|
timestamp?: string | undefined;
|
|
147
149
|
}[] | undefined;
|
|
150
|
+
name?: string | undefined;
|
|
148
151
|
metadata?: Record<string, unknown> | undefined;
|
|
149
152
|
}>;
|
|
150
153
|
/**
|
|
@@ -153,7 +156,58 @@ declare const OutputMessageSchema: z.ZodObject<{
|
|
|
153
156
|
declare const CodeJudgeInputSchema: z.ZodObject<{
|
|
154
157
|
question: z.ZodString;
|
|
155
158
|
expectedOutcome: z.ZodString;
|
|
156
|
-
expectedMessages: z.ZodArray<z.
|
|
159
|
+
expectedMessages: z.ZodArray<z.ZodObject<{
|
|
160
|
+
role: z.ZodEnum<["assistant", "user", "system", "tool"]>;
|
|
161
|
+
content: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodRecord<z.ZodString, z.ZodUnknown>, z.ZodArray<z.ZodRecord<z.ZodString, z.ZodUnknown>, "many">]>>;
|
|
162
|
+
toolCalls: z.ZodOptional<z.ZodArray<z.ZodObject<{
|
|
163
|
+
tool: z.ZodString;
|
|
164
|
+
input: z.ZodOptional<z.ZodUnknown>;
|
|
165
|
+
output: z.ZodOptional<z.ZodUnknown>;
|
|
166
|
+
id: z.ZodOptional<z.ZodString>;
|
|
167
|
+
timestamp: z.ZodOptional<z.ZodString>;
|
|
168
|
+
}, "strip", z.ZodTypeAny, {
|
|
169
|
+
tool: string;
|
|
170
|
+
input?: unknown;
|
|
171
|
+
output?: unknown;
|
|
172
|
+
id?: string | undefined;
|
|
173
|
+
timestamp?: string | undefined;
|
|
174
|
+
}, {
|
|
175
|
+
tool: string;
|
|
176
|
+
input?: unknown;
|
|
177
|
+
output?: unknown;
|
|
178
|
+
id?: string | undefined;
|
|
179
|
+
timestamp?: string | undefined;
|
|
180
|
+
}>, "many">>;
|
|
181
|
+
name: z.ZodOptional<z.ZodString>;
|
|
182
|
+
timestamp: z.ZodOptional<z.ZodString>;
|
|
183
|
+
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
184
|
+
}, "strip", z.ZodTypeAny, {
|
|
185
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
186
|
+
timestamp?: string | undefined;
|
|
187
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
188
|
+
toolCalls?: {
|
|
189
|
+
tool: string;
|
|
190
|
+
input?: unknown;
|
|
191
|
+
output?: unknown;
|
|
192
|
+
id?: string | undefined;
|
|
193
|
+
timestamp?: string | undefined;
|
|
194
|
+
}[] | undefined;
|
|
195
|
+
name?: string | undefined;
|
|
196
|
+
metadata?: Record<string, unknown> | undefined;
|
|
197
|
+
}, {
|
|
198
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
199
|
+
timestamp?: string | undefined;
|
|
200
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
201
|
+
toolCalls?: {
|
|
202
|
+
tool: string;
|
|
203
|
+
input?: unknown;
|
|
204
|
+
output?: unknown;
|
|
205
|
+
id?: string | undefined;
|
|
206
|
+
timestamp?: string | undefined;
|
|
207
|
+
}[] | undefined;
|
|
208
|
+
name?: string | undefined;
|
|
209
|
+
metadata?: Record<string, unknown> | undefined;
|
|
210
|
+
}>, "many">;
|
|
157
211
|
referenceAnswer: z.ZodOptional<z.ZodString>;
|
|
158
212
|
candidateAnswer: z.ZodString;
|
|
159
213
|
outputMessages: z.ZodOptional<z.ZodNullable<z.ZodArray<z.ZodObject<{
|
|
@@ -178,6 +232,7 @@ declare const CodeJudgeInputSchema: z.ZodObject<{
|
|
|
178
232
|
id?: string | undefined;
|
|
179
233
|
timestamp?: string | undefined;
|
|
180
234
|
}>, "many">>;
|
|
235
|
+
name: z.ZodOptional<z.ZodString>;
|
|
181
236
|
timestamp: z.ZodOptional<z.ZodString>;
|
|
182
237
|
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
183
238
|
}, "strip", z.ZodTypeAny, {
|
|
@@ -191,6 +246,7 @@ declare const CodeJudgeInputSchema: z.ZodObject<{
|
|
|
191
246
|
id?: string | undefined;
|
|
192
247
|
timestamp?: string | undefined;
|
|
193
248
|
}[] | undefined;
|
|
249
|
+
name?: string | undefined;
|
|
194
250
|
metadata?: Record<string, unknown> | undefined;
|
|
195
251
|
}, {
|
|
196
252
|
role: "tool" | "assistant" | "user" | "system";
|
|
@@ -203,19 +259,62 @@ declare const CodeJudgeInputSchema: z.ZodObject<{
|
|
|
203
259
|
id?: string | undefined;
|
|
204
260
|
timestamp?: string | undefined;
|
|
205
261
|
}[] | undefined;
|
|
262
|
+
name?: string | undefined;
|
|
206
263
|
metadata?: Record<string, unknown> | undefined;
|
|
207
264
|
}>, "many">>>;
|
|
208
265
|
guidelineFiles: z.ZodArray<z.ZodString, "many">;
|
|
209
266
|
inputFiles: z.ZodArray<z.ZodString, "many">;
|
|
210
267
|
inputMessages: z.ZodArray<z.ZodObject<{
|
|
211
|
-
role: z.ZodEnum<["
|
|
212
|
-
content: z.ZodUnion<[z.ZodString, z.ZodRecord<z.ZodString, z.ZodUnknown>, z.ZodArray<z.ZodRecord<z.ZodString, z.ZodUnknown>, "many">]
|
|
268
|
+
role: z.ZodEnum<["assistant", "user", "system", "tool"]>;
|
|
269
|
+
content: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodRecord<z.ZodString, z.ZodUnknown>, z.ZodArray<z.ZodRecord<z.ZodString, z.ZodUnknown>, "many">]>>;
|
|
270
|
+
toolCalls: z.ZodOptional<z.ZodArray<z.ZodObject<{
|
|
271
|
+
tool: z.ZodString;
|
|
272
|
+
input: z.ZodOptional<z.ZodUnknown>;
|
|
273
|
+
output: z.ZodOptional<z.ZodUnknown>;
|
|
274
|
+
id: z.ZodOptional<z.ZodString>;
|
|
275
|
+
timestamp: z.ZodOptional<z.ZodString>;
|
|
276
|
+
}, "strip", z.ZodTypeAny, {
|
|
277
|
+
tool: string;
|
|
278
|
+
input?: unknown;
|
|
279
|
+
output?: unknown;
|
|
280
|
+
id?: string | undefined;
|
|
281
|
+
timestamp?: string | undefined;
|
|
282
|
+
}, {
|
|
283
|
+
tool: string;
|
|
284
|
+
input?: unknown;
|
|
285
|
+
output?: unknown;
|
|
286
|
+
id?: string | undefined;
|
|
287
|
+
timestamp?: string | undefined;
|
|
288
|
+
}>, "many">>;
|
|
289
|
+
name: z.ZodOptional<z.ZodString>;
|
|
290
|
+
timestamp: z.ZodOptional<z.ZodString>;
|
|
291
|
+
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
213
292
|
}, "strip", z.ZodTypeAny, {
|
|
214
293
|
role: "tool" | "assistant" | "user" | "system";
|
|
215
|
-
|
|
294
|
+
timestamp?: string | undefined;
|
|
295
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
296
|
+
toolCalls?: {
|
|
297
|
+
tool: string;
|
|
298
|
+
input?: unknown;
|
|
299
|
+
output?: unknown;
|
|
300
|
+
id?: string | undefined;
|
|
301
|
+
timestamp?: string | undefined;
|
|
302
|
+
}[] | undefined;
|
|
303
|
+
name?: string | undefined;
|
|
304
|
+
metadata?: Record<string, unknown> | undefined;
|
|
216
305
|
}, {
|
|
217
306
|
role: "tool" | "assistant" | "user" | "system";
|
|
218
|
-
|
|
307
|
+
timestamp?: string | undefined;
|
|
308
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
309
|
+
toolCalls?: {
|
|
310
|
+
tool: string;
|
|
311
|
+
input?: unknown;
|
|
312
|
+
output?: unknown;
|
|
313
|
+
id?: string | undefined;
|
|
314
|
+
timestamp?: string | undefined;
|
|
315
|
+
}[] | undefined;
|
|
316
|
+
name?: string | undefined;
|
|
317
|
+
metadata?: Record<string, unknown> | undefined;
|
|
219
318
|
}>, "many">;
|
|
220
319
|
traceSummary: z.ZodOptional<z.ZodNullable<z.ZodObject<{
|
|
221
320
|
eventCount: z.ZodNumber;
|
|
@@ -269,13 +368,36 @@ declare const CodeJudgeInputSchema: z.ZodObject<{
|
|
|
269
368
|
}, "strip", z.ZodTypeAny, {
|
|
270
369
|
question: string;
|
|
271
370
|
expectedOutcome: string;
|
|
272
|
-
expectedMessages:
|
|
371
|
+
expectedMessages: {
|
|
372
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
373
|
+
timestamp?: string | undefined;
|
|
374
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
375
|
+
toolCalls?: {
|
|
376
|
+
tool: string;
|
|
377
|
+
input?: unknown;
|
|
378
|
+
output?: unknown;
|
|
379
|
+
id?: string | undefined;
|
|
380
|
+
timestamp?: string | undefined;
|
|
381
|
+
}[] | undefined;
|
|
382
|
+
name?: string | undefined;
|
|
383
|
+
metadata?: Record<string, unknown> | undefined;
|
|
384
|
+
}[];
|
|
273
385
|
candidateAnswer: string;
|
|
274
386
|
guidelineFiles: string[];
|
|
275
387
|
inputFiles: string[];
|
|
276
388
|
inputMessages: {
|
|
277
389
|
role: "tool" | "assistant" | "user" | "system";
|
|
278
|
-
|
|
390
|
+
timestamp?: string | undefined;
|
|
391
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
392
|
+
toolCalls?: {
|
|
393
|
+
tool: string;
|
|
394
|
+
input?: unknown;
|
|
395
|
+
output?: unknown;
|
|
396
|
+
id?: string | undefined;
|
|
397
|
+
timestamp?: string | undefined;
|
|
398
|
+
}[] | undefined;
|
|
399
|
+
name?: string | undefined;
|
|
400
|
+
metadata?: Record<string, unknown> | undefined;
|
|
279
401
|
}[];
|
|
280
402
|
referenceAnswer?: string | undefined;
|
|
281
403
|
outputMessages?: {
|
|
@@ -289,6 +411,7 @@ declare const CodeJudgeInputSchema: z.ZodObject<{
|
|
|
289
411
|
id?: string | undefined;
|
|
290
412
|
timestamp?: string | undefined;
|
|
291
413
|
}[] | undefined;
|
|
414
|
+
name?: string | undefined;
|
|
292
415
|
metadata?: Record<string, unknown> | undefined;
|
|
293
416
|
}[] | null | undefined;
|
|
294
417
|
traceSummary?: {
|
|
@@ -309,13 +432,36 @@ declare const CodeJudgeInputSchema: z.ZodObject<{
|
|
|
309
432
|
}, {
|
|
310
433
|
question: string;
|
|
311
434
|
expectedOutcome: string;
|
|
312
|
-
expectedMessages:
|
|
435
|
+
expectedMessages: {
|
|
436
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
437
|
+
timestamp?: string | undefined;
|
|
438
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
439
|
+
toolCalls?: {
|
|
440
|
+
tool: string;
|
|
441
|
+
input?: unknown;
|
|
442
|
+
output?: unknown;
|
|
443
|
+
id?: string | undefined;
|
|
444
|
+
timestamp?: string | undefined;
|
|
445
|
+
}[] | undefined;
|
|
446
|
+
name?: string | undefined;
|
|
447
|
+
metadata?: Record<string, unknown> | undefined;
|
|
448
|
+
}[];
|
|
313
449
|
candidateAnswer: string;
|
|
314
450
|
guidelineFiles: string[];
|
|
315
451
|
inputFiles: string[];
|
|
316
452
|
inputMessages: {
|
|
317
453
|
role: "tool" | "assistant" | "user" | "system";
|
|
318
|
-
|
|
454
|
+
timestamp?: string | undefined;
|
|
455
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
456
|
+
toolCalls?: {
|
|
457
|
+
tool: string;
|
|
458
|
+
input?: unknown;
|
|
459
|
+
output?: unknown;
|
|
460
|
+
id?: string | undefined;
|
|
461
|
+
timestamp?: string | undefined;
|
|
462
|
+
}[] | undefined;
|
|
463
|
+
name?: string | undefined;
|
|
464
|
+
metadata?: Record<string, unknown> | undefined;
|
|
319
465
|
}[];
|
|
320
466
|
referenceAnswer?: string | undefined;
|
|
321
467
|
outputMessages?: {
|
|
@@ -329,6 +475,7 @@ declare const CodeJudgeInputSchema: z.ZodObject<{
|
|
|
329
475
|
id?: string | undefined;
|
|
330
476
|
timestamp?: string | undefined;
|
|
331
477
|
}[] | undefined;
|
|
478
|
+
name?: string | undefined;
|
|
332
479
|
metadata?: Record<string, unknown> | undefined;
|
|
333
480
|
}[] | null | undefined;
|
|
334
481
|
traceSummary?: {
|
|
@@ -355,16 +502,20 @@ declare const CodeJudgeResultSchema: z.ZodObject<{
|
|
|
355
502
|
hits: z.ZodDefault<z.ZodOptional<z.ZodArray<z.ZodString, "many">>>;
|
|
356
503
|
misses: z.ZodDefault<z.ZodOptional<z.ZodArray<z.ZodString, "many">>>;
|
|
357
504
|
reasoning: z.ZodOptional<z.ZodString>;
|
|
505
|
+
/** Optional structured details for domain-specific metrics (e.g., TP/TN/FP/FN counts, alignments). */
|
|
506
|
+
details: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
358
507
|
}, "strip", z.ZodTypeAny, {
|
|
359
508
|
score: number;
|
|
360
509
|
hits: string[];
|
|
361
510
|
misses: string[];
|
|
362
511
|
reasoning?: string | undefined;
|
|
512
|
+
details?: Record<string, unknown> | undefined;
|
|
363
513
|
}, {
|
|
364
514
|
score: number;
|
|
365
515
|
hits?: string[] | undefined;
|
|
366
516
|
misses?: string[] | undefined;
|
|
367
517
|
reasoning?: string | undefined;
|
|
518
|
+
details?: Record<string, unknown> | undefined;
|
|
368
519
|
}>;
|
|
369
520
|
/**
|
|
370
521
|
* Inferred types from schemas.
|
|
@@ -372,10 +523,118 @@ declare const CodeJudgeResultSchema: z.ZodObject<{
|
|
|
372
523
|
type CodeJudgeInput = z.infer<typeof CodeJudgeInputSchema>;
|
|
373
524
|
type CodeJudgeResult = z.infer<typeof CodeJudgeResultSchema>;
|
|
374
525
|
type TraceSummary = z.infer<typeof TraceSummarySchema>;
|
|
375
|
-
type
|
|
526
|
+
type Message = z.infer<typeof MessageSchema>;
|
|
376
527
|
type ToolCall = z.infer<typeof ToolCallSchema>;
|
|
377
528
|
type TokenUsage = z.infer<typeof TokenUsageSchema>;
|
|
378
529
|
|
|
530
|
+
/**
|
|
531
|
+
* Client for invoking configured targets from code_judge scripts.
|
|
532
|
+
*
|
|
533
|
+
* Environment variables (set automatically by AgentV when `target` config is present):
|
|
534
|
+
* - AGENTV_TARGET_PROXY_URL: The URL of the local proxy server
|
|
535
|
+
* - AGENTV_TARGET_PROXY_TOKEN: Bearer token for authentication
|
|
536
|
+
*/
|
|
537
|
+
/**
|
|
538
|
+
* Request to invoke the target
|
|
539
|
+
*/
|
|
540
|
+
interface TargetInvokeRequest {
|
|
541
|
+
readonly question: string;
|
|
542
|
+
readonly systemPrompt?: string;
|
|
543
|
+
readonly evalCaseId?: string;
|
|
544
|
+
readonly attempt?: number;
|
|
545
|
+
/** Optional target override - use a different target for this invocation */
|
|
546
|
+
readonly target?: string;
|
|
547
|
+
}
|
|
548
|
+
/**
|
|
549
|
+
* Response from a target invocation
|
|
550
|
+
*/
|
|
551
|
+
interface TargetInvokeResponse {
|
|
552
|
+
readonly outputMessages: readonly unknown[];
|
|
553
|
+
readonly rawText?: string;
|
|
554
|
+
}
|
|
555
|
+
/**
|
|
556
|
+
* Information about the target proxy configuration
|
|
557
|
+
*/
|
|
558
|
+
interface TargetInfo {
|
|
559
|
+
/** Name of the default target being used */
|
|
560
|
+
readonly targetName: string;
|
|
561
|
+
/** Maximum number of calls allowed */
|
|
562
|
+
readonly maxCalls: number;
|
|
563
|
+
/** Current number of calls made */
|
|
564
|
+
readonly callCount: number;
|
|
565
|
+
/** List of all available target names */
|
|
566
|
+
readonly availableTargets: readonly string[];
|
|
567
|
+
}
|
|
568
|
+
/**
|
|
569
|
+
* Target client for making target invocations
|
|
570
|
+
*/
|
|
571
|
+
interface TargetClient {
|
|
572
|
+
/**
|
|
573
|
+
* Invoke the configured target with a prompt.
|
|
574
|
+
* @param request - The question and optional system prompt
|
|
575
|
+
* @returns The target's response with output messages and optional raw text
|
|
576
|
+
*/
|
|
577
|
+
invoke(request: TargetInvokeRequest): Promise<TargetInvokeResponse>;
|
|
578
|
+
/**
|
|
579
|
+
* Invoke the target with multiple requests in sequence.
|
|
580
|
+
* Each request counts toward the max_calls limit.
|
|
581
|
+
* @param requests - Array of target requests
|
|
582
|
+
* @returns Array of target responses
|
|
583
|
+
*/
|
|
584
|
+
invokeBatch(requests: readonly TargetInvokeRequest[]): Promise<readonly TargetInvokeResponse[]>;
|
|
585
|
+
/**
|
|
586
|
+
* Get information about the target proxy configuration.
|
|
587
|
+
* Returns the default target name, max calls, current call count, and available targets.
|
|
588
|
+
*/
|
|
589
|
+
getInfo(): Promise<TargetInfo>;
|
|
590
|
+
}
|
|
591
|
+
/**
|
|
592
|
+
* Error thrown when target proxy is not available
|
|
593
|
+
*/
|
|
594
|
+
declare class TargetNotAvailableError extends Error {
|
|
595
|
+
constructor(message: string);
|
|
596
|
+
}
|
|
597
|
+
/**
|
|
598
|
+
* Error thrown when target invocation fails
|
|
599
|
+
*/
|
|
600
|
+
declare class TargetInvocationError extends Error {
|
|
601
|
+
readonly statusCode?: number;
|
|
602
|
+
constructor(message: string, statusCode?: number);
|
|
603
|
+
}
|
|
604
|
+
/**
|
|
605
|
+
* Create a target client from environment variables.
|
|
606
|
+
*
|
|
607
|
+
* This function reads the proxy URL and token from environment variables
|
|
608
|
+
* that are automatically set by AgentV when a `target` config block is present
|
|
609
|
+
* on a `code_judge` evaluator.
|
|
610
|
+
*
|
|
611
|
+
* @returns A target client if environment variables are set, otherwise undefined
|
|
612
|
+
* @throws TargetNotAvailableError if token is missing when URL is present
|
|
613
|
+
*
|
|
614
|
+
* @example
|
|
615
|
+
* ```typescript
|
|
616
|
+
* import { createTargetClient, defineCodeJudge } from '@agentv/eval';
|
|
617
|
+
*
|
|
618
|
+
* export default defineCodeJudge(async ({ question, expectedOutcome }) => {
|
|
619
|
+
* const target = createTargetClient();
|
|
620
|
+
*
|
|
621
|
+
* if (!target) {
|
|
622
|
+
* // Target not available - no target config on this evaluator
|
|
623
|
+
* return { score: 0.5, reasoning: 'Target not available' };
|
|
624
|
+
* }
|
|
625
|
+
*
|
|
626
|
+
* const response = await target.invoke({
|
|
627
|
+
* question: `Is this answer correct? Question: ${question}, Expected: ${expectedOutcome}`,
|
|
628
|
+
* systemPrompt: 'You are an expert evaluator. Respond with JSON: { "correct": true/false }'
|
|
629
|
+
* });
|
|
630
|
+
*
|
|
631
|
+
* const result = JSON.parse(response.rawText ?? '{}');
|
|
632
|
+
* return { score: result.correct ? 1.0 : 0.0 };
|
|
633
|
+
* });
|
|
634
|
+
* ```
|
|
635
|
+
*/
|
|
636
|
+
declare function createTargetClient(): TargetClient | undefined;
|
|
637
|
+
|
|
379
638
|
/**
|
|
380
639
|
* Handler function type for code judges.
|
|
381
640
|
*/
|
|
@@ -386,7 +645,7 @@ type CodeJudgeHandler = (input: CodeJudgeInput) => CodeJudgeResult | Promise<Cod
|
|
|
386
645
|
*
|
|
387
646
|
* Build custom code judges for evaluating AI agent outputs.
|
|
388
647
|
*
|
|
389
|
-
* @example
|
|
648
|
+
* @example Basic code judge
|
|
390
649
|
* ```typescript
|
|
391
650
|
* #!/usr/bin/env bun
|
|
392
651
|
* import { defineCodeJudge } from '@agentv/eval';
|
|
@@ -398,6 +657,27 @@ type CodeJudgeHandler = (input: CodeJudgeInput) => CodeJudgeResult | Promise<Cod
|
|
|
398
657
|
* }));
|
|
399
658
|
* ```
|
|
400
659
|
*
|
|
660
|
+
* @example Code judge with target access (requires `target` config in YAML)
|
|
661
|
+
* ```typescript
|
|
662
|
+
* #!/usr/bin/env bun
|
|
663
|
+
* import { defineCodeJudge, createTargetClient } from '@agentv/eval';
|
|
664
|
+
*
|
|
665
|
+
* export default defineCodeJudge(async ({ question }) => {
|
|
666
|
+
* const target = createTargetClient();
|
|
667
|
+
* if (!target) {
|
|
668
|
+
* return { score: 0, misses: ['Target not available'] };
|
|
669
|
+
* }
|
|
670
|
+
*
|
|
671
|
+
* const response = await target.invoke({
|
|
672
|
+
* question: `Evaluate: ${question}`,
|
|
673
|
+
* systemPrompt: 'Respond with JSON: { "score": 0-1 }'
|
|
674
|
+
* });
|
|
675
|
+
*
|
|
676
|
+
* const result = JSON.parse(response.rawText ?? '{}');
|
|
677
|
+
* return { score: result.score ?? 0 };
|
|
678
|
+
* });
|
|
679
|
+
* ```
|
|
680
|
+
*
|
|
401
681
|
* @packageDocumentation
|
|
402
682
|
*/
|
|
403
683
|
|
|
@@ -447,4 +727,4 @@ type CodeJudgeHandler = (input: CodeJudgeInput) => CodeJudgeResult | Promise<Cod
|
|
|
447
727
|
*/
|
|
448
728
|
declare function defineCodeJudge(handler: CodeJudgeHandler): void;
|
|
449
729
|
|
|
450
|
-
export { type CodeJudgeHandler, type CodeJudgeInput, CodeJudgeInputSchema, type CodeJudgeResult, CodeJudgeResultSchema, type
|
|
730
|
+
export { type CodeJudgeHandler, type CodeJudgeInput, CodeJudgeInputSchema, type CodeJudgeResult, CodeJudgeResultSchema, type Message, MessageSchema, type TargetClient, type TargetInfo, TargetInvocationError, type TargetInvokeRequest, type TargetInvokeResponse, TargetNotAvailableError, type TokenUsage, TokenUsageSchema, type ToolCall, ToolCallSchema, type TraceSummary, TraceSummarySchema, createTargetClient, defineCodeJudge };
|