@agentv/eval 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/index.cjs +124 -13
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +295 -23
- package/dist/index.d.ts +295 -23
- package/dist/index.js +120 -12
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.d.ts
CHANGED
|
@@ -74,7 +74,7 @@ declare const TraceSummarySchema: z.ZodObject<{
|
|
|
74
74
|
toolDurations?: Record<string, number[]> | undefined;
|
|
75
75
|
}>;
|
|
76
76
|
/**
|
|
77
|
-
* Tool call schema
|
|
77
|
+
* Tool call schema.
|
|
78
78
|
*/
|
|
79
79
|
declare const ToolCallSchema: z.ZodObject<{
|
|
80
80
|
tool: z.ZodString;
|
|
@@ -96,11 +96,10 @@ declare const ToolCallSchema: z.ZodObject<{
|
|
|
96
96
|
timestamp?: string | undefined;
|
|
97
97
|
}>;
|
|
98
98
|
/**
|
|
99
|
-
*
|
|
99
|
+
* Unified message schema for input, expected, and output messages.
|
|
100
100
|
*/
|
|
101
|
-
declare const OutputMessageSchema: z.ZodObject<{
|
|
101
|
+
declare const MessageSchema: z.ZodObject<{
|
|
102
102
|
role: z.ZodEnum<["assistant", "user", "system", "tool"]>;
|
|
103
|
-
name: z.ZodOptional<z.ZodString>;
|
|
104
103
|
content: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodRecord<z.ZodString, z.ZodUnknown>, z.ZodArray<z.ZodRecord<z.ZodString, z.ZodUnknown>, "many">]>>;
|
|
105
104
|
toolCalls: z.ZodOptional<z.ZodArray<z.ZodObject<{
|
|
106
105
|
tool: z.ZodString;
|
|
@@ -121,12 +120,12 @@ declare const OutputMessageSchema: z.ZodObject<{
|
|
|
121
120
|
id?: string | undefined;
|
|
122
121
|
timestamp?: string | undefined;
|
|
123
122
|
}>, "many">>;
|
|
123
|
+
name: z.ZodOptional<z.ZodString>;
|
|
124
124
|
timestamp: z.ZodOptional<z.ZodString>;
|
|
125
125
|
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
126
126
|
}, "strip", z.ZodTypeAny, {
|
|
127
127
|
role: "tool" | "assistant" | "user" | "system";
|
|
128
128
|
timestamp?: string | undefined;
|
|
129
|
-
name?: string | undefined;
|
|
130
129
|
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
131
130
|
toolCalls?: {
|
|
132
131
|
tool: string;
|
|
@@ -135,11 +134,11 @@ declare const OutputMessageSchema: z.ZodObject<{
|
|
|
135
134
|
id?: string | undefined;
|
|
136
135
|
timestamp?: string | undefined;
|
|
137
136
|
}[] | undefined;
|
|
137
|
+
name?: string | undefined;
|
|
138
138
|
metadata?: Record<string, unknown> | undefined;
|
|
139
139
|
}, {
|
|
140
140
|
role: "tool" | "assistant" | "user" | "system";
|
|
141
141
|
timestamp?: string | undefined;
|
|
142
|
-
name?: string | undefined;
|
|
143
142
|
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
144
143
|
toolCalls?: {
|
|
145
144
|
tool: string;
|
|
@@ -148,6 +147,7 @@ declare const OutputMessageSchema: z.ZodObject<{
|
|
|
148
147
|
id?: string | undefined;
|
|
149
148
|
timestamp?: string | undefined;
|
|
150
149
|
}[] | undefined;
|
|
150
|
+
name?: string | undefined;
|
|
151
151
|
metadata?: Record<string, unknown> | undefined;
|
|
152
152
|
}>;
|
|
153
153
|
/**
|
|
@@ -156,12 +156,62 @@ declare const OutputMessageSchema: z.ZodObject<{
|
|
|
156
156
|
declare const CodeJudgeInputSchema: z.ZodObject<{
|
|
157
157
|
question: z.ZodString;
|
|
158
158
|
expectedOutcome: z.ZodString;
|
|
159
|
-
expectedMessages: z.ZodArray<z.
|
|
159
|
+
expectedMessages: z.ZodArray<z.ZodObject<{
|
|
160
|
+
role: z.ZodEnum<["assistant", "user", "system", "tool"]>;
|
|
161
|
+
content: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodRecord<z.ZodString, z.ZodUnknown>, z.ZodArray<z.ZodRecord<z.ZodString, z.ZodUnknown>, "many">]>>;
|
|
162
|
+
toolCalls: z.ZodOptional<z.ZodArray<z.ZodObject<{
|
|
163
|
+
tool: z.ZodString;
|
|
164
|
+
input: z.ZodOptional<z.ZodUnknown>;
|
|
165
|
+
output: z.ZodOptional<z.ZodUnknown>;
|
|
166
|
+
id: z.ZodOptional<z.ZodString>;
|
|
167
|
+
timestamp: z.ZodOptional<z.ZodString>;
|
|
168
|
+
}, "strip", z.ZodTypeAny, {
|
|
169
|
+
tool: string;
|
|
170
|
+
input?: unknown;
|
|
171
|
+
output?: unknown;
|
|
172
|
+
id?: string | undefined;
|
|
173
|
+
timestamp?: string | undefined;
|
|
174
|
+
}, {
|
|
175
|
+
tool: string;
|
|
176
|
+
input?: unknown;
|
|
177
|
+
output?: unknown;
|
|
178
|
+
id?: string | undefined;
|
|
179
|
+
timestamp?: string | undefined;
|
|
180
|
+
}>, "many">>;
|
|
181
|
+
name: z.ZodOptional<z.ZodString>;
|
|
182
|
+
timestamp: z.ZodOptional<z.ZodString>;
|
|
183
|
+
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
184
|
+
}, "strip", z.ZodTypeAny, {
|
|
185
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
186
|
+
timestamp?: string | undefined;
|
|
187
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
188
|
+
toolCalls?: {
|
|
189
|
+
tool: string;
|
|
190
|
+
input?: unknown;
|
|
191
|
+
output?: unknown;
|
|
192
|
+
id?: string | undefined;
|
|
193
|
+
timestamp?: string | undefined;
|
|
194
|
+
}[] | undefined;
|
|
195
|
+
name?: string | undefined;
|
|
196
|
+
metadata?: Record<string, unknown> | undefined;
|
|
197
|
+
}, {
|
|
198
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
199
|
+
timestamp?: string | undefined;
|
|
200
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
201
|
+
toolCalls?: {
|
|
202
|
+
tool: string;
|
|
203
|
+
input?: unknown;
|
|
204
|
+
output?: unknown;
|
|
205
|
+
id?: string | undefined;
|
|
206
|
+
timestamp?: string | undefined;
|
|
207
|
+
}[] | undefined;
|
|
208
|
+
name?: string | undefined;
|
|
209
|
+
metadata?: Record<string, unknown> | undefined;
|
|
210
|
+
}>, "many">;
|
|
160
211
|
referenceAnswer: z.ZodOptional<z.ZodString>;
|
|
161
212
|
candidateAnswer: z.ZodString;
|
|
162
213
|
outputMessages: z.ZodOptional<z.ZodNullable<z.ZodArray<z.ZodObject<{
|
|
163
214
|
role: z.ZodEnum<["assistant", "user", "system", "tool"]>;
|
|
164
|
-
name: z.ZodOptional<z.ZodString>;
|
|
165
215
|
content: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodRecord<z.ZodString, z.ZodUnknown>, z.ZodArray<z.ZodRecord<z.ZodString, z.ZodUnknown>, "many">]>>;
|
|
166
216
|
toolCalls: z.ZodOptional<z.ZodArray<z.ZodObject<{
|
|
167
217
|
tool: z.ZodString;
|
|
@@ -182,12 +232,12 @@ declare const CodeJudgeInputSchema: z.ZodObject<{
|
|
|
182
232
|
id?: string | undefined;
|
|
183
233
|
timestamp?: string | undefined;
|
|
184
234
|
}>, "many">>;
|
|
235
|
+
name: z.ZodOptional<z.ZodString>;
|
|
185
236
|
timestamp: z.ZodOptional<z.ZodString>;
|
|
186
237
|
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
187
238
|
}, "strip", z.ZodTypeAny, {
|
|
188
239
|
role: "tool" | "assistant" | "user" | "system";
|
|
189
240
|
timestamp?: string | undefined;
|
|
190
|
-
name?: string | undefined;
|
|
191
241
|
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
192
242
|
toolCalls?: {
|
|
193
243
|
tool: string;
|
|
@@ -196,11 +246,11 @@ declare const CodeJudgeInputSchema: z.ZodObject<{
|
|
|
196
246
|
id?: string | undefined;
|
|
197
247
|
timestamp?: string | undefined;
|
|
198
248
|
}[] | undefined;
|
|
249
|
+
name?: string | undefined;
|
|
199
250
|
metadata?: Record<string, unknown> | undefined;
|
|
200
251
|
}, {
|
|
201
252
|
role: "tool" | "assistant" | "user" | "system";
|
|
202
253
|
timestamp?: string | undefined;
|
|
203
|
-
name?: string | undefined;
|
|
204
254
|
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
205
255
|
toolCalls?: {
|
|
206
256
|
tool: string;
|
|
@@ -209,19 +259,62 @@ declare const CodeJudgeInputSchema: z.ZodObject<{
|
|
|
209
259
|
id?: string | undefined;
|
|
210
260
|
timestamp?: string | undefined;
|
|
211
261
|
}[] | undefined;
|
|
262
|
+
name?: string | undefined;
|
|
212
263
|
metadata?: Record<string, unknown> | undefined;
|
|
213
264
|
}>, "many">>>;
|
|
214
265
|
guidelineFiles: z.ZodArray<z.ZodString, "many">;
|
|
215
266
|
inputFiles: z.ZodArray<z.ZodString, "many">;
|
|
216
267
|
inputMessages: z.ZodArray<z.ZodObject<{
|
|
217
|
-
role: z.ZodEnum<["system", "user", "assistant", "tool"]>;
|
|
218
|
-
content: z.ZodUnion<[z.ZodString, z.ZodRecord<z.ZodString, z.ZodUnknown>, z.ZodArray<z.ZodRecord<z.ZodString, z.ZodUnknown>, "many">]>;
|
|
268
|
+
role: z.ZodEnum<["assistant", "user", "system", "tool"]>;
|
|
269
|
+
content: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodRecord<z.ZodString, z.ZodUnknown>, z.ZodArray<z.ZodRecord<z.ZodString, z.ZodUnknown>, "many">]>>;
|
|
270
|
+
toolCalls: z.ZodOptional<z.ZodArray<z.ZodObject<{
|
|
271
|
+
tool: z.ZodString;
|
|
272
|
+
input: z.ZodOptional<z.ZodUnknown>;
|
|
273
|
+
output: z.ZodOptional<z.ZodUnknown>;
|
|
274
|
+
id: z.ZodOptional<z.ZodString>;
|
|
275
|
+
timestamp: z.ZodOptional<z.ZodString>;
|
|
276
|
+
}, "strip", z.ZodTypeAny, {
|
|
277
|
+
tool: string;
|
|
278
|
+
input?: unknown;
|
|
279
|
+
output?: unknown;
|
|
280
|
+
id?: string | undefined;
|
|
281
|
+
timestamp?: string | undefined;
|
|
282
|
+
}, {
|
|
283
|
+
tool: string;
|
|
284
|
+
input?: unknown;
|
|
285
|
+
output?: unknown;
|
|
286
|
+
id?: string | undefined;
|
|
287
|
+
timestamp?: string | undefined;
|
|
288
|
+
}>, "many">>;
|
|
289
|
+
name: z.ZodOptional<z.ZodString>;
|
|
290
|
+
timestamp: z.ZodOptional<z.ZodString>;
|
|
291
|
+
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
219
292
|
}, "strip", z.ZodTypeAny, {
|
|
220
293
|
role: "tool" | "assistant" | "user" | "system";
|
|
221
|
-
|
|
294
|
+
timestamp?: string | undefined;
|
|
295
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
296
|
+
toolCalls?: {
|
|
297
|
+
tool: string;
|
|
298
|
+
input?: unknown;
|
|
299
|
+
output?: unknown;
|
|
300
|
+
id?: string | undefined;
|
|
301
|
+
timestamp?: string | undefined;
|
|
302
|
+
}[] | undefined;
|
|
303
|
+
name?: string | undefined;
|
|
304
|
+
metadata?: Record<string, unknown> | undefined;
|
|
222
305
|
}, {
|
|
223
306
|
role: "tool" | "assistant" | "user" | "system";
|
|
224
|
-
|
|
307
|
+
timestamp?: string | undefined;
|
|
308
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
309
|
+
toolCalls?: {
|
|
310
|
+
tool: string;
|
|
311
|
+
input?: unknown;
|
|
312
|
+
output?: unknown;
|
|
313
|
+
id?: string | undefined;
|
|
314
|
+
timestamp?: string | undefined;
|
|
315
|
+
}[] | undefined;
|
|
316
|
+
name?: string | undefined;
|
|
317
|
+
metadata?: Record<string, unknown> | undefined;
|
|
225
318
|
}>, "many">;
|
|
226
319
|
traceSummary: z.ZodOptional<z.ZodNullable<z.ZodObject<{
|
|
227
320
|
eventCount: z.ZodNumber;
|
|
@@ -275,19 +368,41 @@ declare const CodeJudgeInputSchema: z.ZodObject<{
|
|
|
275
368
|
}, "strip", z.ZodTypeAny, {
|
|
276
369
|
question: string;
|
|
277
370
|
expectedOutcome: string;
|
|
278
|
-
expectedMessages:
|
|
371
|
+
expectedMessages: {
|
|
372
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
373
|
+
timestamp?: string | undefined;
|
|
374
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
375
|
+
toolCalls?: {
|
|
376
|
+
tool: string;
|
|
377
|
+
input?: unknown;
|
|
378
|
+
output?: unknown;
|
|
379
|
+
id?: string | undefined;
|
|
380
|
+
timestamp?: string | undefined;
|
|
381
|
+
}[] | undefined;
|
|
382
|
+
name?: string | undefined;
|
|
383
|
+
metadata?: Record<string, unknown> | undefined;
|
|
384
|
+
}[];
|
|
279
385
|
candidateAnswer: string;
|
|
280
386
|
guidelineFiles: string[];
|
|
281
387
|
inputFiles: string[];
|
|
282
388
|
inputMessages: {
|
|
283
389
|
role: "tool" | "assistant" | "user" | "system";
|
|
284
|
-
|
|
390
|
+
timestamp?: string | undefined;
|
|
391
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
392
|
+
toolCalls?: {
|
|
393
|
+
tool: string;
|
|
394
|
+
input?: unknown;
|
|
395
|
+
output?: unknown;
|
|
396
|
+
id?: string | undefined;
|
|
397
|
+
timestamp?: string | undefined;
|
|
398
|
+
}[] | undefined;
|
|
399
|
+
name?: string | undefined;
|
|
400
|
+
metadata?: Record<string, unknown> | undefined;
|
|
285
401
|
}[];
|
|
286
402
|
referenceAnswer?: string | undefined;
|
|
287
403
|
outputMessages?: {
|
|
288
404
|
role: "tool" | "assistant" | "user" | "system";
|
|
289
405
|
timestamp?: string | undefined;
|
|
290
|
-
name?: string | undefined;
|
|
291
406
|
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
292
407
|
toolCalls?: {
|
|
293
408
|
tool: string;
|
|
@@ -296,6 +411,7 @@ declare const CodeJudgeInputSchema: z.ZodObject<{
|
|
|
296
411
|
id?: string | undefined;
|
|
297
412
|
timestamp?: string | undefined;
|
|
298
413
|
}[] | undefined;
|
|
414
|
+
name?: string | undefined;
|
|
299
415
|
metadata?: Record<string, unknown> | undefined;
|
|
300
416
|
}[] | null | undefined;
|
|
301
417
|
traceSummary?: {
|
|
@@ -316,19 +432,41 @@ declare const CodeJudgeInputSchema: z.ZodObject<{
|
|
|
316
432
|
}, {
|
|
317
433
|
question: string;
|
|
318
434
|
expectedOutcome: string;
|
|
319
|
-
expectedMessages:
|
|
435
|
+
expectedMessages: {
|
|
436
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
437
|
+
timestamp?: string | undefined;
|
|
438
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
439
|
+
toolCalls?: {
|
|
440
|
+
tool: string;
|
|
441
|
+
input?: unknown;
|
|
442
|
+
output?: unknown;
|
|
443
|
+
id?: string | undefined;
|
|
444
|
+
timestamp?: string | undefined;
|
|
445
|
+
}[] | undefined;
|
|
446
|
+
name?: string | undefined;
|
|
447
|
+
metadata?: Record<string, unknown> | undefined;
|
|
448
|
+
}[];
|
|
320
449
|
candidateAnswer: string;
|
|
321
450
|
guidelineFiles: string[];
|
|
322
451
|
inputFiles: string[];
|
|
323
452
|
inputMessages: {
|
|
324
453
|
role: "tool" | "assistant" | "user" | "system";
|
|
325
|
-
|
|
454
|
+
timestamp?: string | undefined;
|
|
455
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
456
|
+
toolCalls?: {
|
|
457
|
+
tool: string;
|
|
458
|
+
input?: unknown;
|
|
459
|
+
output?: unknown;
|
|
460
|
+
id?: string | undefined;
|
|
461
|
+
timestamp?: string | undefined;
|
|
462
|
+
}[] | undefined;
|
|
463
|
+
name?: string | undefined;
|
|
464
|
+
metadata?: Record<string, unknown> | undefined;
|
|
326
465
|
}[];
|
|
327
466
|
referenceAnswer?: string | undefined;
|
|
328
467
|
outputMessages?: {
|
|
329
468
|
role: "tool" | "assistant" | "user" | "system";
|
|
330
469
|
timestamp?: string | undefined;
|
|
331
|
-
name?: string | undefined;
|
|
332
470
|
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
333
471
|
toolCalls?: {
|
|
334
472
|
tool: string;
|
|
@@ -337,6 +475,7 @@ declare const CodeJudgeInputSchema: z.ZodObject<{
|
|
|
337
475
|
id?: string | undefined;
|
|
338
476
|
timestamp?: string | undefined;
|
|
339
477
|
}[] | undefined;
|
|
478
|
+
name?: string | undefined;
|
|
340
479
|
metadata?: Record<string, unknown> | undefined;
|
|
341
480
|
}[] | null | undefined;
|
|
342
481
|
traceSummary?: {
|
|
@@ -363,16 +502,20 @@ declare const CodeJudgeResultSchema: z.ZodObject<{
|
|
|
363
502
|
hits: z.ZodDefault<z.ZodOptional<z.ZodArray<z.ZodString, "many">>>;
|
|
364
503
|
misses: z.ZodDefault<z.ZodOptional<z.ZodArray<z.ZodString, "many">>>;
|
|
365
504
|
reasoning: z.ZodOptional<z.ZodString>;
|
|
505
|
+
/** Optional structured details for domain-specific metrics (e.g., TP/TN/FP/FN counts, alignments). */
|
|
506
|
+
details: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
366
507
|
}, "strip", z.ZodTypeAny, {
|
|
367
508
|
score: number;
|
|
368
509
|
hits: string[];
|
|
369
510
|
misses: string[];
|
|
370
511
|
reasoning?: string | undefined;
|
|
512
|
+
details?: Record<string, unknown> | undefined;
|
|
371
513
|
}, {
|
|
372
514
|
score: number;
|
|
373
515
|
hits?: string[] | undefined;
|
|
374
516
|
misses?: string[] | undefined;
|
|
375
517
|
reasoning?: string | undefined;
|
|
518
|
+
details?: Record<string, unknown> | undefined;
|
|
376
519
|
}>;
|
|
377
520
|
/**
|
|
378
521
|
* Inferred types from schemas.
|
|
@@ -380,10 +523,118 @@ declare const CodeJudgeResultSchema: z.ZodObject<{
|
|
|
380
523
|
type CodeJudgeInput = z.infer<typeof CodeJudgeInputSchema>;
|
|
381
524
|
type CodeJudgeResult = z.infer<typeof CodeJudgeResultSchema>;
|
|
382
525
|
type TraceSummary = z.infer<typeof TraceSummarySchema>;
|
|
383
|
-
type OutputMessage = z.infer<typeof OutputMessageSchema>;
|
|
526
|
+
type Message = z.infer<typeof MessageSchema>;
|
|
384
527
|
type ToolCall = z.infer<typeof ToolCallSchema>;
|
|
385
528
|
type TokenUsage = z.infer<typeof TokenUsageSchema>;
|
|
386
529
|
|
|
530
|
+
/**
|
|
531
|
+
* Client for invoking configured targets from code_judge scripts.
|
|
532
|
+
*
|
|
533
|
+
* Environment variables (set automatically by AgentV when `target` config is present):
|
|
534
|
+
* - AGENTV_TARGET_PROXY_URL: The URL of the local proxy server
|
|
535
|
+
* - AGENTV_TARGET_PROXY_TOKEN: Bearer token for authentication
|
|
536
|
+
*/
|
|
537
|
+
/**
|
|
538
|
+
* Request to invoke the target
|
|
539
|
+
*/
|
|
540
|
+
interface TargetInvokeRequest {
|
|
541
|
+
readonly question: string;
|
|
542
|
+
readonly systemPrompt?: string;
|
|
543
|
+
readonly evalCaseId?: string;
|
|
544
|
+
readonly attempt?: number;
|
|
545
|
+
/** Optional target override - use a different target for this invocation */
|
|
546
|
+
readonly target?: string;
|
|
547
|
+
}
|
|
548
|
+
/**
|
|
549
|
+
* Response from a target invocation
|
|
550
|
+
*/
|
|
551
|
+
interface TargetInvokeResponse {
|
|
552
|
+
readonly outputMessages: readonly unknown[];
|
|
553
|
+
readonly rawText?: string;
|
|
554
|
+
}
|
|
555
|
+
/**
|
|
556
|
+
* Information about the target proxy configuration
|
|
557
|
+
*/
|
|
558
|
+
interface TargetInfo {
|
|
559
|
+
/** Name of the default target being used */
|
|
560
|
+
readonly targetName: string;
|
|
561
|
+
/** Maximum number of calls allowed */
|
|
562
|
+
readonly maxCalls: number;
|
|
563
|
+
/** Current number of calls made */
|
|
564
|
+
readonly callCount: number;
|
|
565
|
+
/** List of all available target names */
|
|
566
|
+
readonly availableTargets: readonly string[];
|
|
567
|
+
}
|
|
568
|
+
/**
|
|
569
|
+
* Target client for making target invocations
|
|
570
|
+
*/
|
|
571
|
+
interface TargetClient {
|
|
572
|
+
/**
|
|
573
|
+
* Invoke the configured target with a prompt.
|
|
574
|
+
* @param request - The question and optional system prompt
|
|
575
|
+
* @returns The target's response with output messages and optional raw text
|
|
576
|
+
*/
|
|
577
|
+
invoke(request: TargetInvokeRequest): Promise<TargetInvokeResponse>;
|
|
578
|
+
/**
|
|
579
|
+
* Invoke the target with multiple requests in sequence.
|
|
580
|
+
* Each request counts toward the max_calls limit.
|
|
581
|
+
* @param requests - Array of target requests
|
|
582
|
+
* @returns Array of target responses
|
|
583
|
+
*/
|
|
584
|
+
invokeBatch(requests: readonly TargetInvokeRequest[]): Promise<readonly TargetInvokeResponse[]>;
|
|
585
|
+
/**
|
|
586
|
+
* Get information about the target proxy configuration.
|
|
587
|
+
* Returns the default target name, max calls, current call count, and available targets.
|
|
588
|
+
*/
|
|
589
|
+
getInfo(): Promise<TargetInfo>;
|
|
590
|
+
}
|
|
591
|
+
/**
|
|
592
|
+
* Error thrown when target proxy is not available
|
|
593
|
+
*/
|
|
594
|
+
declare class TargetNotAvailableError extends Error {
|
|
595
|
+
constructor(message: string);
|
|
596
|
+
}
|
|
597
|
+
/**
|
|
598
|
+
* Error thrown when target invocation fails
|
|
599
|
+
*/
|
|
600
|
+
declare class TargetInvocationError extends Error {
|
|
601
|
+
readonly statusCode?: number;
|
|
602
|
+
constructor(message: string, statusCode?: number);
|
|
603
|
+
}
|
|
604
|
+
/**
|
|
605
|
+
* Create a target client from environment variables.
|
|
606
|
+
*
|
|
607
|
+
* This function reads the proxy URL and token from environment variables
|
|
608
|
+
* that are automatically set by AgentV when a `target` config block is present
|
|
609
|
+
* on a `code_judge` evaluator.
|
|
610
|
+
*
|
|
611
|
+
* @returns A target client if environment variables are set, otherwise undefined
|
|
612
|
+
* @throws TargetNotAvailableError if token is missing when URL is present
|
|
613
|
+
*
|
|
614
|
+
* @example
|
|
615
|
+
* ```typescript
|
|
616
|
+
* import { createTargetClient, defineCodeJudge } from '@agentv/eval';
|
|
617
|
+
*
|
|
618
|
+
* export default defineCodeJudge(async ({ question, expectedOutcome }) => {
|
|
619
|
+
* const target = createTargetClient();
|
|
620
|
+
*
|
|
621
|
+
* if (!target) {
|
|
622
|
+
* // Target not available - no target config on this evaluator
|
|
623
|
+
* return { score: 0.5, reasoning: 'Target not available' };
|
|
624
|
+
* }
|
|
625
|
+
*
|
|
626
|
+
* const response = await target.invoke({
|
|
627
|
+
* question: `Is this answer correct? Question: ${question}, Expected: ${expectedOutcome}`,
|
|
628
|
+
* systemPrompt: 'You are an expert evaluator. Respond with JSON: { "correct": true/false }'
|
|
629
|
+
* });
|
|
630
|
+
*
|
|
631
|
+
* const result = JSON.parse(response.rawText ?? '{}');
|
|
632
|
+
* return { score: result.correct ? 1.0 : 0.0 };
|
|
633
|
+
* });
|
|
634
|
+
* ```
|
|
635
|
+
*/
|
|
636
|
+
declare function createTargetClient(): TargetClient | undefined;
|
|
637
|
+
|
|
387
638
|
/**
|
|
388
639
|
* Handler function type for code judges.
|
|
389
640
|
*/
|
|
@@ -394,7 +645,7 @@ type CodeJudgeHandler = (input: CodeJudgeInput) => CodeJudgeResult | Promise<Cod
|
|
|
394
645
|
*
|
|
395
646
|
* Build custom code judges for evaluating AI agent outputs.
|
|
396
647
|
*
|
|
397
|
-
* @example
|
|
648
|
+
* @example Basic code judge
|
|
398
649
|
* ```typescript
|
|
399
650
|
* #!/usr/bin/env bun
|
|
400
651
|
* import { defineCodeJudge } from '@agentv/eval';
|
|
@@ -406,6 +657,27 @@ type CodeJudgeHandler = (input: CodeJudgeInput) => CodeJudgeResult | Promise<Cod
|
|
|
406
657
|
* }));
|
|
407
658
|
* ```
|
|
408
659
|
*
|
|
660
|
+
* @example Code judge with target access (requires `target` config in YAML)
|
|
661
|
+
* ```typescript
|
|
662
|
+
* #!/usr/bin/env bun
|
|
663
|
+
* import { defineCodeJudge, createTargetClient } from '@agentv/eval';
|
|
664
|
+
*
|
|
665
|
+
* export default defineCodeJudge(async ({ question }) => {
|
|
666
|
+
* const target = createTargetClient();
|
|
667
|
+
* if (!target) {
|
|
668
|
+
* return { score: 0, misses: ['Target not available'] };
|
|
669
|
+
* }
|
|
670
|
+
*
|
|
671
|
+
* const response = await target.invoke({
|
|
672
|
+
* question: `Evaluate: ${question}`,
|
|
673
|
+
* systemPrompt: 'Respond with JSON: { "score": 0-1 }'
|
|
674
|
+
* });
|
|
675
|
+
*
|
|
676
|
+
* const result = JSON.parse(response.rawText ?? '{}');
|
|
677
|
+
* return { score: result.score ?? 0 };
|
|
678
|
+
* });
|
|
679
|
+
* ```
|
|
680
|
+
*
|
|
409
681
|
* @packageDocumentation
|
|
410
682
|
*/
|
|
411
683
|
|
|
@@ -455,4 +727,4 @@ type CodeJudgeHandler = (input: CodeJudgeInput) => CodeJudgeResult | Promise<Cod
|
|
|
455
727
|
*/
|
|
456
728
|
declare function defineCodeJudge(handler: CodeJudgeHandler): void;
|
|
457
729
|
|
|
458
|
-
export { type CodeJudgeHandler, type CodeJudgeInput, CodeJudgeInputSchema, type CodeJudgeResult, CodeJudgeResultSchema, type
|
|
730
|
+
export { type CodeJudgeHandler, type CodeJudgeInput, CodeJudgeInputSchema, type CodeJudgeResult, CodeJudgeResultSchema, type Message, MessageSchema, type TargetClient, type TargetInfo, TargetInvocationError, type TargetInvokeRequest, type TargetInvokeResponse, TargetNotAvailableError, type TokenUsage, TokenUsageSchema, type ToolCall, ToolCallSchema, type TraceSummary, TraceSummarySchema, createTargetClient, defineCodeJudge };
|
package/dist/index.js
CHANGED
|
@@ -22,29 +22,24 @@ var ToolCallSchema = z.object({
|
|
|
22
22
|
id: z.string().optional(),
|
|
23
23
|
timestamp: z.string().optional()
|
|
24
24
|
});
|
|
25
|
-
var OutputMessageSchema = z.object({
|
|
25
|
+
var MessageSchema = z.object({
|
|
26
26
|
role: z.enum(["assistant", "user", "system", "tool"]),
|
|
27
|
-
// Optional message name (e.g., agent name) used by some providers for multi-agent transcripts.
|
|
28
|
-
name: z.string().optional(),
|
|
29
27
|
content: z.union([z.string(), z.record(z.unknown()), z.array(z.record(z.unknown()))]).optional(),
|
|
30
28
|
toolCalls: z.array(ToolCallSchema).optional(),
|
|
29
|
+
name: z.string().optional(),
|
|
31
30
|
timestamp: z.string().optional(),
|
|
32
31
|
metadata: z.record(z.unknown()).optional()
|
|
33
32
|
});
|
|
34
|
-
var TestMessageSchema = z.object({
|
|
35
|
-
role: z.enum(["system", "user", "assistant", "tool"]),
|
|
36
|
-
content: z.union([z.string(), z.record(z.unknown()), z.array(z.record(z.unknown()))])
|
|
37
|
-
});
|
|
38
33
|
var CodeJudgeInputSchema = z.object({
|
|
39
34
|
question: z.string(),
|
|
40
35
|
expectedOutcome: z.string(),
|
|
41
|
-
expectedMessages: z.array(
|
|
36
|
+
expectedMessages: z.array(MessageSchema),
|
|
42
37
|
referenceAnswer: z.string().optional(),
|
|
43
38
|
candidateAnswer: z.string(),
|
|
44
|
-
outputMessages: z.array(OutputMessageSchema).nullable().optional(),
|
|
39
|
+
outputMessages: z.array(MessageSchema).nullable().optional(),
|
|
45
40
|
guidelineFiles: z.array(z.string()),
|
|
46
41
|
inputFiles: z.array(z.string()),
|
|
47
|
-
inputMessages: z.array(TestMessageSchema),
|
|
42
|
+
inputMessages: z.array(MessageSchema),
|
|
48
43
|
traceSummary: TraceSummarySchema.nullable().optional(),
|
|
49
44
|
config: z.record(z.unknown()).nullable().optional()
|
|
50
45
|
});
|
|
@@ -52,9 +47,119 @@ var CodeJudgeResultSchema = z.object({
|
|
|
52
47
|
score: z.number().min(0).max(1),
|
|
53
48
|
hits: z.array(z.string()).optional().default([]),
|
|
54
49
|
misses: z.array(z.string()).optional().default([]),
|
|
55
|
-
reasoning: z.string().optional()
|
|
50
|
+
reasoning: z.string().optional(),
|
|
51
|
+
/** Optional structured details for domain-specific metrics (e.g., TP/TN/FP/FN counts, alignments). */
|
|
52
|
+
details: z.record(z.unknown()).optional()
|
|
56
53
|
});
|
|
57
54
|
|
|
55
|
+
// src/target-client.ts
|
|
56
|
+
var TargetNotAvailableError = class extends Error {
|
|
57
|
+
constructor(message) {
|
|
58
|
+
super(message);
|
|
59
|
+
this.name = "TargetNotAvailableError";
|
|
60
|
+
}
|
|
61
|
+
};
|
|
62
|
+
var TargetInvocationError = class extends Error {
|
|
63
|
+
statusCode;
|
|
64
|
+
constructor(message, statusCode) {
|
|
65
|
+
super(message);
|
|
66
|
+
this.name = "TargetInvocationError";
|
|
67
|
+
this.statusCode = statusCode;
|
|
68
|
+
}
|
|
69
|
+
};
|
|
70
|
+
function createTargetClient() {
|
|
71
|
+
const proxyUrl = process.env.AGENTV_TARGET_PROXY_URL;
|
|
72
|
+
const proxyToken = process.env.AGENTV_TARGET_PROXY_TOKEN;
|
|
73
|
+
if (!proxyUrl) {
|
|
74
|
+
return void 0;
|
|
75
|
+
}
|
|
76
|
+
if (!proxyToken) {
|
|
77
|
+
throw new TargetNotAvailableError(
|
|
78
|
+
"AGENTV_TARGET_PROXY_URL is set but AGENTV_TARGET_PROXY_TOKEN is missing"
|
|
79
|
+
);
|
|
80
|
+
}
|
|
81
|
+
return createTargetClientInternal(proxyUrl, proxyToken);
|
|
82
|
+
}
|
|
83
|
+
function createTargetClientInternal(url, token) {
|
|
84
|
+
const headers = {
|
|
85
|
+
"Content-Type": "application/json",
|
|
86
|
+
Authorization: `Bearer ${token}`
|
|
87
|
+
};
|
|
88
|
+
return {
|
|
89
|
+
async invoke(request) {
|
|
90
|
+
const response = await fetch(`${url}/invoke`, {
|
|
91
|
+
method: "POST",
|
|
92
|
+
headers,
|
|
93
|
+
body: JSON.stringify({
|
|
94
|
+
question: request.question,
|
|
95
|
+
systemPrompt: request.systemPrompt,
|
|
96
|
+
evalCaseId: request.evalCaseId,
|
|
97
|
+
attempt: request.attempt,
|
|
98
|
+
target: request.target
|
|
99
|
+
})
|
|
100
|
+
});
|
|
101
|
+
if (!response.ok) {
|
|
102
|
+
const errorBody = await response.text();
|
|
103
|
+
let errorMessage;
|
|
104
|
+
try {
|
|
105
|
+
const errorJson = JSON.parse(errorBody);
|
|
106
|
+
errorMessage = errorJson.error ?? `HTTP ${response.status}`;
|
|
107
|
+
} catch {
|
|
108
|
+
errorMessage = errorBody || `HTTP ${response.status}`;
|
|
109
|
+
}
|
|
110
|
+
throw new TargetInvocationError(errorMessage, response.status);
|
|
111
|
+
}
|
|
112
|
+
return await response.json();
|
|
113
|
+
},
|
|
114
|
+
async invokeBatch(requests) {
|
|
115
|
+
const response = await fetch(`${url}/invokeBatch`, {
|
|
116
|
+
method: "POST",
|
|
117
|
+
headers,
|
|
118
|
+
body: JSON.stringify({
|
|
119
|
+
requests: requests.map((r) => ({
|
|
120
|
+
question: r.question,
|
|
121
|
+
systemPrompt: r.systemPrompt,
|
|
122
|
+
evalCaseId: r.evalCaseId,
|
|
123
|
+
attempt: r.attempt,
|
|
124
|
+
target: r.target
|
|
125
|
+
}))
|
|
126
|
+
})
|
|
127
|
+
});
|
|
128
|
+
if (!response.ok) {
|
|
129
|
+
const errorBody = await response.text();
|
|
130
|
+
let errorMessage;
|
|
131
|
+
try {
|
|
132
|
+
const errorJson = JSON.parse(errorBody);
|
|
133
|
+
errorMessage = errorJson.error ?? `HTTP ${response.status}`;
|
|
134
|
+
} catch {
|
|
135
|
+
errorMessage = errorBody || `HTTP ${response.status}`;
|
|
136
|
+
}
|
|
137
|
+
throw new TargetInvocationError(errorMessage, response.status);
|
|
138
|
+
}
|
|
139
|
+
const result = await response.json();
|
|
140
|
+
return result.responses;
|
|
141
|
+
},
|
|
142
|
+
async getInfo() {
|
|
143
|
+
const response = await fetch(`${url}/info`, {
|
|
144
|
+
method: "GET",
|
|
145
|
+
headers
|
|
146
|
+
});
|
|
147
|
+
if (!response.ok) {
|
|
148
|
+
const errorBody = await response.text();
|
|
149
|
+
let errorMessage;
|
|
150
|
+
try {
|
|
151
|
+
const errorJson = JSON.parse(errorBody);
|
|
152
|
+
errorMessage = errorJson.error ?? `HTTP ${response.status}`;
|
|
153
|
+
} catch {
|
|
154
|
+
errorMessage = errorBody || `HTTP ${response.status}`;
|
|
155
|
+
}
|
|
156
|
+
throw new TargetInvocationError(errorMessage, response.status);
|
|
157
|
+
}
|
|
158
|
+
return await response.json();
|
|
159
|
+
}
|
|
160
|
+
};
|
|
161
|
+
}
|
|
162
|
+
|
|
58
163
|
// src/index.ts
|
|
59
164
|
import { z as z2 } from "zod";
|
|
60
165
|
|
|
@@ -134,10 +239,13 @@ function defineCodeJudge(handler) {
|
|
|
134
239
|
export {
|
|
135
240
|
CodeJudgeInputSchema,
|
|
136
241
|
CodeJudgeResultSchema,
|
|
137
|
-
OutputMessageSchema,
|
242
|
+
MessageSchema,
|
|
243
|
+
TargetInvocationError,
|
|
244
|
+
TargetNotAvailableError,
|
|
138
245
|
TokenUsageSchema,
|
|
139
246
|
ToolCallSchema,
|
|
140
247
|
TraceSummarySchema,
|
|
248
|
+
createTargetClient,
|
|
141
249
|
defineCodeJudge,
|
|
142
250
|
z2 as z
|
|
143
251
|
};
|