@agentv/eval 0.2.0 → 0.3.0

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -74,7 +74,7 @@ declare const TraceSummarySchema: z.ZodObject<{
74
74
  toolDurations?: Record<string, number[]> | undefined;
75
75
  }>;
76
76
  /**
77
- * Tool call schema for output messages.
77
+ * Tool call schema.
78
78
  */
79
79
  declare const ToolCallSchema: z.ZodObject<{
80
80
  tool: z.ZodString;
@@ -96,9 +96,9 @@ declare const ToolCallSchema: z.ZodObject<{
96
96
  timestamp?: string | undefined;
97
97
  }>;
98
98
  /**
99
- * Output message schema.
99
+ * Unified message schema for input, expected, and output messages.
100
100
  */
101
- declare const OutputMessageSchema: z.ZodObject<{
101
+ declare const MessageSchema: z.ZodObject<{
102
102
  role: z.ZodEnum<["assistant", "user", "system", "tool"]>;
103
103
  content: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodRecord<z.ZodString, z.ZodUnknown>, z.ZodArray<z.ZodRecord<z.ZodString, z.ZodUnknown>, "many">]>>;
104
104
  toolCalls: z.ZodOptional<z.ZodArray<z.ZodObject<{
@@ -120,6 +120,7 @@ declare const OutputMessageSchema: z.ZodObject<{
120
120
  id?: string | undefined;
121
121
  timestamp?: string | undefined;
122
122
  }>, "many">>;
123
+ name: z.ZodOptional<z.ZodString>;
123
124
  timestamp: z.ZodOptional<z.ZodString>;
124
125
  metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
125
126
  }, "strip", z.ZodTypeAny, {
@@ -133,6 +134,7 @@ declare const OutputMessageSchema: z.ZodObject<{
133
134
  id?: string | undefined;
134
135
  timestamp?: string | undefined;
135
136
  }[] | undefined;
137
+ name?: string | undefined;
136
138
  metadata?: Record<string, unknown> | undefined;
137
139
  }, {
138
140
  role: "tool" | "assistant" | "user" | "system";
@@ -145,6 +147,7 @@ declare const OutputMessageSchema: z.ZodObject<{
145
147
  id?: string | undefined;
146
148
  timestamp?: string | undefined;
147
149
  }[] | undefined;
150
+ name?: string | undefined;
148
151
  metadata?: Record<string, unknown> | undefined;
149
152
  }>;
150
153
  /**
@@ -153,7 +156,58 @@ declare const OutputMessageSchema: z.ZodObject<{
153
156
  declare const CodeJudgeInputSchema: z.ZodObject<{
154
157
  question: z.ZodString;
155
158
  expectedOutcome: z.ZodString;
156
- expectedMessages: z.ZodArray<z.ZodRecord<z.ZodString, z.ZodUnknown>, "many">;
159
+ expectedMessages: z.ZodArray<z.ZodObject<{
160
+ role: z.ZodEnum<["assistant", "user", "system", "tool"]>;
161
+ content: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodRecord<z.ZodString, z.ZodUnknown>, z.ZodArray<z.ZodRecord<z.ZodString, z.ZodUnknown>, "many">]>>;
162
+ toolCalls: z.ZodOptional<z.ZodArray<z.ZodObject<{
163
+ tool: z.ZodString;
164
+ input: z.ZodOptional<z.ZodUnknown>;
165
+ output: z.ZodOptional<z.ZodUnknown>;
166
+ id: z.ZodOptional<z.ZodString>;
167
+ timestamp: z.ZodOptional<z.ZodString>;
168
+ }, "strip", z.ZodTypeAny, {
169
+ tool: string;
170
+ input?: unknown;
171
+ output?: unknown;
172
+ id?: string | undefined;
173
+ timestamp?: string | undefined;
174
+ }, {
175
+ tool: string;
176
+ input?: unknown;
177
+ output?: unknown;
178
+ id?: string | undefined;
179
+ timestamp?: string | undefined;
180
+ }>, "many">>;
181
+ name: z.ZodOptional<z.ZodString>;
182
+ timestamp: z.ZodOptional<z.ZodString>;
183
+ metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
184
+ }, "strip", z.ZodTypeAny, {
185
+ role: "tool" | "assistant" | "user" | "system";
186
+ timestamp?: string | undefined;
187
+ content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
188
+ toolCalls?: {
189
+ tool: string;
190
+ input?: unknown;
191
+ output?: unknown;
192
+ id?: string | undefined;
193
+ timestamp?: string | undefined;
194
+ }[] | undefined;
195
+ name?: string | undefined;
196
+ metadata?: Record<string, unknown> | undefined;
197
+ }, {
198
+ role: "tool" | "assistant" | "user" | "system";
199
+ timestamp?: string | undefined;
200
+ content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
201
+ toolCalls?: {
202
+ tool: string;
203
+ input?: unknown;
204
+ output?: unknown;
205
+ id?: string | undefined;
206
+ timestamp?: string | undefined;
207
+ }[] | undefined;
208
+ name?: string | undefined;
209
+ metadata?: Record<string, unknown> | undefined;
210
+ }>, "many">;
157
211
  referenceAnswer: z.ZodOptional<z.ZodString>;
158
212
  candidateAnswer: z.ZodString;
159
213
  outputMessages: z.ZodOptional<z.ZodNullable<z.ZodArray<z.ZodObject<{
@@ -178,6 +232,7 @@ declare const CodeJudgeInputSchema: z.ZodObject<{
178
232
  id?: string | undefined;
179
233
  timestamp?: string | undefined;
180
234
  }>, "many">>;
235
+ name: z.ZodOptional<z.ZodString>;
181
236
  timestamp: z.ZodOptional<z.ZodString>;
182
237
  metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
183
238
  }, "strip", z.ZodTypeAny, {
@@ -191,6 +246,7 @@ declare const CodeJudgeInputSchema: z.ZodObject<{
191
246
  id?: string | undefined;
192
247
  timestamp?: string | undefined;
193
248
  }[] | undefined;
249
+ name?: string | undefined;
194
250
  metadata?: Record<string, unknown> | undefined;
195
251
  }, {
196
252
  role: "tool" | "assistant" | "user" | "system";
@@ -203,19 +259,62 @@ declare const CodeJudgeInputSchema: z.ZodObject<{
203
259
  id?: string | undefined;
204
260
  timestamp?: string | undefined;
205
261
  }[] | undefined;
262
+ name?: string | undefined;
206
263
  metadata?: Record<string, unknown> | undefined;
207
264
  }>, "many">>>;
208
265
  guidelineFiles: z.ZodArray<z.ZodString, "many">;
209
266
  inputFiles: z.ZodArray<z.ZodString, "many">;
210
267
  inputMessages: z.ZodArray<z.ZodObject<{
211
- role: z.ZodEnum<["system", "user", "assistant", "tool"]>;
212
- content: z.ZodUnion<[z.ZodString, z.ZodRecord<z.ZodString, z.ZodUnknown>, z.ZodArray<z.ZodRecord<z.ZodString, z.ZodUnknown>, "many">]>;
268
+ role: z.ZodEnum<["assistant", "user", "system", "tool"]>;
269
+ content: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodRecord<z.ZodString, z.ZodUnknown>, z.ZodArray<z.ZodRecord<z.ZodString, z.ZodUnknown>, "many">]>>;
270
+ toolCalls: z.ZodOptional<z.ZodArray<z.ZodObject<{
271
+ tool: z.ZodString;
272
+ input: z.ZodOptional<z.ZodUnknown>;
273
+ output: z.ZodOptional<z.ZodUnknown>;
274
+ id: z.ZodOptional<z.ZodString>;
275
+ timestamp: z.ZodOptional<z.ZodString>;
276
+ }, "strip", z.ZodTypeAny, {
277
+ tool: string;
278
+ input?: unknown;
279
+ output?: unknown;
280
+ id?: string | undefined;
281
+ timestamp?: string | undefined;
282
+ }, {
283
+ tool: string;
284
+ input?: unknown;
285
+ output?: unknown;
286
+ id?: string | undefined;
287
+ timestamp?: string | undefined;
288
+ }>, "many">>;
289
+ name: z.ZodOptional<z.ZodString>;
290
+ timestamp: z.ZodOptional<z.ZodString>;
291
+ metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
213
292
  }, "strip", z.ZodTypeAny, {
214
293
  role: "tool" | "assistant" | "user" | "system";
215
- content: string | Record<string, unknown> | Record<string, unknown>[];
294
+ timestamp?: string | undefined;
295
+ content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
296
+ toolCalls?: {
297
+ tool: string;
298
+ input?: unknown;
299
+ output?: unknown;
300
+ id?: string | undefined;
301
+ timestamp?: string | undefined;
302
+ }[] | undefined;
303
+ name?: string | undefined;
304
+ metadata?: Record<string, unknown> | undefined;
216
305
  }, {
217
306
  role: "tool" | "assistant" | "user" | "system";
218
- content: string | Record<string, unknown> | Record<string, unknown>[];
307
+ timestamp?: string | undefined;
308
+ content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
309
+ toolCalls?: {
310
+ tool: string;
311
+ input?: unknown;
312
+ output?: unknown;
313
+ id?: string | undefined;
314
+ timestamp?: string | undefined;
315
+ }[] | undefined;
316
+ name?: string | undefined;
317
+ metadata?: Record<string, unknown> | undefined;
219
318
  }>, "many">;
220
319
  traceSummary: z.ZodOptional<z.ZodNullable<z.ZodObject<{
221
320
  eventCount: z.ZodNumber;
@@ -269,13 +368,36 @@ declare const CodeJudgeInputSchema: z.ZodObject<{
269
368
  }, "strip", z.ZodTypeAny, {
270
369
  question: string;
271
370
  expectedOutcome: string;
272
- expectedMessages: Record<string, unknown>[];
371
+ expectedMessages: {
372
+ role: "tool" | "assistant" | "user" | "system";
373
+ timestamp?: string | undefined;
374
+ content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
375
+ toolCalls?: {
376
+ tool: string;
377
+ input?: unknown;
378
+ output?: unknown;
379
+ id?: string | undefined;
380
+ timestamp?: string | undefined;
381
+ }[] | undefined;
382
+ name?: string | undefined;
383
+ metadata?: Record<string, unknown> | undefined;
384
+ }[];
273
385
  candidateAnswer: string;
274
386
  guidelineFiles: string[];
275
387
  inputFiles: string[];
276
388
  inputMessages: {
277
389
  role: "tool" | "assistant" | "user" | "system";
278
- content: string | Record<string, unknown> | Record<string, unknown>[];
390
+ timestamp?: string | undefined;
391
+ content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
392
+ toolCalls?: {
393
+ tool: string;
394
+ input?: unknown;
395
+ output?: unknown;
396
+ id?: string | undefined;
397
+ timestamp?: string | undefined;
398
+ }[] | undefined;
399
+ name?: string | undefined;
400
+ metadata?: Record<string, unknown> | undefined;
279
401
  }[];
280
402
  referenceAnswer?: string | undefined;
281
403
  outputMessages?: {
@@ -289,6 +411,7 @@ declare const CodeJudgeInputSchema: z.ZodObject<{
289
411
  id?: string | undefined;
290
412
  timestamp?: string | undefined;
291
413
  }[] | undefined;
414
+ name?: string | undefined;
292
415
  metadata?: Record<string, unknown> | undefined;
293
416
  }[] | null | undefined;
294
417
  traceSummary?: {
@@ -309,13 +432,36 @@ declare const CodeJudgeInputSchema: z.ZodObject<{
309
432
  }, {
310
433
  question: string;
311
434
  expectedOutcome: string;
312
- expectedMessages: Record<string, unknown>[];
435
+ expectedMessages: {
436
+ role: "tool" | "assistant" | "user" | "system";
437
+ timestamp?: string | undefined;
438
+ content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
439
+ toolCalls?: {
440
+ tool: string;
441
+ input?: unknown;
442
+ output?: unknown;
443
+ id?: string | undefined;
444
+ timestamp?: string | undefined;
445
+ }[] | undefined;
446
+ name?: string | undefined;
447
+ metadata?: Record<string, unknown> | undefined;
448
+ }[];
313
449
  candidateAnswer: string;
314
450
  guidelineFiles: string[];
315
451
  inputFiles: string[];
316
452
  inputMessages: {
317
453
  role: "tool" | "assistant" | "user" | "system";
318
- content: string | Record<string, unknown> | Record<string, unknown>[];
454
+ timestamp?: string | undefined;
455
+ content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
456
+ toolCalls?: {
457
+ tool: string;
458
+ input?: unknown;
459
+ output?: unknown;
460
+ id?: string | undefined;
461
+ timestamp?: string | undefined;
462
+ }[] | undefined;
463
+ name?: string | undefined;
464
+ metadata?: Record<string, unknown> | undefined;
319
465
  }[];
320
466
  referenceAnswer?: string | undefined;
321
467
  outputMessages?: {
@@ -329,6 +475,7 @@ declare const CodeJudgeInputSchema: z.ZodObject<{
329
475
  id?: string | undefined;
330
476
  timestamp?: string | undefined;
331
477
  }[] | undefined;
478
+ name?: string | undefined;
332
479
  metadata?: Record<string, unknown> | undefined;
333
480
  }[] | null | undefined;
334
481
  traceSummary?: {
@@ -355,16 +502,20 @@ declare const CodeJudgeResultSchema: z.ZodObject<{
355
502
  hits: z.ZodDefault<z.ZodOptional<z.ZodArray<z.ZodString, "many">>>;
356
503
  misses: z.ZodDefault<z.ZodOptional<z.ZodArray<z.ZodString, "many">>>;
357
504
  reasoning: z.ZodOptional<z.ZodString>;
505
+ /** Optional structured details for domain-specific metrics (e.g., TP/TN/FP/FN counts, alignments). */
506
+ details: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
358
507
  }, "strip", z.ZodTypeAny, {
359
508
  score: number;
360
509
  hits: string[];
361
510
  misses: string[];
362
511
  reasoning?: string | undefined;
512
+ details?: Record<string, unknown> | undefined;
363
513
  }, {
364
514
  score: number;
365
515
  hits?: string[] | undefined;
366
516
  misses?: string[] | undefined;
367
517
  reasoning?: string | undefined;
518
+ details?: Record<string, unknown> | undefined;
368
519
  }>;
369
520
  /**
370
521
  * Inferred types from schemas.
@@ -372,10 +523,118 @@ declare const CodeJudgeResultSchema: z.ZodObject<{
372
523
  type CodeJudgeInput = z.infer<typeof CodeJudgeInputSchema>;
373
524
  type CodeJudgeResult = z.infer<typeof CodeJudgeResultSchema>;
374
525
  type TraceSummary = z.infer<typeof TraceSummarySchema>;
375
- type OutputMessage = z.infer<typeof OutputMessageSchema>;
526
+ type Message = z.infer<typeof MessageSchema>;
376
527
  type ToolCall = z.infer<typeof ToolCallSchema>;
377
528
  type TokenUsage = z.infer<typeof TokenUsageSchema>;
378
529
 
530
+ /**
531
+ * Client for invoking configured targets from code_judge scripts.
532
+ *
533
+ * Environment variables (set automatically by AgentV when `target` config is present):
534
+ * - AGENTV_TARGET_PROXY_URL: The URL of the local proxy server
535
+ * - AGENTV_TARGET_PROXY_TOKEN: Bearer token for authentication
536
+ */
537
+ /**
538
+ * Request to invoke the target
539
+ */
540
+ interface TargetInvokeRequest {
541
+ readonly question: string;
542
+ readonly systemPrompt?: string;
543
+ readonly evalCaseId?: string;
544
+ readonly attempt?: number;
545
+ /** Optional target override - use a different target for this invocation */
546
+ readonly target?: string;
547
+ }
548
+ /**
549
+ * Response from a target invocation
550
+ */
551
+ interface TargetInvokeResponse {
552
+ readonly outputMessages: readonly unknown[];
553
+ readonly rawText?: string;
554
+ }
555
+ /**
556
+ * Information about the target proxy configuration
557
+ */
558
+ interface TargetInfo {
559
+ /** Name of the default target being used */
560
+ readonly targetName: string;
561
+ /** Maximum number of calls allowed */
562
+ readonly maxCalls: number;
563
+ /** Current number of calls made */
564
+ readonly callCount: number;
565
+ /** List of all available target names */
566
+ readonly availableTargets: readonly string[];
567
+ }
568
+ /**
569
+ * Target client for making target invocations
570
+ */
571
+ interface TargetClient {
572
+ /**
573
+ * Invoke the configured target with a prompt.
574
+ * @param request - The question and optional system prompt
575
+ * @returns The target's response with output messages and optional raw text
576
+ */
577
+ invoke(request: TargetInvokeRequest): Promise<TargetInvokeResponse>;
578
+ /**
579
+ * Invoke the target with multiple requests in sequence.
580
+ * Each request counts toward the max_calls limit.
581
+ * @param requests - Array of target requests
582
+ * @returns Array of target responses
583
+ */
584
+ invokeBatch(requests: readonly TargetInvokeRequest[]): Promise<readonly TargetInvokeResponse[]>;
585
+ /**
586
+ * Get information about the target proxy configuration.
587
+ * Returns the default target name, max calls, current call count, and available targets.
588
+ */
589
+ getInfo(): Promise<TargetInfo>;
590
+ }
591
+ /**
592
+ * Error thrown when target proxy is not available
593
+ */
594
+ declare class TargetNotAvailableError extends Error {
595
+ constructor(message: string);
596
+ }
597
+ /**
598
+ * Error thrown when target invocation fails
599
+ */
600
+ declare class TargetInvocationError extends Error {
601
+ readonly statusCode?: number;
602
+ constructor(message: string, statusCode?: number);
603
+ }
604
+ /**
605
+ * Create a target client from environment variables.
606
+ *
607
+ * This function reads the proxy URL and token from environment variables
608
+ * that are automatically set by AgentV when a `target` config block is present
609
+ * on a `code_judge` evaluator.
610
+ *
611
+ * @returns A target client if environment variables are set, otherwise undefined
612
+ * @throws TargetNotAvailableError if token is missing when URL is present
613
+ *
614
+ * @example
615
+ * ```typescript
616
+ * import { createTargetClient, defineCodeJudge } from '@agentv/eval';
617
+ *
618
+ * export default defineCodeJudge(async ({ question, expectedOutcome }) => {
619
+ * const target = createTargetClient();
620
+ *
621
+ * if (!target) {
622
+ * // Target not available - no target config on this evaluator
623
+ * return { score: 0.5, reasoning: 'Target not available' };
624
+ * }
625
+ *
626
+ * const response = await target.invoke({
627
+ * question: `Is this answer correct? Question: ${question}, Expected: ${expectedOutcome}`,
628
+ * systemPrompt: 'You are an expert evaluator. Respond with JSON: { "correct": true/false }'
629
+ * });
630
+ *
631
+ * const result = JSON.parse(response.rawText ?? '{}');
632
+ * return { score: result.correct ? 1.0 : 0.0 };
633
+ * });
634
+ * ```
635
+ */
636
+ declare function createTargetClient(): TargetClient | undefined;
637
+
379
638
  /**
380
639
  * Handler function type for code judges.
381
640
  */
@@ -386,7 +645,7 @@ type CodeJudgeHandler = (input: CodeJudgeInput) => CodeJudgeResult | Promise<Cod
386
645
  *
387
646
  * Build custom code judges for evaluating AI agent outputs.
388
647
  *
389
- * @example
648
+ * @example Basic code judge
390
649
  * ```typescript
391
650
  * #!/usr/bin/env bun
392
651
  * import { defineCodeJudge } from '@agentv/eval';
@@ -398,6 +657,27 @@ type CodeJudgeHandler = (input: CodeJudgeInput) => CodeJudgeResult | Promise<Cod
398
657
  * }));
399
658
  * ```
400
659
  *
660
+ * @example Code judge with target access (requires `target` config in YAML)
661
+ * ```typescript
662
+ * #!/usr/bin/env bun
663
+ * import { defineCodeJudge, createTargetClient } from '@agentv/eval';
664
+ *
665
+ * export default defineCodeJudge(async ({ question }) => {
666
+ * const target = createTargetClient();
667
+ * if (!target) {
668
+ * return { score: 0, misses: ['Target not available'] };
669
+ * }
670
+ *
671
+ * const response = await target.invoke({
672
+ * question: `Evaluate: ${question}`,
673
+ * systemPrompt: 'Respond with JSON: { "score": 0-1 }'
674
+ * });
675
+ *
676
+ * const result = JSON.parse(response.rawText ?? '{}');
677
+ * return { score: result.score ?? 0 };
678
+ * });
679
+ * ```
680
+ *
401
681
  * @packageDocumentation
402
682
  */
403
683
 
@@ -447,4 +727,4 @@ type CodeJudgeHandler = (input: CodeJudgeInput) => CodeJudgeResult | Promise<Cod
447
727
  */
448
728
  declare function defineCodeJudge(handler: CodeJudgeHandler): void;
449
729
 
450
- export { type CodeJudgeHandler, type CodeJudgeInput, CodeJudgeInputSchema, type CodeJudgeResult, CodeJudgeResultSchema, type OutputMessage, OutputMessageSchema, type TokenUsage, TokenUsageSchema, type ToolCall, ToolCallSchema, type TraceSummary, TraceSummarySchema, defineCodeJudge };
730
+ export { type CodeJudgeHandler, type CodeJudgeInput, CodeJudgeInputSchema, type CodeJudgeResult, CodeJudgeResultSchema, type Message, MessageSchema, type TargetClient, type TargetInfo, TargetInvocationError, type TargetInvokeRequest, type TargetInvokeResponse, TargetNotAvailableError, type TokenUsage, TokenUsageSchema, type ToolCall, ToolCallSchema, type TraceSummary, TraceSummarySchema, createTargetClient, defineCodeJudge };