@agentv/eval 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -74,7 +74,7 @@ declare const TraceSummarySchema: z.ZodObject<{
74
74
  toolDurations?: Record<string, number[]> | undefined;
75
75
  }>;
76
76
  /**
77
- * Tool call schema for output messages.
77
+ * Tool call schema.
78
78
  */
79
79
  declare const ToolCallSchema: z.ZodObject<{
80
80
  tool: z.ZodString;
@@ -96,11 +96,10 @@ declare const ToolCallSchema: z.ZodObject<{
96
96
  timestamp?: string | undefined;
97
97
  }>;
98
98
  /**
99
- * Output message schema.
99
+ * Unified message schema for input, expected, and output messages.
100
100
  */
101
- declare const OutputMessageSchema: z.ZodObject<{
101
+ declare const MessageSchema: z.ZodObject<{
102
102
  role: z.ZodEnum<["assistant", "user", "system", "tool"]>;
103
- name: z.ZodOptional<z.ZodString>;
104
103
  content: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodRecord<z.ZodString, z.ZodUnknown>, z.ZodArray<z.ZodRecord<z.ZodString, z.ZodUnknown>, "many">]>>;
105
104
  toolCalls: z.ZodOptional<z.ZodArray<z.ZodObject<{
106
105
  tool: z.ZodString;
@@ -121,12 +120,12 @@ declare const OutputMessageSchema: z.ZodObject<{
121
120
  id?: string | undefined;
122
121
  timestamp?: string | undefined;
123
122
  }>, "many">>;
123
+ name: z.ZodOptional<z.ZodString>;
124
124
  timestamp: z.ZodOptional<z.ZodString>;
125
125
  metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
126
126
  }, "strip", z.ZodTypeAny, {
127
127
  role: "tool" | "assistant" | "user" | "system";
128
128
  timestamp?: string | undefined;
129
- name?: string | undefined;
130
129
  content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
131
130
  toolCalls?: {
132
131
  tool: string;
@@ -135,11 +134,11 @@ declare const OutputMessageSchema: z.ZodObject<{
135
134
  id?: string | undefined;
136
135
  timestamp?: string | undefined;
137
136
  }[] | undefined;
137
+ name?: string | undefined;
138
138
  metadata?: Record<string, unknown> | undefined;
139
139
  }, {
140
140
  role: "tool" | "assistant" | "user" | "system";
141
141
  timestamp?: string | undefined;
142
- name?: string | undefined;
143
142
  content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
144
143
  toolCalls?: {
145
144
  tool: string;
@@ -148,6 +147,7 @@ declare const OutputMessageSchema: z.ZodObject<{
148
147
  id?: string | undefined;
149
148
  timestamp?: string | undefined;
150
149
  }[] | undefined;
150
+ name?: string | undefined;
151
151
  metadata?: Record<string, unknown> | undefined;
152
152
  }>;
153
153
  /**
@@ -156,12 +156,62 @@ declare const OutputMessageSchema: z.ZodObject<{
156
156
  declare const CodeJudgeInputSchema: z.ZodObject<{
157
157
  question: z.ZodString;
158
158
  expectedOutcome: z.ZodString;
159
- expectedMessages: z.ZodArray<z.ZodRecord<z.ZodString, z.ZodUnknown>, "many">;
159
+ expectedMessages: z.ZodArray<z.ZodObject<{
160
+ role: z.ZodEnum<["assistant", "user", "system", "tool"]>;
161
+ content: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodRecord<z.ZodString, z.ZodUnknown>, z.ZodArray<z.ZodRecord<z.ZodString, z.ZodUnknown>, "many">]>>;
162
+ toolCalls: z.ZodOptional<z.ZodArray<z.ZodObject<{
163
+ tool: z.ZodString;
164
+ input: z.ZodOptional<z.ZodUnknown>;
165
+ output: z.ZodOptional<z.ZodUnknown>;
166
+ id: z.ZodOptional<z.ZodString>;
167
+ timestamp: z.ZodOptional<z.ZodString>;
168
+ }, "strip", z.ZodTypeAny, {
169
+ tool: string;
170
+ input?: unknown;
171
+ output?: unknown;
172
+ id?: string | undefined;
173
+ timestamp?: string | undefined;
174
+ }, {
175
+ tool: string;
176
+ input?: unknown;
177
+ output?: unknown;
178
+ id?: string | undefined;
179
+ timestamp?: string | undefined;
180
+ }>, "many">>;
181
+ name: z.ZodOptional<z.ZodString>;
182
+ timestamp: z.ZodOptional<z.ZodString>;
183
+ metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
184
+ }, "strip", z.ZodTypeAny, {
185
+ role: "tool" | "assistant" | "user" | "system";
186
+ timestamp?: string | undefined;
187
+ content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
188
+ toolCalls?: {
189
+ tool: string;
190
+ input?: unknown;
191
+ output?: unknown;
192
+ id?: string | undefined;
193
+ timestamp?: string | undefined;
194
+ }[] | undefined;
195
+ name?: string | undefined;
196
+ metadata?: Record<string, unknown> | undefined;
197
+ }, {
198
+ role: "tool" | "assistant" | "user" | "system";
199
+ timestamp?: string | undefined;
200
+ content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
201
+ toolCalls?: {
202
+ tool: string;
203
+ input?: unknown;
204
+ output?: unknown;
205
+ id?: string | undefined;
206
+ timestamp?: string | undefined;
207
+ }[] | undefined;
208
+ name?: string | undefined;
209
+ metadata?: Record<string, unknown> | undefined;
210
+ }>, "many">;
160
211
  referenceAnswer: z.ZodOptional<z.ZodString>;
161
212
  candidateAnswer: z.ZodString;
162
213
  outputMessages: z.ZodOptional<z.ZodNullable<z.ZodArray<z.ZodObject<{
163
214
  role: z.ZodEnum<["assistant", "user", "system", "tool"]>;
164
- name: z.ZodOptional<z.ZodString>;
165
215
  content: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodRecord<z.ZodString, z.ZodUnknown>, z.ZodArray<z.ZodRecord<z.ZodString, z.ZodUnknown>, "many">]>>;
166
216
  toolCalls: z.ZodOptional<z.ZodArray<z.ZodObject<{
167
217
  tool: z.ZodString;
@@ -182,12 +232,12 @@ declare const CodeJudgeInputSchema: z.ZodObject<{
182
232
  id?: string | undefined;
183
233
  timestamp?: string | undefined;
184
234
  }>, "many">>;
235
+ name: z.ZodOptional<z.ZodString>;
185
236
  timestamp: z.ZodOptional<z.ZodString>;
186
237
  metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
187
238
  }, "strip", z.ZodTypeAny, {
188
239
  role: "tool" | "assistant" | "user" | "system";
189
240
  timestamp?: string | undefined;
190
- name?: string | undefined;
191
241
  content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
192
242
  toolCalls?: {
193
243
  tool: string;
@@ -196,11 +246,11 @@ declare const CodeJudgeInputSchema: z.ZodObject<{
196
246
  id?: string | undefined;
197
247
  timestamp?: string | undefined;
198
248
  }[] | undefined;
249
+ name?: string | undefined;
199
250
  metadata?: Record<string, unknown> | undefined;
200
251
  }, {
201
252
  role: "tool" | "assistant" | "user" | "system";
202
253
  timestamp?: string | undefined;
203
- name?: string | undefined;
204
254
  content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
205
255
  toolCalls?: {
206
256
  tool: string;
@@ -209,19 +259,62 @@ declare const CodeJudgeInputSchema: z.ZodObject<{
209
259
  id?: string | undefined;
210
260
  timestamp?: string | undefined;
211
261
  }[] | undefined;
262
+ name?: string | undefined;
212
263
  metadata?: Record<string, unknown> | undefined;
213
264
  }>, "many">>>;
214
265
  guidelineFiles: z.ZodArray<z.ZodString, "many">;
215
266
  inputFiles: z.ZodArray<z.ZodString, "many">;
216
267
  inputMessages: z.ZodArray<z.ZodObject<{
217
- role: z.ZodEnum<["system", "user", "assistant", "tool"]>;
218
- content: z.ZodUnion<[z.ZodString, z.ZodRecord<z.ZodString, z.ZodUnknown>, z.ZodArray<z.ZodRecord<z.ZodString, z.ZodUnknown>, "many">]>;
268
+ role: z.ZodEnum<["assistant", "user", "system", "tool"]>;
269
+ content: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodRecord<z.ZodString, z.ZodUnknown>, z.ZodArray<z.ZodRecord<z.ZodString, z.ZodUnknown>, "many">]>>;
270
+ toolCalls: z.ZodOptional<z.ZodArray<z.ZodObject<{
271
+ tool: z.ZodString;
272
+ input: z.ZodOptional<z.ZodUnknown>;
273
+ output: z.ZodOptional<z.ZodUnknown>;
274
+ id: z.ZodOptional<z.ZodString>;
275
+ timestamp: z.ZodOptional<z.ZodString>;
276
+ }, "strip", z.ZodTypeAny, {
277
+ tool: string;
278
+ input?: unknown;
279
+ output?: unknown;
280
+ id?: string | undefined;
281
+ timestamp?: string | undefined;
282
+ }, {
283
+ tool: string;
284
+ input?: unknown;
285
+ output?: unknown;
286
+ id?: string | undefined;
287
+ timestamp?: string | undefined;
288
+ }>, "many">>;
289
+ name: z.ZodOptional<z.ZodString>;
290
+ timestamp: z.ZodOptional<z.ZodString>;
291
+ metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
219
292
  }, "strip", z.ZodTypeAny, {
220
293
  role: "tool" | "assistant" | "user" | "system";
221
- content: string | Record<string, unknown> | Record<string, unknown>[];
294
+ timestamp?: string | undefined;
295
+ content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
296
+ toolCalls?: {
297
+ tool: string;
298
+ input?: unknown;
299
+ output?: unknown;
300
+ id?: string | undefined;
301
+ timestamp?: string | undefined;
302
+ }[] | undefined;
303
+ name?: string | undefined;
304
+ metadata?: Record<string, unknown> | undefined;
222
305
  }, {
223
306
  role: "tool" | "assistant" | "user" | "system";
224
- content: string | Record<string, unknown> | Record<string, unknown>[];
307
+ timestamp?: string | undefined;
308
+ content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
309
+ toolCalls?: {
310
+ tool: string;
311
+ input?: unknown;
312
+ output?: unknown;
313
+ id?: string | undefined;
314
+ timestamp?: string | undefined;
315
+ }[] | undefined;
316
+ name?: string | undefined;
317
+ metadata?: Record<string, unknown> | undefined;
225
318
  }>, "many">;
226
319
  traceSummary: z.ZodOptional<z.ZodNullable<z.ZodObject<{
227
320
  eventCount: z.ZodNumber;
@@ -275,19 +368,41 @@ declare const CodeJudgeInputSchema: z.ZodObject<{
275
368
  }, "strip", z.ZodTypeAny, {
276
369
  question: string;
277
370
  expectedOutcome: string;
278
- expectedMessages: Record<string, unknown>[];
371
+ expectedMessages: {
372
+ role: "tool" | "assistant" | "user" | "system";
373
+ timestamp?: string | undefined;
374
+ content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
375
+ toolCalls?: {
376
+ tool: string;
377
+ input?: unknown;
378
+ output?: unknown;
379
+ id?: string | undefined;
380
+ timestamp?: string | undefined;
381
+ }[] | undefined;
382
+ name?: string | undefined;
383
+ metadata?: Record<string, unknown> | undefined;
384
+ }[];
279
385
  candidateAnswer: string;
280
386
  guidelineFiles: string[];
281
387
  inputFiles: string[];
282
388
  inputMessages: {
283
389
  role: "tool" | "assistant" | "user" | "system";
284
- content: string | Record<string, unknown> | Record<string, unknown>[];
390
+ timestamp?: string | undefined;
391
+ content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
392
+ toolCalls?: {
393
+ tool: string;
394
+ input?: unknown;
395
+ output?: unknown;
396
+ id?: string | undefined;
397
+ timestamp?: string | undefined;
398
+ }[] | undefined;
399
+ name?: string | undefined;
400
+ metadata?: Record<string, unknown> | undefined;
285
401
  }[];
286
402
  referenceAnswer?: string | undefined;
287
403
  outputMessages?: {
288
404
  role: "tool" | "assistant" | "user" | "system";
289
405
  timestamp?: string | undefined;
290
- name?: string | undefined;
291
406
  content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
292
407
  toolCalls?: {
293
408
  tool: string;
@@ -296,6 +411,7 @@ declare const CodeJudgeInputSchema: z.ZodObject<{
296
411
  id?: string | undefined;
297
412
  timestamp?: string | undefined;
298
413
  }[] | undefined;
414
+ name?: string | undefined;
299
415
  metadata?: Record<string, unknown> | undefined;
300
416
  }[] | null | undefined;
301
417
  traceSummary?: {
@@ -316,19 +432,41 @@ declare const CodeJudgeInputSchema: z.ZodObject<{
316
432
  }, {
317
433
  question: string;
318
434
  expectedOutcome: string;
319
- expectedMessages: Record<string, unknown>[];
435
+ expectedMessages: {
436
+ role: "tool" | "assistant" | "user" | "system";
437
+ timestamp?: string | undefined;
438
+ content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
439
+ toolCalls?: {
440
+ tool: string;
441
+ input?: unknown;
442
+ output?: unknown;
443
+ id?: string | undefined;
444
+ timestamp?: string | undefined;
445
+ }[] | undefined;
446
+ name?: string | undefined;
447
+ metadata?: Record<string, unknown> | undefined;
448
+ }[];
320
449
  candidateAnswer: string;
321
450
  guidelineFiles: string[];
322
451
  inputFiles: string[];
323
452
  inputMessages: {
324
453
  role: "tool" | "assistant" | "user" | "system";
325
- content: string | Record<string, unknown> | Record<string, unknown>[];
454
+ timestamp?: string | undefined;
455
+ content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
456
+ toolCalls?: {
457
+ tool: string;
458
+ input?: unknown;
459
+ output?: unknown;
460
+ id?: string | undefined;
461
+ timestamp?: string | undefined;
462
+ }[] | undefined;
463
+ name?: string | undefined;
464
+ metadata?: Record<string, unknown> | undefined;
326
465
  }[];
327
466
  referenceAnswer?: string | undefined;
328
467
  outputMessages?: {
329
468
  role: "tool" | "assistant" | "user" | "system";
330
469
  timestamp?: string | undefined;
331
- name?: string | undefined;
332
470
  content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
333
471
  toolCalls?: {
334
472
  tool: string;
@@ -337,6 +475,7 @@ declare const CodeJudgeInputSchema: z.ZodObject<{
337
475
  id?: string | undefined;
338
476
  timestamp?: string | undefined;
339
477
  }[] | undefined;
478
+ name?: string | undefined;
340
479
  metadata?: Record<string, unknown> | undefined;
341
480
  }[] | null | undefined;
342
481
  traceSummary?: {
@@ -363,16 +502,20 @@ declare const CodeJudgeResultSchema: z.ZodObject<{
363
502
  hits: z.ZodDefault<z.ZodOptional<z.ZodArray<z.ZodString, "many">>>;
364
503
  misses: z.ZodDefault<z.ZodOptional<z.ZodArray<z.ZodString, "many">>>;
365
504
  reasoning: z.ZodOptional<z.ZodString>;
505
+ /** Optional structured details for domain-specific metrics (e.g., TP/TN/FP/FN counts, alignments). */
506
+ details: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
366
507
  }, "strip", z.ZodTypeAny, {
367
508
  score: number;
368
509
  hits: string[];
369
510
  misses: string[];
370
511
  reasoning?: string | undefined;
512
+ details?: Record<string, unknown> | undefined;
371
513
  }, {
372
514
  score: number;
373
515
  hits?: string[] | undefined;
374
516
  misses?: string[] | undefined;
375
517
  reasoning?: string | undefined;
518
+ details?: Record<string, unknown> | undefined;
376
519
  }>;
377
520
  /**
378
521
  * Inferred types from schemas.
@@ -380,10 +523,118 @@ declare const CodeJudgeResultSchema: z.ZodObject<{
380
523
  type CodeJudgeInput = z.infer<typeof CodeJudgeInputSchema>;
381
524
  type CodeJudgeResult = z.infer<typeof CodeJudgeResultSchema>;
382
525
  type TraceSummary = z.infer<typeof TraceSummarySchema>;
383
- type OutputMessage = z.infer<typeof OutputMessageSchema>;
526
+ type Message = z.infer<typeof MessageSchema>;
384
527
  type ToolCall = z.infer<typeof ToolCallSchema>;
385
528
  type TokenUsage = z.infer<typeof TokenUsageSchema>;
386
529
 
530
+ /**
531
+ * Client for invoking configured targets from code_judge scripts.
532
+ *
533
+ * Environment variables (set automatically by AgentV when `target` config is present):
534
+ * - AGENTV_TARGET_PROXY_URL: The URL of the local proxy server
535
+ * - AGENTV_TARGET_PROXY_TOKEN: Bearer token for authentication
536
+ */
537
+ /**
538
+ * Request to invoke the target
539
+ */
540
+ interface TargetInvokeRequest {
541
+ readonly question: string;
542
+ readonly systemPrompt?: string;
543
+ readonly evalCaseId?: string;
544
+ readonly attempt?: number;
545
+ /** Optional target override - use a different target for this invocation */
546
+ readonly target?: string;
547
+ }
548
+ /**
549
+ * Response from a target invocation
550
+ */
551
+ interface TargetInvokeResponse {
552
+ readonly outputMessages: readonly unknown[];
553
+ readonly rawText?: string;
554
+ }
555
+ /**
556
+ * Information about the target proxy configuration
557
+ */
558
+ interface TargetInfo {
559
+ /** Name of the default target being used */
560
+ readonly targetName: string;
561
+ /** Maximum number of calls allowed */
562
+ readonly maxCalls: number;
563
+ /** Current number of calls made */
564
+ readonly callCount: number;
565
+ /** List of all available target names */
566
+ readonly availableTargets: readonly string[];
567
+ }
568
+ /**
569
+ * Target client for making target invocations
570
+ */
571
+ interface TargetClient {
572
+ /**
573
+ * Invoke the configured target with a prompt.
574
+ * @param request - The question and optional system prompt
575
+ * @returns The target's response with output messages and optional raw text
576
+ */
577
+ invoke(request: TargetInvokeRequest): Promise<TargetInvokeResponse>;
578
+ /**
579
+ * Invoke the target with multiple requests in sequence.
580
+ * Each request counts toward the max_calls limit.
581
+ * @param requests - Array of target requests
582
+ * @returns Array of target responses
583
+ */
584
+ invokeBatch(requests: readonly TargetInvokeRequest[]): Promise<readonly TargetInvokeResponse[]>;
585
+ /**
586
+ * Get information about the target proxy configuration.
587
+ * Returns the default target name, max calls, current call count, and available targets.
588
+ */
589
+ getInfo(): Promise<TargetInfo>;
590
+ }
591
+ /**
592
+ * Error thrown when target proxy is not available
593
+ */
594
+ declare class TargetNotAvailableError extends Error {
595
+ constructor(message: string);
596
+ }
597
+ /**
598
+ * Error thrown when target invocation fails
599
+ */
600
+ declare class TargetInvocationError extends Error {
601
+ readonly statusCode?: number;
602
+ constructor(message: string, statusCode?: number);
603
+ }
604
+ /**
605
+ * Create a target client from environment variables.
606
+ *
607
+ * This function reads the proxy URL and token from environment variables
608
+ * that are automatically set by AgentV when a `target` config block is present
609
+ * on a `code_judge` evaluator.
610
+ *
611
+ * @returns A target client if environment variables are set, otherwise undefined
612
+ * @throws TargetNotAvailableError if token is missing when URL is present
613
+ *
614
+ * @example
615
+ * ```typescript
616
+ * import { createTargetClient, defineCodeJudge } from '@agentv/eval';
617
+ *
618
+ * export default defineCodeJudge(async ({ question, expectedOutcome }) => {
619
+ * const target = createTargetClient();
620
+ *
621
+ * if (!target) {
622
+ * // Target not available - no target config on this evaluator
623
+ * return { score: 0.5, reasoning: 'Target not available' };
624
+ * }
625
+ *
626
+ * const response = await target.invoke({
627
+ * question: `Is this answer correct? Question: ${question}, Expected: ${expectedOutcome}`,
628
+ * systemPrompt: 'You are an expert evaluator. Respond with JSON: { "correct": true/false }'
629
+ * });
630
+ *
631
+ * const result = JSON.parse(response.rawText ?? '{}');
632
+ * return { score: result.correct ? 1.0 : 0.0 };
633
+ * });
634
+ * ```
635
+ */
636
+ declare function createTargetClient(): TargetClient | undefined;
637
+
387
638
  /**
388
639
  * Handler function type for code judges.
389
640
  */
@@ -394,7 +645,7 @@ type CodeJudgeHandler = (input: CodeJudgeInput) => CodeJudgeResult | Promise<Cod
394
645
  *
395
646
  * Build custom code judges for evaluating AI agent outputs.
396
647
  *
397
- * @example
648
+ * @example Basic code judge
398
649
  * ```typescript
399
650
  * #!/usr/bin/env bun
400
651
  * import { defineCodeJudge } from '@agentv/eval';
@@ -406,6 +657,27 @@ type CodeJudgeHandler = (input: CodeJudgeInput) => CodeJudgeResult | Promise<Cod
406
657
  * }));
407
658
  * ```
408
659
  *
660
+ * @example Code judge with target access (requires `target` config in YAML)
661
+ * ```typescript
662
+ * #!/usr/bin/env bun
663
+ * import { defineCodeJudge, createTargetClient } from '@agentv/eval';
664
+ *
665
+ * export default defineCodeJudge(async ({ question }) => {
666
+ * const target = createTargetClient();
667
+ * if (!target) {
668
+ * return { score: 0, misses: ['Target not available'] };
669
+ * }
670
+ *
671
+ * const response = await target.invoke({
672
+ * question: `Evaluate: ${question}`,
673
+ * systemPrompt: 'Respond with JSON: { "score": 0-1 }'
674
+ * });
675
+ *
676
+ * const result = JSON.parse(response.rawText ?? '{}');
677
+ * return { score: result.score ?? 0 };
678
+ * });
679
+ * ```
680
+ *
409
681
  * @packageDocumentation
410
682
  */
411
683
 
@@ -455,4 +727,4 @@ type CodeJudgeHandler = (input: CodeJudgeInput) => CodeJudgeResult | Promise<Cod
455
727
  */
456
728
  declare function defineCodeJudge(handler: CodeJudgeHandler): void;
457
729
 
458
- export { type CodeJudgeHandler, type CodeJudgeInput, CodeJudgeInputSchema, type CodeJudgeResult, CodeJudgeResultSchema, type OutputMessage, OutputMessageSchema, type TokenUsage, TokenUsageSchema, type ToolCall, ToolCallSchema, type TraceSummary, TraceSummarySchema, defineCodeJudge };
730
+ export { type CodeJudgeHandler, type CodeJudgeInput, CodeJudgeInputSchema, type CodeJudgeResult, CodeJudgeResultSchema, type Message, MessageSchema, type TargetClient, type TargetInfo, TargetInvocationError, type TargetInvokeRequest, type TargetInvokeResponse, TargetNotAvailableError, type TokenUsage, TokenUsageSchema, type ToolCall, ToolCallSchema, type TraceSummary, TraceSummarySchema, createTargetClient, defineCodeJudge };
package/dist/index.js CHANGED
@@ -22,29 +22,24 @@ var ToolCallSchema = z.object({
22
22
  id: z.string().optional(),
23
23
  timestamp: z.string().optional()
24
24
  });
25
- var OutputMessageSchema = z.object({
25
+ var MessageSchema = z.object({
26
26
  role: z.enum(["assistant", "user", "system", "tool"]),
27
- // Optional message name (e.g., agent name) used by some providers for multi-agent transcripts.
28
- name: z.string().optional(),
29
27
  content: z.union([z.string(), z.record(z.unknown()), z.array(z.record(z.unknown()))]).optional(),
30
28
  toolCalls: z.array(ToolCallSchema).optional(),
29
+ name: z.string().optional(),
31
30
  timestamp: z.string().optional(),
32
31
  metadata: z.record(z.unknown()).optional()
33
32
  });
34
- var TestMessageSchema = z.object({
35
- role: z.enum(["system", "user", "assistant", "tool"]),
36
- content: z.union([z.string(), z.record(z.unknown()), z.array(z.record(z.unknown()))])
37
- });
38
33
  var CodeJudgeInputSchema = z.object({
39
34
  question: z.string(),
40
35
  expectedOutcome: z.string(),
41
- expectedMessages: z.array(z.record(z.unknown())),
36
+ expectedMessages: z.array(MessageSchema),
42
37
  referenceAnswer: z.string().optional(),
43
38
  candidateAnswer: z.string(),
44
- outputMessages: z.array(OutputMessageSchema).nullable().optional(),
39
+ outputMessages: z.array(MessageSchema).nullable().optional(),
45
40
  guidelineFiles: z.array(z.string()),
46
41
  inputFiles: z.array(z.string()),
47
- inputMessages: z.array(TestMessageSchema),
42
+ inputMessages: z.array(MessageSchema),
48
43
  traceSummary: TraceSummarySchema.nullable().optional(),
49
44
  config: z.record(z.unknown()).nullable().optional()
50
45
  });
@@ -52,9 +47,119 @@ var CodeJudgeResultSchema = z.object({
52
47
  score: z.number().min(0).max(1),
53
48
  hits: z.array(z.string()).optional().default([]),
54
49
  misses: z.array(z.string()).optional().default([]),
55
- reasoning: z.string().optional()
50
+ reasoning: z.string().optional(),
51
+ /** Optional structured details for domain-specific metrics (e.g., TP/TN/FP/FN counts, alignments). */
52
+ details: z.record(z.unknown()).optional()
56
53
  });
57
54
 
55
+ // src/target-client.ts
56
+ var TargetNotAvailableError = class extends Error {
57
+ constructor(message) {
58
+ super(message);
59
+ this.name = "TargetNotAvailableError";
60
+ }
61
+ };
62
+ var TargetInvocationError = class extends Error {
63
+ statusCode;
64
+ constructor(message, statusCode) {
65
+ super(message);
66
+ this.name = "TargetInvocationError";
67
+ this.statusCode = statusCode;
68
+ }
69
+ };
70
+ function createTargetClient() {
71
+ const proxyUrl = process.env.AGENTV_TARGET_PROXY_URL;
72
+ const proxyToken = process.env.AGENTV_TARGET_PROXY_TOKEN;
73
+ if (!proxyUrl) {
74
+ return void 0;
75
+ }
76
+ if (!proxyToken) {
77
+ throw new TargetNotAvailableError(
78
+ "AGENTV_TARGET_PROXY_URL is set but AGENTV_TARGET_PROXY_TOKEN is missing"
79
+ );
80
+ }
81
+ return createTargetClientInternal(proxyUrl, proxyToken);
82
+ }
83
+ function createTargetClientInternal(url, token) {
84
+ const headers = {
85
+ "Content-Type": "application/json",
86
+ Authorization: `Bearer ${token}`
87
+ };
88
+ return {
89
+ async invoke(request) {
90
+ const response = await fetch(`${url}/invoke`, {
91
+ method: "POST",
92
+ headers,
93
+ body: JSON.stringify({
94
+ question: request.question,
95
+ systemPrompt: request.systemPrompt,
96
+ evalCaseId: request.evalCaseId,
97
+ attempt: request.attempt,
98
+ target: request.target
99
+ })
100
+ });
101
+ if (!response.ok) {
102
+ const errorBody = await response.text();
103
+ let errorMessage;
104
+ try {
105
+ const errorJson = JSON.parse(errorBody);
106
+ errorMessage = errorJson.error ?? `HTTP ${response.status}`;
107
+ } catch {
108
+ errorMessage = errorBody || `HTTP ${response.status}`;
109
+ }
110
+ throw new TargetInvocationError(errorMessage, response.status);
111
+ }
112
+ return await response.json();
113
+ },
114
+ async invokeBatch(requests) {
115
+ const response = await fetch(`${url}/invokeBatch`, {
116
+ method: "POST",
117
+ headers,
118
+ body: JSON.stringify({
119
+ requests: requests.map((r) => ({
120
+ question: r.question,
121
+ systemPrompt: r.systemPrompt,
122
+ evalCaseId: r.evalCaseId,
123
+ attempt: r.attempt,
124
+ target: r.target
125
+ }))
126
+ })
127
+ });
128
+ if (!response.ok) {
129
+ const errorBody = await response.text();
130
+ let errorMessage;
131
+ try {
132
+ const errorJson = JSON.parse(errorBody);
133
+ errorMessage = errorJson.error ?? `HTTP ${response.status}`;
134
+ } catch {
135
+ errorMessage = errorBody || `HTTP ${response.status}`;
136
+ }
137
+ throw new TargetInvocationError(errorMessage, response.status);
138
+ }
139
+ const result = await response.json();
140
+ return result.responses;
141
+ },
142
+ async getInfo() {
143
+ const response = await fetch(`${url}/info`, {
144
+ method: "GET",
145
+ headers
146
+ });
147
+ if (!response.ok) {
148
+ const errorBody = await response.text();
149
+ let errorMessage;
150
+ try {
151
+ const errorJson = JSON.parse(errorBody);
152
+ errorMessage = errorJson.error ?? `HTTP ${response.status}`;
153
+ } catch {
154
+ errorMessage = errorBody || `HTTP ${response.status}`;
155
+ }
156
+ throw new TargetInvocationError(errorMessage, response.status);
157
+ }
158
+ return await response.json();
159
+ }
160
+ };
161
+ }
162
+
58
163
  // src/index.ts
59
164
  import { z as z2 } from "zod";
60
165
 
@@ -134,10 +239,13 @@ function defineCodeJudge(handler) {
134
239
  export {
135
240
  CodeJudgeInputSchema,
136
241
  CodeJudgeResultSchema,
137
- OutputMessageSchema,
242
+ MessageSchema,
243
+ TargetInvocationError,
244
+ TargetNotAvailableError,
138
245
  TokenUsageSchema,
139
246
  ToolCallSchema,
140
247
  TraceSummarySchema,
248
+ createTargetClient,
141
249
  defineCodeJudge,
142
250
  z2 as z
143
251
  };