@agentv/core 1.3.1 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +77 -77
- package/dist/{chunk-4A6L2F6L.js → chunk-E2VSU4WZ.js} +282 -81
- package/dist/chunk-E2VSU4WZ.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +82 -67
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +3 -68
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +1668 -489
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +279 -77
- package/dist/index.d.ts +279 -77
- package/dist/index.js +1334 -356
- package/dist/index.js.map +1 -1
- package/package.json +2 -5
- package/dist/chunk-4A6L2F6L.js.map +0 -1
package/dist/index.d.cts
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import { z } from 'zod';
|
|
1
2
|
import * as ai from 'ai';
|
|
2
3
|
|
|
3
4
|
/**
|
|
@@ -5,30 +6,15 @@ import * as ai from 'ai';
|
|
|
5
6
|
* Provides a normalized, provider-agnostic model for tool-call trajectories.
|
|
6
7
|
*/
|
|
7
8
|
/**
|
|
8
|
-
*
|
|
9
|
+
* Token usage metrics from provider execution.
|
|
9
10
|
*/
|
|
10
|
-
|
|
11
|
-
/**
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
readonly type: TraceEventType;
|
|
18
|
-
/** ISO 8601 timestamp */
|
|
19
|
-
readonly timestamp: string;
|
|
20
|
-
/** Stable identifier for pairing tool_call/tool_result */
|
|
21
|
-
readonly id?: string;
|
|
22
|
-
/** Tool name (for tool_call/tool_result) */
|
|
23
|
-
readonly name?: string;
|
|
24
|
-
/** Tool input - any JSON value */
|
|
25
|
-
readonly input?: unknown;
|
|
26
|
-
/** Tool output - any JSON value */
|
|
27
|
-
readonly output?: unknown;
|
|
28
|
-
/** Message content (for message/model_step) */
|
|
29
|
-
readonly text?: string;
|
|
30
|
-
/** Provider-specific metadata */
|
|
31
|
-
readonly metadata?: Record<string, unknown>;
|
|
11
|
+
interface TokenUsage {
|
|
12
|
+
/** Input/prompt tokens consumed */
|
|
13
|
+
readonly input: number;
|
|
14
|
+
/** Output/completion tokens generated */
|
|
15
|
+
readonly output: number;
|
|
16
|
+
/** Cached tokens (optional, provider-specific) */
|
|
17
|
+
readonly cached?: number;
|
|
32
18
|
}
|
|
33
19
|
/**
|
|
34
20
|
* Compact summary of a trace for lightweight persistence.
|
|
@@ -43,6 +29,14 @@ interface TraceSummary {
|
|
|
43
29
|
readonly toolCallsByName: Readonly<Record<string, number>>;
|
|
44
30
|
/** Number of error events */
|
|
45
31
|
readonly errorCount: number;
|
|
32
|
+
/** Token usage metrics (optional, from provider) */
|
|
33
|
+
readonly tokenUsage?: TokenUsage;
|
|
34
|
+
/** Total cost in USD (optional, from provider) */
|
|
35
|
+
readonly costUsd?: number;
|
|
36
|
+
/** Total execution duration in milliseconds (optional) */
|
|
37
|
+
readonly durationMs?: number;
|
|
38
|
+
/** Per-tool duration arrays in milliseconds (optional) */
|
|
39
|
+
readonly toolDurations?: Readonly<Record<string, readonly number[]>>;
|
|
46
40
|
}
|
|
47
41
|
/**
|
|
48
42
|
* Configuration for tool_trajectory evaluator.
|
|
@@ -64,20 +58,70 @@ interface ToolTrajectoryEvaluatorConfig {
|
|
|
64
58
|
*/
|
|
65
59
|
interface ToolTrajectoryExpectedItem {
|
|
66
60
|
readonly tool: string;
|
|
61
|
+
/** Optional argument matching: 'any' skips validation, object performs partial deep equality */
|
|
62
|
+
readonly args?: 'any' | Record<string, unknown>;
|
|
67
63
|
}
|
|
68
64
|
/**
|
|
69
|
-
*
|
|
65
|
+
* Simplified input type for computeTraceSummary.
|
|
66
|
+
* Matches OutputMessage structure without requiring full provider/types import.
|
|
70
67
|
*/
|
|
71
|
-
|
|
68
|
+
interface OutputMessageLike {
|
|
69
|
+
readonly toolCalls?: readonly {
|
|
70
|
+
readonly tool: string;
|
|
71
|
+
}[];
|
|
72
|
+
}
|
|
72
73
|
/**
|
|
73
|
-
*
|
|
74
|
+
* Compute a lightweight summary from output messages.
|
|
75
|
+
* Used for default result persistence without payload bloat.
|
|
74
76
|
*/
|
|
75
|
-
declare function
|
|
77
|
+
declare function computeTraceSummary(messages: readonly OutputMessageLike[]): TraceSummary;
|
|
76
78
|
/**
|
|
77
|
-
*
|
|
78
|
-
*
|
|
79
|
+
* Default tool names considered as exploration/read-only operations.
|
|
80
|
+
* Can be overridden per-evaluation via config.
|
|
81
|
+
*/
|
|
82
|
+
declare const DEFAULT_EXPLORATION_TOOLS: readonly ["read", "grep", "glob", "search", "list", "Read", "Grep", "Glob", "WebSearch", "WebFetch"];
|
|
83
|
+
/**
|
|
84
|
+
* Ratio of exploration tool calls to total tool calls.
|
|
85
|
+
* Returns undefined if there are no tool calls.
|
|
86
|
+
*
|
|
87
|
+
* @param summary - Trace summary with tool call counts
|
|
88
|
+
* @param explorationTools - Tool names considered exploration (defaults to DEFAULT_EXPLORATION_TOOLS)
|
|
89
|
+
* @returns Ratio between 0 and 1, or undefined if no tool calls
|
|
90
|
+
*/
|
|
91
|
+
declare function explorationRatio(summary: TraceSummary, explorationTools?: readonly string[]): number | undefined;
|
|
92
|
+
/**
|
|
93
|
+
* Average tokens consumed per tool call.
|
|
94
|
+
* Returns undefined if tokenUsage is not available or no tool calls.
|
|
95
|
+
*
|
|
96
|
+
* @param summary - Trace summary with optional token usage
|
|
97
|
+
* @returns Average tokens per tool call, or undefined
|
|
98
|
+
*/
|
|
99
|
+
declare function tokensPerTool(summary: TraceSummary): number | undefined;
|
|
100
|
+
/**
|
|
101
|
+
* Average tool duration across all tool calls.
|
|
102
|
+
* Returns undefined if toolDurations is not available or empty.
|
|
103
|
+
*
|
|
104
|
+
* @param summary - Trace summary with optional tool durations
|
|
105
|
+
* @returns Average duration in milliseconds, or undefined
|
|
106
|
+
*/
|
|
107
|
+
declare function avgToolDurationMs(summary: TraceSummary): number | undefined;
|
|
108
|
+
/**
|
|
109
|
+
* Execution metrics from provider response.
|
|
110
|
+
*/
|
|
111
|
+
interface ExecutionMetrics {
|
|
112
|
+
readonly tokenUsage?: TokenUsage;
|
|
113
|
+
readonly costUsd?: number;
|
|
114
|
+
readonly durationMs?: number;
|
|
115
|
+
}
|
|
116
|
+
/**
|
|
117
|
+
* Merge execution metrics from provider response into a trace summary.
|
|
118
|
+
* Returns a new TraceSummary with metrics fields populated.
|
|
119
|
+
*
|
|
120
|
+
* @param summary - Base trace summary from computeTraceSummary
|
|
121
|
+
* @param metrics - Optional execution metrics from provider
|
|
122
|
+
* @returns TraceSummary with merged metrics
|
|
79
123
|
*/
|
|
80
|
-
declare function
|
|
124
|
+
declare function mergeExecutionMetrics(summary: TraceSummary, metrics?: ExecutionMetrics): TraceSummary;
|
|
81
125
|
|
|
82
126
|
/**
|
|
83
127
|
* JSON primitive values appearing in AgentV payloads.
|
|
@@ -229,25 +273,23 @@ interface EvalCase {
|
|
|
229
273
|
*/
|
|
230
274
|
interface EvaluationResult {
|
|
231
275
|
readonly timestamp: string;
|
|
232
|
-
readonly
|
|
276
|
+
readonly evalId: string;
|
|
233
277
|
readonly dataset?: string;
|
|
234
|
-
readonly
|
|
278
|
+
readonly conversationId?: string;
|
|
235
279
|
readonly score: number;
|
|
236
280
|
readonly hits: readonly string[];
|
|
237
281
|
readonly misses: readonly string[];
|
|
238
|
-
readonly
|
|
282
|
+
readonly candidateAnswer: string;
|
|
239
283
|
readonly target: string;
|
|
240
284
|
readonly reasoning?: string;
|
|
241
|
-
readonly
|
|
242
|
-
readonly
|
|
243
|
-
readonly
|
|
244
|
-
readonly
|
|
245
|
-
readonly
|
|
285
|
+
readonly rawAspects?: readonly string[];
|
|
286
|
+
readonly agentProviderRequest?: JsonObject;
|
|
287
|
+
readonly lmProviderRequest?: JsonObject;
|
|
288
|
+
readonly evaluatorProviderRequest?: JsonObject;
|
|
289
|
+
readonly evaluatorResults?: readonly EvaluatorResult[];
|
|
246
290
|
readonly error?: string;
|
|
247
291
|
/** Lightweight summary of the execution trace (always included when available) */
|
|
248
|
-
readonly
|
|
249
|
-
/** Full trace events (only included when --include-trace flag is set) */
|
|
250
|
-
readonly trace?: readonly TraceEvent[];
|
|
292
|
+
readonly traceSummary?: TraceSummary;
|
|
251
293
|
}
|
|
252
294
|
type EvaluationVerdict = 'pass' | 'fail' | 'borderline';
|
|
253
295
|
interface EvaluatorResult {
|
|
@@ -259,9 +301,9 @@ interface EvaluatorResult {
|
|
|
259
301
|
readonly hits: readonly string[];
|
|
260
302
|
readonly misses: readonly string[];
|
|
261
303
|
readonly reasoning?: string;
|
|
262
|
-
readonly
|
|
263
|
-
readonly
|
|
264
|
-
readonly
|
|
304
|
+
readonly rawRequest?: JsonObject;
|
|
305
|
+
readonly evaluatorProviderRequest?: JsonObject;
|
|
306
|
+
readonly evaluatorResults?: readonly EvaluatorResult[];
|
|
265
307
|
}
|
|
266
308
|
/**
|
|
267
309
|
* Convenience accessor matching the Python hit_count property.
|
|
@@ -275,7 +317,7 @@ interface ChatMessage {
|
|
|
275
317
|
readonly name?: string;
|
|
276
318
|
}
|
|
277
319
|
type ChatPrompt = readonly ChatMessage[];
|
|
278
|
-
type ProviderKind = 'azure' | 'anthropic' | 'gemini' | 'codex' | 'cli' | 'mock' | 'vscode' | 'vscode-insiders';
|
|
320
|
+
type ProviderKind = 'azure' | 'anthropic' | 'gemini' | 'codex' | 'pi-coding-agent' | 'cli' | 'mock' | 'vscode' | 'vscode-insiders';
|
|
279
321
|
interface ProviderRequest {
|
|
280
322
|
readonly question: string;
|
|
281
323
|
readonly systemPrompt?: string;
|
|
@@ -290,15 +332,62 @@ interface ProviderRequest {
|
|
|
290
332
|
readonly metadata?: JsonObject;
|
|
291
333
|
readonly signal?: AbortSignal;
|
|
292
334
|
}
|
|
335
|
+
/**
|
|
336
|
+
* A tool call within an output message.
|
|
337
|
+
* Represents a single tool invocation with its input and optional output.
|
|
338
|
+
*/
|
|
339
|
+
interface ToolCall {
|
|
340
|
+
/** Tool name */
|
|
341
|
+
readonly tool: string;
|
|
342
|
+
/** Tool input arguments */
|
|
343
|
+
readonly input?: unknown;
|
|
344
|
+
/** Tool output result */
|
|
345
|
+
readonly output?: unknown;
|
|
346
|
+
/** Stable identifier for pairing tool calls */
|
|
347
|
+
readonly id?: string;
|
|
348
|
+
/** ISO 8601 timestamp */
|
|
349
|
+
readonly timestamp?: string;
|
|
350
|
+
}
|
|
351
|
+
/**
|
|
352
|
+
* An output message from agent execution.
|
|
353
|
+
* Represents a single message in the conversation with optional tool calls.
|
|
354
|
+
*/
|
|
355
|
+
interface OutputMessage {
|
|
356
|
+
/** Message role (e.g., 'assistant', 'user', 'tool') */
|
|
357
|
+
readonly role: string;
|
|
358
|
+
/** Optional name for the message sender */
|
|
359
|
+
readonly name?: string;
|
|
360
|
+
/** Message content */
|
|
361
|
+
readonly content?: unknown;
|
|
362
|
+
/** Tool calls made in this message */
|
|
363
|
+
readonly toolCalls?: readonly ToolCall[];
|
|
364
|
+
/** ISO 8601 timestamp */
|
|
365
|
+
readonly timestamp?: string;
|
|
366
|
+
/** Provider-specific metadata */
|
|
367
|
+
readonly metadata?: Record<string, unknown>;
|
|
368
|
+
}
|
|
369
|
+
/**
|
|
370
|
+
* Token usage metrics reported by provider.
|
|
371
|
+
*/
|
|
372
|
+
interface ProviderTokenUsage {
|
|
373
|
+
/** Input/prompt tokens consumed */
|
|
374
|
+
readonly input: number;
|
|
375
|
+
/** Output/completion tokens generated */
|
|
376
|
+
readonly output: number;
|
|
377
|
+
/** Cached tokens (optional, provider-specific) */
|
|
378
|
+
readonly cached?: number;
|
|
379
|
+
}
|
|
293
380
|
interface ProviderResponse {
|
|
294
|
-
readonly text: string;
|
|
295
|
-
readonly reasoning?: string;
|
|
296
381
|
readonly raw?: unknown;
|
|
297
382
|
readonly usage?: JsonObject;
|
|
298
|
-
/**
|
|
299
|
-
readonly
|
|
300
|
-
/**
|
|
301
|
-
readonly
|
|
383
|
+
/** Output messages from agent execution (primary source for tool trajectory) */
|
|
384
|
+
readonly outputMessages?: readonly OutputMessage[];
|
|
385
|
+
/** Token usage metrics (optional) */
|
|
386
|
+
readonly tokenUsage?: ProviderTokenUsage;
|
|
387
|
+
/** Total cost in USD (optional) */
|
|
388
|
+
readonly costUsd?: number;
|
|
389
|
+
/** Execution duration in milliseconds (optional) */
|
|
390
|
+
readonly durationMs?: number;
|
|
302
391
|
}
|
|
303
392
|
interface Provider {
|
|
304
393
|
readonly id: string;
|
|
@@ -479,6 +568,101 @@ declare function resolveFileReference(rawValue: string, searchRoots: readonly st
|
|
|
479
568
|
readonly attempted: readonly string[];
|
|
480
569
|
}>;
|
|
481
570
|
|
|
571
|
+
/**
|
|
572
|
+
* Strict normalized schema for CLI target configuration.
|
|
573
|
+
* This is the final validated shape after environment variable resolution
|
|
574
|
+
* and snake_case to camelCase normalization.
|
|
575
|
+
*
|
|
576
|
+
* Uses .strict() to reject unknown properties, ensuring configuration
|
|
577
|
+
* errors are caught early rather than silently ignored.
|
|
578
|
+
*
|
|
579
|
+
* @example
|
|
580
|
+
* ```typescript
|
|
581
|
+
* const config: CliNormalizedConfig = {
|
|
582
|
+
* commandTemplate: 'agent run {PROMPT}',
|
|
583
|
+
* timeoutMs: 120000,
|
|
584
|
+
* verbose: true,
|
|
585
|
+
* };
|
|
586
|
+
* CliTargetConfigSchema.parse(config); // Validates the normalized config
|
|
587
|
+
* ```
|
|
588
|
+
*/
|
|
589
|
+
declare const CliTargetConfigSchema: z.ZodObject<{
|
|
590
|
+
commandTemplate: z.ZodString;
|
|
591
|
+
filesFormat: z.ZodOptional<z.ZodString>;
|
|
592
|
+
cwd: z.ZodOptional<z.ZodString>;
|
|
593
|
+
timeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
594
|
+
healthcheck: z.ZodOptional<z.ZodDiscriminatedUnion<"type", [z.ZodObject<{
|
|
595
|
+
type: z.ZodLiteral<"http">;
|
|
596
|
+
url: z.ZodString;
|
|
597
|
+
timeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
598
|
+
}, "strict", z.ZodTypeAny, {
|
|
599
|
+
type: "http";
|
|
600
|
+
url: string;
|
|
601
|
+
timeoutMs?: number | undefined;
|
|
602
|
+
}, {
|
|
603
|
+
type: "http";
|
|
604
|
+
url: string;
|
|
605
|
+
timeoutMs?: number | undefined;
|
|
606
|
+
}>, z.ZodObject<{
|
|
607
|
+
type: z.ZodLiteral<"command">;
|
|
608
|
+
commandTemplate: z.ZodString;
|
|
609
|
+
cwd: z.ZodOptional<z.ZodString>;
|
|
610
|
+
timeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
611
|
+
}, "strict", z.ZodTypeAny, {
|
|
612
|
+
type: "command";
|
|
613
|
+
commandTemplate: string;
|
|
614
|
+
cwd?: string | undefined;
|
|
615
|
+
timeoutMs?: number | undefined;
|
|
616
|
+
}, {
|
|
617
|
+
type: "command";
|
|
618
|
+
commandTemplate: string;
|
|
619
|
+
cwd?: string | undefined;
|
|
620
|
+
timeoutMs?: number | undefined;
|
|
621
|
+
}>]>>;
|
|
622
|
+
verbose: z.ZodOptional<z.ZodBoolean>;
|
|
623
|
+
keepTempFiles: z.ZodOptional<z.ZodBoolean>;
|
|
624
|
+
}, "strict", z.ZodTypeAny, {
|
|
625
|
+
commandTemplate: string;
|
|
626
|
+
cwd?: string | undefined;
|
|
627
|
+
verbose?: boolean | undefined;
|
|
628
|
+
filesFormat?: string | undefined;
|
|
629
|
+
healthcheck?: {
|
|
630
|
+
type: "http";
|
|
631
|
+
url: string;
|
|
632
|
+
timeoutMs?: number | undefined;
|
|
633
|
+
} | {
|
|
634
|
+
type: "command";
|
|
635
|
+
commandTemplate: string;
|
|
636
|
+
cwd?: string | undefined;
|
|
637
|
+
timeoutMs?: number | undefined;
|
|
638
|
+
} | undefined;
|
|
639
|
+
keepTempFiles?: boolean | undefined;
|
|
640
|
+
timeoutMs?: number | undefined;
|
|
641
|
+
}, {
|
|
642
|
+
commandTemplate: string;
|
|
643
|
+
cwd?: string | undefined;
|
|
644
|
+
verbose?: boolean | undefined;
|
|
645
|
+
filesFormat?: string | undefined;
|
|
646
|
+
healthcheck?: {
|
|
647
|
+
type: "http";
|
|
648
|
+
url: string;
|
|
649
|
+
timeoutMs?: number | undefined;
|
|
650
|
+
} | {
|
|
651
|
+
type: "command";
|
|
652
|
+
commandTemplate: string;
|
|
653
|
+
cwd?: string | undefined;
|
|
654
|
+
timeoutMs?: number | undefined;
|
|
655
|
+
} | undefined;
|
|
656
|
+
keepTempFiles?: boolean | undefined;
|
|
657
|
+
timeoutMs?: number | undefined;
|
|
658
|
+
}>;
|
|
659
|
+
type CliNormalizedConfig = z.infer<typeof CliTargetConfigSchema>;
|
|
660
|
+
/**
|
|
661
|
+
* Resolved CLI configuration type derived from CliTargetConfigSchema.
|
|
662
|
+
* This is the final validated shape used by the CLI provider at runtime.
|
|
663
|
+
* Using Readonly to ensure immutability for runtime safety.
|
|
664
|
+
*/
|
|
665
|
+
type CliResolvedConfig = Readonly<CliNormalizedConfig>;
|
|
482
666
|
interface RetryConfig {
|
|
483
667
|
readonly maxRetries?: number;
|
|
484
668
|
readonly initialDelayMs?: number;
|
|
@@ -526,14 +710,27 @@ interface CodexResolvedConfig {
|
|
|
526
710
|
readonly timeoutMs?: number;
|
|
527
711
|
readonly logDir?: string;
|
|
528
712
|
readonly logFormat?: 'summary' | 'json';
|
|
713
|
+
readonly systemPrompt?: string;
|
|
714
|
+
}
|
|
715
|
+
interface PiCodingAgentResolvedConfig {
|
|
716
|
+
readonly executable: string;
|
|
717
|
+
readonly provider?: string;
|
|
718
|
+
readonly model?: string;
|
|
719
|
+
readonly apiKey?: string;
|
|
720
|
+
readonly tools?: string;
|
|
721
|
+
readonly thinking?: string;
|
|
722
|
+
readonly args?: readonly string[];
|
|
723
|
+
readonly cwd?: string;
|
|
724
|
+
readonly timeoutMs?: number;
|
|
725
|
+
readonly logDir?: string;
|
|
726
|
+
readonly logFormat?: 'summary' | 'json';
|
|
727
|
+
readonly systemPrompt?: string;
|
|
529
728
|
}
|
|
530
729
|
interface MockResolvedConfig {
|
|
531
730
|
readonly response?: string;
|
|
532
731
|
readonly delayMs?: number;
|
|
533
732
|
readonly delayMinMs?: number;
|
|
534
733
|
readonly delayMaxMs?: number;
|
|
535
|
-
/** Mock trace events for testing tool_trajectory evaluator */
|
|
536
|
-
readonly trace?: readonly TraceEvent[];
|
|
537
734
|
}
|
|
538
735
|
interface VSCodeResolvedConfig {
|
|
539
736
|
readonly command: string;
|
|
@@ -542,24 +739,6 @@ interface VSCodeResolvedConfig {
|
|
|
542
739
|
readonly subagentRoot?: string;
|
|
543
740
|
readonly workspaceTemplate?: string;
|
|
544
741
|
}
|
|
545
|
-
type CliHealthcheck = {
|
|
546
|
-
readonly type: 'http';
|
|
547
|
-
readonly url: string;
|
|
548
|
-
readonly timeoutMs?: number;
|
|
549
|
-
} | {
|
|
550
|
-
readonly type: 'command';
|
|
551
|
-
readonly commandTemplate: string;
|
|
552
|
-
readonly timeoutMs?: number;
|
|
553
|
-
readonly cwd?: string;
|
|
554
|
-
};
|
|
555
|
-
interface CliResolvedConfig {
|
|
556
|
-
readonly commandTemplate: string;
|
|
557
|
-
readonly filesFormat?: string;
|
|
558
|
-
readonly cwd?: string;
|
|
559
|
-
readonly timeoutMs?: number;
|
|
560
|
-
readonly healthcheck?: CliHealthcheck;
|
|
561
|
-
readonly verbose?: boolean;
|
|
562
|
-
}
|
|
563
742
|
type ResolvedTarget = {
|
|
564
743
|
readonly kind: 'azure';
|
|
565
744
|
readonly name: string;
|
|
@@ -588,6 +767,13 @@ type ResolvedTarget = {
|
|
|
588
767
|
readonly workers?: number;
|
|
589
768
|
readonly providerBatching?: boolean;
|
|
590
769
|
readonly config: CodexResolvedConfig;
|
|
770
|
+
} | {
|
|
771
|
+
readonly kind: 'pi-coding-agent';
|
|
772
|
+
readonly name: string;
|
|
773
|
+
readonly judgeTarget?: string;
|
|
774
|
+
readonly workers?: number;
|
|
775
|
+
readonly providerBatching?: boolean;
|
|
776
|
+
readonly config: PiCodingAgentResolvedConfig;
|
|
591
777
|
} | {
|
|
592
778
|
readonly kind: 'mock';
|
|
593
779
|
readonly name: string;
|
|
@@ -643,6 +829,16 @@ type CodexLogListener = (entry: CodexLogEntry) => void;
|
|
|
643
829
|
declare function consumeCodexLogEntries(): CodexLogEntry[];
|
|
644
830
|
declare function subscribeToCodexLogEntries(listener: CodexLogListener): () => void;
|
|
645
831
|
|
|
832
|
+
type PiLogEntry = {
|
|
833
|
+
readonly filePath: string;
|
|
834
|
+
readonly evalCaseId?: string;
|
|
835
|
+
readonly targetName: string;
|
|
836
|
+
readonly attempt?: number;
|
|
837
|
+
};
|
|
838
|
+
type PiLogListener = (entry: PiLogEntry) => void;
|
|
839
|
+
declare function consumePiLogEntries(): PiLogEntry[];
|
|
840
|
+
declare function subscribeToPiLogEntries(listener: PiLogListener): () => void;
|
|
841
|
+
|
|
646
842
|
declare function createProvider(target: ResolvedTarget): Provider;
|
|
647
843
|
declare function resolveAndCreateProvider(definition: TargetDefinition, env?: EnvLookup): Provider;
|
|
648
844
|
|
|
@@ -662,12 +858,10 @@ interface EvaluationContext {
|
|
|
662
858
|
readonly judgeProvider?: Provider;
|
|
663
859
|
readonly evaluatorTemplateOverride?: string;
|
|
664
860
|
readonly evaluator?: EvaluatorConfig;
|
|
665
|
-
/**
|
|
666
|
-
readonly
|
|
667
|
-
/** File path to trace data (alternative to inline candidateTrace) */
|
|
668
|
-
readonly candidateTraceRef?: string;
|
|
861
|
+
/** Output messages from agent execution (primary source for tool trajectory) */
|
|
862
|
+
readonly outputMessages?: readonly OutputMessage[];
|
|
669
863
|
/** Lightweight summary of trace events (if available) */
|
|
670
|
-
readonly
|
|
864
|
+
readonly traceSummary?: TraceSummary;
|
|
671
865
|
}
|
|
672
866
|
interface EvaluationScore {
|
|
673
867
|
readonly score: number;
|
|
@@ -737,6 +931,14 @@ declare class ToolTrajectoryEvaluator implements Evaluator {
|
|
|
737
931
|
private readonly config;
|
|
738
932
|
constructor(options: ToolTrajectoryEvaluatorOptions);
|
|
739
933
|
evaluate(context: EvaluationContext): EvaluationScore;
|
|
934
|
+
/**
|
|
935
|
+
* Extract tool calls from output messages.
|
|
936
|
+
*/
|
|
937
|
+
private extractToolCallsFromMessages;
|
|
938
|
+
/**
|
|
939
|
+
* Build a summary from extracted tool calls.
|
|
940
|
+
*/
|
|
941
|
+
private buildSummary;
|
|
740
942
|
private evaluateAnyOrder;
|
|
741
943
|
private evaluateInOrder;
|
|
742
944
|
private evaluateExact;
|
|
@@ -831,4 +1033,4 @@ type AgentKernel = {
|
|
|
831
1033
|
};
|
|
832
1034
|
declare function createAgentKernel(): AgentKernel;
|
|
833
1035
|
|
|
834
|
-
export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type
|
|
1036
|
+
export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, DEFAULT_EXPLORATION_TOOLS, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, type ExecutionMetrics, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ProviderTokenUsage, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceSummary, type UserTestMessage, type VSCodeResolvedConfig, avgToolDurationMs, buildDirectoryChain, buildPromptInputs, buildSearchRoots, computeTraceSummary, consumeCodexLogEntries, consumePiLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, explorationRatio, extractCodeBlocks, fileExists, findGitRoot, generateRubrics, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, mergeExecutionMetrics, normalizeLineEndings, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToCodexLogEntries, subscribeToPiLogEntries, tokensPerTool };
|