@agentv/core 1.4.0 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -1,9 +1,21 @@
1
+ import { z } from 'zod';
1
2
  import * as ai from 'ai';
2
3
 
3
4
  /**
4
5
  * Trace event types for capturing agent execution traces.
5
6
  * Provides a normalized, provider-agnostic model for tool-call trajectories.
6
7
  */
8
+ /**
9
+ * Token usage metrics from provider execution.
10
+ */
11
+ interface TokenUsage {
12
+ /** Input/prompt tokens consumed */
13
+ readonly input: number;
14
+ /** Output/completion tokens generated */
15
+ readonly output: number;
16
+ /** Cached tokens (optional, provider-specific) */
17
+ readonly cached?: number;
18
+ }
7
19
  /**
8
20
  * Compact summary of a trace for lightweight persistence.
9
21
  * Included in results by default to avoid payload bloat.
@@ -17,6 +29,14 @@ interface TraceSummary {
17
29
  readonly toolCallsByName: Readonly<Record<string, number>>;
18
30
  /** Number of error events */
19
31
  readonly errorCount: number;
32
+ /** Token usage metrics (optional, from provider) */
33
+ readonly tokenUsage?: TokenUsage;
34
+ /** Total cost in USD (optional, from provider) */
35
+ readonly costUsd?: number;
36
+ /** Total execution duration in milliseconds (optional) */
37
+ readonly durationMs?: number;
38
+ /** Per-tool duration arrays in milliseconds (optional) */
39
+ readonly toolDurations?: Readonly<Record<string, readonly number[]>>;
20
40
  }
21
41
  /**
22
42
  * Configuration for tool_trajectory evaluator.
@@ -38,6 +58,8 @@ interface ToolTrajectoryEvaluatorConfig {
38
58
  */
39
59
  interface ToolTrajectoryExpectedItem {
40
60
  readonly tool: string;
61
+ /** Optional argument matching: 'any' skips validation, object performs partial deep equality */
62
+ readonly args?: 'any' | Record<string, unknown>;
41
63
  }
42
64
  /**
43
65
  * Simplified input type for computeTraceSummary.
@@ -53,6 +75,53 @@ interface OutputMessageLike {
53
75
  * Used for default result persistence without payload bloat.
54
76
  */
55
77
  declare function computeTraceSummary(messages: readonly OutputMessageLike[]): TraceSummary;
78
+ /**
79
+ * Default tool names considered as exploration/read-only operations.
80
+ * Can be overridden per-evaluation via config.
81
+ */
82
+ declare const DEFAULT_EXPLORATION_TOOLS: readonly ["read", "grep", "glob", "search", "list", "Read", "Grep", "Glob", "WebSearch", "WebFetch"];
83
+ /**
84
+ * Ratio of exploration tool calls to total tool calls.
85
+ * Returns undefined if there are no tool calls.
86
+ *
87
+ * @param summary - Trace summary with tool call counts
88
+ * @param explorationTools - Tool names considered exploration (defaults to DEFAULT_EXPLORATION_TOOLS)
89
+ * @returns Ratio between 0 and 1, or undefined if no tool calls
90
+ */
91
+ declare function explorationRatio(summary: TraceSummary, explorationTools?: readonly string[]): number | undefined;
92
+ /**
93
+ * Average tokens consumed per tool call.
94
+ * Returns undefined if tokenUsage is not available or no tool calls.
95
+ *
96
+ * @param summary - Trace summary with optional token usage
97
+ * @returns Average tokens per tool call, or undefined
98
+ */
99
+ declare function tokensPerTool(summary: TraceSummary): number | undefined;
100
+ /**
101
+ * Average tool duration across all tool calls.
102
+ * Returns undefined if toolDurations is not available or empty.
103
+ *
104
+ * @param summary - Trace summary with optional tool durations
105
+ * @returns Average duration in milliseconds, or undefined
106
+ */
107
+ declare function avgToolDurationMs(summary: TraceSummary): number | undefined;
108
+ /**
109
+ * Execution metrics from provider response.
110
+ */
111
+ interface ExecutionMetrics {
112
+ readonly tokenUsage?: TokenUsage;
113
+ readonly costUsd?: number;
114
+ readonly durationMs?: number;
115
+ }
116
+ /**
117
+ * Merge execution metrics from provider response into a trace summary.
118
+ * Returns a new TraceSummary with metrics fields populated.
119
+ *
120
+ * @param summary - Base trace summary from computeTraceSummary
121
+ * @param metrics - Optional execution metrics from provider
122
+ * @returns TraceSummary with merged metrics
123
+ */
124
+ declare function mergeExecutionMetrics(summary: TraceSummary, metrics?: ExecutionMetrics): TraceSummary;
56
125
 
57
126
  /**
58
127
  * JSON primitive values appearing in AgentV payloads.
@@ -204,23 +273,23 @@ interface EvalCase {
204
273
  */
205
274
  interface EvaluationResult {
206
275
  readonly timestamp: string;
207
- readonly eval_id: string;
276
+ readonly evalId: string;
208
277
  readonly dataset?: string;
209
- readonly conversation_id?: string;
278
+ readonly conversationId?: string;
210
279
  readonly score: number;
211
280
  readonly hits: readonly string[];
212
281
  readonly misses: readonly string[];
213
- readonly candidate_answer: string;
282
+ readonly candidateAnswer: string;
214
283
  readonly target: string;
215
284
  readonly reasoning?: string;
216
- readonly raw_aspects?: readonly string[];
217
- readonly agent_provider_request?: JsonObject;
218
- readonly lm_provider_request?: JsonObject;
219
- readonly evaluator_provider_request?: JsonObject;
220
- readonly evaluator_results?: readonly EvaluatorResult[];
285
+ readonly rawAspects?: readonly string[];
286
+ readonly agentProviderRequest?: JsonObject;
287
+ readonly lmProviderRequest?: JsonObject;
288
+ readonly evaluatorProviderRequest?: JsonObject;
289
+ readonly evaluatorResults?: readonly EvaluatorResult[];
221
290
  readonly error?: string;
222
291
  /** Lightweight summary of the execution trace (always included when available) */
223
- readonly trace_summary?: TraceSummary;
292
+ readonly traceSummary?: TraceSummary;
224
293
  }
225
294
  type EvaluationVerdict = 'pass' | 'fail' | 'borderline';
226
295
  interface EvaluatorResult {
@@ -232,9 +301,9 @@ interface EvaluatorResult {
232
301
  readonly hits: readonly string[];
233
302
  readonly misses: readonly string[];
234
303
  readonly reasoning?: string;
235
- readonly raw_request?: JsonObject;
236
- readonly evaluator_provider_request?: JsonObject;
237
- readonly evaluator_results?: readonly EvaluatorResult[];
304
+ readonly rawRequest?: JsonObject;
305
+ readonly evaluatorProviderRequest?: JsonObject;
306
+ readonly evaluatorResults?: readonly EvaluatorResult[];
238
307
  }
239
308
  /**
240
309
  * Convenience accessor matching the Python hit_count property.
@@ -248,7 +317,7 @@ interface ChatMessage {
248
317
  readonly name?: string;
249
318
  }
250
319
  type ChatPrompt = readonly ChatMessage[];
251
- type ProviderKind = 'azure' | 'anthropic' | 'gemini' | 'codex' | 'cli' | 'mock' | 'vscode' | 'vscode-insiders';
320
+ type ProviderKind = 'azure' | 'anthropic' | 'gemini' | 'codex' | 'pi-coding-agent' | 'cli' | 'mock' | 'vscode' | 'vscode-insiders';
252
321
  interface ProviderRequest {
253
322
  readonly question: string;
254
323
  readonly systemPrompt?: string;
@@ -297,11 +366,28 @@ interface OutputMessage {
297
366
  /** Provider-specific metadata */
298
367
  readonly metadata?: Record<string, unknown>;
299
368
  }
369
+ /**
370
+ * Token usage metrics reported by provider.
371
+ */
372
+ interface ProviderTokenUsage {
373
+ /** Input/prompt tokens consumed */
374
+ readonly input: number;
375
+ /** Output/completion tokens generated */
376
+ readonly output: number;
377
+ /** Cached tokens (optional, provider-specific) */
378
+ readonly cached?: number;
379
+ }
300
380
  interface ProviderResponse {
301
381
  readonly raw?: unknown;
302
382
  readonly usage?: JsonObject;
303
383
  /** Output messages from agent execution (primary source for tool trajectory) */
304
384
  readonly outputMessages?: readonly OutputMessage[];
385
+ /** Token usage metrics (optional) */
386
+ readonly tokenUsage?: ProviderTokenUsage;
387
+ /** Total cost in USD (optional) */
388
+ readonly costUsd?: number;
389
+ /** Execution duration in milliseconds (optional) */
390
+ readonly durationMs?: number;
305
391
  }
306
392
  interface Provider {
307
393
  readonly id: string;
@@ -482,6 +568,101 @@ declare function resolveFileReference(rawValue: string, searchRoots: readonly st
482
568
  readonly attempted: readonly string[];
483
569
  }>;
484
570
 
571
+ /**
572
+ * Strict normalized schema for CLI target configuration.
573
+ * This is the final validated shape after environment variable resolution
574
+ * and snake_case to camelCase normalization.
575
+ *
576
+ * Uses .strict() to reject unknown properties, ensuring configuration
577
+ * errors are caught early rather than silently ignored.
578
+ *
579
+ * @example
580
+ * ```typescript
581
+ * const config: CliNormalizedConfig = {
582
+ * commandTemplate: 'agent run {PROMPT}',
583
+ * timeoutMs: 120000,
584
+ * verbose: true,
585
+ * };
586
+ * CliTargetConfigSchema.parse(config); // Validates the normalized config
587
+ * ```
588
+ */
589
+ declare const CliTargetConfigSchema: z.ZodObject<{
590
+ commandTemplate: z.ZodString;
591
+ filesFormat: z.ZodOptional<z.ZodString>;
592
+ cwd: z.ZodOptional<z.ZodString>;
593
+ timeoutMs: z.ZodOptional<z.ZodNumber>;
594
+ healthcheck: z.ZodOptional<z.ZodDiscriminatedUnion<"type", [z.ZodObject<{
595
+ type: z.ZodLiteral<"http">;
596
+ url: z.ZodString;
597
+ timeoutMs: z.ZodOptional<z.ZodNumber>;
598
+ }, "strict", z.ZodTypeAny, {
599
+ type: "http";
600
+ url: string;
601
+ timeoutMs?: number | undefined;
602
+ }, {
603
+ type: "http";
604
+ url: string;
605
+ timeoutMs?: number | undefined;
606
+ }>, z.ZodObject<{
607
+ type: z.ZodLiteral<"command">;
608
+ commandTemplate: z.ZodString;
609
+ cwd: z.ZodOptional<z.ZodString>;
610
+ timeoutMs: z.ZodOptional<z.ZodNumber>;
611
+ }, "strict", z.ZodTypeAny, {
612
+ type: "command";
613
+ commandTemplate: string;
614
+ cwd?: string | undefined;
615
+ timeoutMs?: number | undefined;
616
+ }, {
617
+ type: "command";
618
+ commandTemplate: string;
619
+ cwd?: string | undefined;
620
+ timeoutMs?: number | undefined;
621
+ }>]>>;
622
+ verbose: z.ZodOptional<z.ZodBoolean>;
623
+ keepTempFiles: z.ZodOptional<z.ZodBoolean>;
624
+ }, "strict", z.ZodTypeAny, {
625
+ commandTemplate: string;
626
+ cwd?: string | undefined;
627
+ verbose?: boolean | undefined;
628
+ filesFormat?: string | undefined;
629
+ healthcheck?: {
630
+ type: "http";
631
+ url: string;
632
+ timeoutMs?: number | undefined;
633
+ } | {
634
+ type: "command";
635
+ commandTemplate: string;
636
+ cwd?: string | undefined;
637
+ timeoutMs?: number | undefined;
638
+ } | undefined;
639
+ keepTempFiles?: boolean | undefined;
640
+ timeoutMs?: number | undefined;
641
+ }, {
642
+ commandTemplate: string;
643
+ cwd?: string | undefined;
644
+ verbose?: boolean | undefined;
645
+ filesFormat?: string | undefined;
646
+ healthcheck?: {
647
+ type: "http";
648
+ url: string;
649
+ timeoutMs?: number | undefined;
650
+ } | {
651
+ type: "command";
652
+ commandTemplate: string;
653
+ cwd?: string | undefined;
654
+ timeoutMs?: number | undefined;
655
+ } | undefined;
656
+ keepTempFiles?: boolean | undefined;
657
+ timeoutMs?: number | undefined;
658
+ }>;
659
+ type CliNormalizedConfig = z.infer<typeof CliTargetConfigSchema>;
660
+ /**
661
+ * Resolved CLI configuration type derived from CliTargetConfigSchema.
662
+ * This is the final validated shape used by the CLI provider at runtime.
663
+ * Using Readonly to ensure immutability for runtime safety.
664
+ */
665
+ type CliResolvedConfig = Readonly<CliNormalizedConfig>;
485
666
  interface RetryConfig {
486
667
  readonly maxRetries?: number;
487
668
  readonly initialDelayMs?: number;
@@ -529,6 +710,21 @@ interface CodexResolvedConfig {
529
710
  readonly timeoutMs?: number;
530
711
  readonly logDir?: string;
531
712
  readonly logFormat?: 'summary' | 'json';
713
+ readonly systemPrompt?: string;
714
+ }
715
+ interface PiCodingAgentResolvedConfig {
716
+ readonly executable: string;
717
+ readonly provider?: string;
718
+ readonly model?: string;
719
+ readonly apiKey?: string;
720
+ readonly tools?: string;
721
+ readonly thinking?: string;
722
+ readonly args?: readonly string[];
723
+ readonly cwd?: string;
724
+ readonly timeoutMs?: number;
725
+ readonly logDir?: string;
726
+ readonly logFormat?: 'summary' | 'json';
727
+ readonly systemPrompt?: string;
532
728
  }
533
729
  interface MockResolvedConfig {
534
730
  readonly response?: string;
@@ -543,25 +739,6 @@ interface VSCodeResolvedConfig {
543
739
  readonly subagentRoot?: string;
544
740
  readonly workspaceTemplate?: string;
545
741
  }
546
- type CliHealthcheck = {
547
- readonly type: 'http';
548
- readonly url: string;
549
- readonly timeoutMs?: number;
550
- } | {
551
- readonly type: 'command';
552
- readonly commandTemplate: string;
553
- readonly timeoutMs?: number;
554
- readonly cwd?: string;
555
- };
556
- interface CliResolvedConfig {
557
- readonly commandTemplate: string;
558
- readonly filesFormat?: string;
559
- readonly cwd?: string;
560
- readonly timeoutMs?: number;
561
- readonly healthcheck?: CliHealthcheck;
562
- readonly verbose?: boolean;
563
- readonly keepTempFiles?: boolean;
564
- }
565
742
  type ResolvedTarget = {
566
743
  readonly kind: 'azure';
567
744
  readonly name: string;
@@ -590,6 +767,13 @@ type ResolvedTarget = {
590
767
  readonly workers?: number;
591
768
  readonly providerBatching?: boolean;
592
769
  readonly config: CodexResolvedConfig;
770
+ } | {
771
+ readonly kind: 'pi-coding-agent';
772
+ readonly name: string;
773
+ readonly judgeTarget?: string;
774
+ readonly workers?: number;
775
+ readonly providerBatching?: boolean;
776
+ readonly config: PiCodingAgentResolvedConfig;
593
777
  } | {
594
778
  readonly kind: 'mock';
595
779
  readonly name: string;
@@ -645,6 +829,16 @@ type CodexLogListener = (entry: CodexLogEntry) => void;
645
829
  declare function consumeCodexLogEntries(): CodexLogEntry[];
646
830
  declare function subscribeToCodexLogEntries(listener: CodexLogListener): () => void;
647
831
 
832
+ type PiLogEntry = {
833
+ readonly filePath: string;
834
+ readonly evalCaseId?: string;
835
+ readonly targetName: string;
836
+ readonly attempt?: number;
837
+ };
838
+ type PiLogListener = (entry: PiLogEntry) => void;
839
+ declare function consumePiLogEntries(): PiLogEntry[];
840
+ declare function subscribeToPiLogEntries(listener: PiLogListener): () => void;
841
+
648
842
  declare function createProvider(target: ResolvedTarget): Provider;
649
843
  declare function resolveAndCreateProvider(definition: TargetDefinition, env?: EnvLookup): Provider;
650
844
 
@@ -839,4 +1033,4 @@ type AgentKernel = {
839
1033
  };
840
1034
  declare function createAgentKernel(): AgentKernel;
841
1035
 
842
- export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceSummary, type UserTestMessage, type VSCodeResolvedConfig, buildDirectoryChain, buildPromptInputs, buildSearchRoots, computeTraceSummary, consumeCodexLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, extractCodeBlocks, fileExists, findGitRoot, generateRubrics, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, normalizeLineEndings, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToCodexLogEntries };
1036
+ export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, DEFAULT_EXPLORATION_TOOLS, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, type ExecutionMetrics, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ProviderTokenUsage, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceSummary, type UserTestMessage, type VSCodeResolvedConfig, avgToolDurationMs, buildDirectoryChain, buildPromptInputs, buildSearchRoots, computeTraceSummary, consumeCodexLogEntries, consumePiLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, explorationRatio, extractCodeBlocks, fileExists, findGitRoot, generateRubrics, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, mergeExecutionMetrics, normalizeLineEndings, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToCodexLogEntries, subscribeToPiLogEntries, tokensPerTool };