@agentv/core 2.7.1-next.6 → 2.9.0-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -187,8 +187,6 @@ interface TargetDefinition {
187
187
  readonly subagentRoot?: string | unknown | undefined;
188
188
  readonly workspace_template?: string | unknown | undefined;
189
189
  readonly workspaceTemplate?: string | unknown | undefined;
190
- readonly command_template?: string | unknown | undefined;
191
- readonly commandTemplate?: string | unknown | undefined;
192
190
  readonly files_format?: string | unknown | undefined;
193
191
  readonly filesFormat?: string | unknown | undefined;
194
192
  readonly attachments_format?: string | unknown | undefined;
@@ -241,20 +239,22 @@ interface TraceSummary {
241
239
  readonly toolCallsByName: Readonly<Record<string, number>>;
242
240
  /** Number of error events */
243
241
  readonly errorCount: number;
244
- /** Token usage metrics (optional, from provider) */
242
+ /** Per-tool duration arrays in milliseconds (optional) */
243
+ readonly toolDurations?: Readonly<Record<string, readonly number[]>>;
244
+ /** Number of LLM calls (assistant messages) */
245
+ readonly llmCallCount?: number;
246
+ }
247
+ /**
248
+ * Combined result of trace computation + execution metrics merge.
249
+ * Returned by computeTraceSummaryWithMetrics().
250
+ */
251
+ interface TraceComputeResult {
252
+ readonly trace: TraceSummary;
245
253
  readonly tokenUsage?: TokenUsage;
246
- /** Total cost in USD (optional, from provider) */
247
254
  readonly costUsd?: number;
248
- /** Total execution duration in milliseconds (optional) */
249
255
  readonly durationMs?: number;
250
- /** Per-tool duration arrays in milliseconds (optional) */
251
- readonly toolDurations?: Readonly<Record<string, readonly number[]>>;
252
- /** ISO 8601 timestamp when execution started (derived from earliest span) */
253
256
  readonly startTime?: string;
254
- /** ISO 8601 timestamp when execution ended (derived from latest span) */
255
257
  readonly endTime?: string;
256
- /** Number of LLM calls (assistant messages) */
257
- readonly llmCallCount?: number;
258
258
  }
259
259
  /**
260
260
  * Argument matching mode for tool_trajectory expected items.
@@ -321,7 +321,7 @@ interface MessageLike {
321
321
  * - toolDurations: per-tool duration arrays (from durationMs or computed from start/end)
322
322
  * - llmCallCount: count of assistant messages
323
323
  */
324
- declare function computeTraceSummary(messages: readonly MessageLike[]): TraceSummary;
324
+ declare function computeTraceSummary(messages: readonly MessageLike[]): TraceComputeResult;
325
325
  /**
326
326
  * Default tool names considered as exploration/read-only operations.
327
327
  * Can be overridden per-evaluation via config.
@@ -343,7 +343,7 @@ declare function explorationRatio(summary: TraceSummary, explorationTools?: read
343
343
  * @param summary - Trace summary with optional token usage
344
344
  * @returns Average tokens per tool call, or undefined
345
345
  */
346
- declare function tokensPerTool(summary: TraceSummary): number | undefined;
346
+ declare function tokensPerTool(summary: TraceSummary, tokenUsage?: TokenUsage): number | undefined;
347
347
  /**
348
348
  * Average tool duration across all tool calls.
349
349
  * Returns undefined if toolDurations is not available or empty.
@@ -365,15 +365,15 @@ interface ExecutionMetrics {
365
365
  readonly endTime?: string;
366
366
  }
367
367
  /**
368
- * Merge execution metrics from provider response into a trace summary.
369
- * Returns a new TraceSummary with metrics fields populated.
368
+ * Merge execution metrics from provider response into a trace compute result.
369
+ * Returns a new TraceComputeResult with metrics fields populated.
370
370
  * Provider-level timing takes precedence over span-derived timing.
371
371
  *
372
- * @param summary - Base trace summary from computeTraceSummary
372
+ * @param computed - Base trace compute result from computeTraceSummary
373
373
  * @param metrics - Optional execution metrics from provider
374
- * @returns TraceSummary with merged metrics
374
+ * @returns TraceComputeResult with merged metrics
375
375
  */
376
- declare function mergeExecutionMetrics(summary: TraceSummary, metrics?: ExecutionMetrics): TraceSummary;
376
+ declare function mergeExecutionMetrics(computed: TraceComputeResult, metrics?: ExecutionMetrics): TraceComputeResult;
377
377
 
378
378
  /**
379
379
  * JSON primitive values appearing in AgentV payloads.
@@ -924,6 +924,16 @@ interface EvaluationResult {
924
924
  readonly answer: string;
925
925
  readonly target: string;
926
926
  readonly reasoning?: string;
927
+ /** Token usage metrics from provider (optional) */
928
+ readonly tokenUsage?: TokenUsage;
929
+ /** Total cost in USD (optional, from provider) */
930
+ readonly costUsd?: number;
931
+ /** Total execution duration in milliseconds (optional) */
932
+ readonly durationMs?: number;
933
+ /** ISO 8601 timestamp when execution started */
934
+ readonly startTime?: string;
935
+ /** ISO 8601 timestamp when execution ended */
936
+ readonly endTime?: string;
927
937
  readonly requests?: {
928
938
  readonly agent?: JsonObject;
929
939
  readonly lm?: JsonObject;
@@ -955,6 +965,8 @@ interface EvaluationResult {
955
965
  readonly aggregation?: TrialAggregation;
956
966
  /** Whether the trial loop was terminated early due to cost limit */
957
967
  readonly costLimited?: boolean;
968
+ /** Whether the evaluation was skipped due to suite-level budget exhaustion */
969
+ readonly budgetExceeded?: boolean;
958
970
  }
959
971
  type EvaluationVerdict = 'pass' | 'fail' | 'borderline';
960
972
  interface EvaluatorResult {
@@ -971,6 +983,8 @@ interface EvaluatorResult {
971
983
  readonly scores?: readonly EvaluatorResult[];
972
984
  /** Optional structured details from code judges (e.g., TP/TN/FP/FN counts). */
973
985
  readonly details?: JsonObject;
986
+ /** Token usage from LLM calls made by this evaluator (optional). */
987
+ readonly tokenUsage?: TokenUsage;
974
988
  }
975
989
  /**
976
990
  * Convenience accessor matching the Python hit_count property.
@@ -1116,6 +1130,8 @@ type EvalSuiteResult = {
1116
1130
  readonly cacheConfig?: CacheConfig;
1117
1131
  /** Suite-level metadata (name, description, version, etc.) */
1118
1132
  readonly metadata?: EvalMetadata;
1133
+ /** Suite-level total cost budget in USD */
1134
+ readonly totalBudgetUsd?: number;
1119
1135
  };
1120
1136
  /**
1121
1137
  * Load tests and suite metadata from a single parse.
@@ -1637,6 +1653,16 @@ interface EvaluationContext {
1637
1653
  readonly output?: readonly Message[];
1638
1654
  /** Lightweight summary of trace events (if available) */
1639
1655
  readonly trace?: TraceSummary;
1656
+ /** Token usage from provider execution (promoted from TraceSummary) */
1657
+ readonly tokenUsage?: TokenUsage;
1658
+ /** Total cost in USD (from provider) */
1659
+ readonly costUsd?: number;
1660
+ /** Execution duration in milliseconds */
1661
+ readonly durationMs?: number;
1662
+ /** ISO 8601 timestamp when execution started */
1663
+ readonly startTime?: string;
1664
+ /** ISO 8601 timestamp when execution ended */
1665
+ readonly endTime?: string;
1640
1666
  /** Resolver for target override in code judges */
1641
1667
  readonly targetResolver?: TargetResolver;
1642
1668
  /** List of available target names for code judges */
@@ -1657,6 +1683,8 @@ interface EvaluationScore {
1657
1683
  readonly scores?: readonly ChildEvaluatorResult[];
1658
1684
  /** Optional structured details from code judges (e.g., TP/TN/FP/FN counts, alignments). */
1659
1685
  readonly details?: JsonObject;
1686
+ /** Token usage from LLM calls made by this evaluator (optional). */
1687
+ readonly tokenUsage?: TokenUsage;
1660
1688
  }
1661
1689
  interface ChildEvaluatorResult {
1662
1690
  readonly name: string;
@@ -1671,6 +1699,8 @@ interface ChildEvaluatorResult {
1671
1699
  readonly scores?: readonly ChildEvaluatorResult[];
1672
1700
  /** Optional structured details from code judges (e.g., TP/TN/FP/FN counts, alignments). */
1673
1701
  readonly details?: JsonObject;
1702
+ /** Token usage from LLM calls made by this evaluator (optional). */
1703
+ readonly tokenUsage?: TokenUsage;
1674
1704
  }
1675
1705
  interface Evaluator {
1676
1706
  readonly kind: string;
@@ -1740,7 +1770,7 @@ interface CostEvaluatorOptions {
1740
1770
  }
1741
1771
  /**
1742
1772
  * Evaluator that checks execution cost against a budget.
1743
- * Uses trace.costUsd from the evaluation context.
1773
+ * Uses costUsd from the evaluation context.
1744
1774
  */
1745
1775
  declare class CostEvaluator implements Evaluator {
1746
1776
  readonly kind = "cost";
@@ -1812,7 +1842,7 @@ interface LatencyEvaluatorOptions {
1812
1842
  }
1813
1843
  /**
1814
1844
  * Evaluator that checks execution duration against a threshold.
1815
- * Uses trace.durationMs from the evaluation context.
1845
+ * Uses durationMs from the evaluation context.
1816
1846
  */
1817
1847
  declare class LatencyEvaluator implements Evaluator {
1818
1848
  readonly kind = "latency";
@@ -1987,7 +2017,7 @@ interface TokenUsageEvaluatorOptions {
1987
2017
  }
1988
2018
  /**
1989
2019
  * Evaluator that checks provider-reported token usage against configured limits.
1990
- * Uses trace.tokenUsage from the evaluation context.
2020
+ * Uses tokenUsage from the evaluation context.
1991
2021
  */
1992
2022
  declare class TokenUsageEvaluator implements Evaluator {
1993
2023
  readonly kind = "token_usage";
@@ -2196,6 +2226,8 @@ interface RunEvaluationOptions {
2196
2226
  readonly trials?: TrialsConfig;
2197
2227
  /** Real-time observability callbacks passed to the provider */
2198
2228
  readonly streamCallbacks?: ProviderStreamCallbacks;
2229
+ /** Suite-level total cost budget in USD (stops dispatching when exceeded) */
2230
+ readonly totalBudgetUsd?: number;
2199
2231
  }
2200
2232
  declare function runEvaluation(options: RunEvaluationOptions): Promise<readonly EvaluationResult[]>;
2201
2233
  declare function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationResult>;
@@ -2886,4 +2918,4 @@ type AgentKernel = {
2886
2918
  };
2887
2919
  declare function createAgentKernel(): AgentKernel;
2888
2920
 
2889
- export { AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, getWorkspacePath, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIsJsonAssertion, runRegexAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
2921
+ export { AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, getWorkspacePath, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIsJsonAssertion, runRegexAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
package/dist/index.d.ts CHANGED
@@ -187,8 +187,6 @@ interface TargetDefinition {
187
187
  readonly subagentRoot?: string | unknown | undefined;
188
188
  readonly workspace_template?: string | unknown | undefined;
189
189
  readonly workspaceTemplate?: string | unknown | undefined;
190
- readonly command_template?: string | unknown | undefined;
191
- readonly commandTemplate?: string | unknown | undefined;
192
190
  readonly files_format?: string | unknown | undefined;
193
191
  readonly filesFormat?: string | unknown | undefined;
194
192
  readonly attachments_format?: string | unknown | undefined;
@@ -241,20 +239,22 @@ interface TraceSummary {
241
239
  readonly toolCallsByName: Readonly<Record<string, number>>;
242
240
  /** Number of error events */
243
241
  readonly errorCount: number;
244
- /** Token usage metrics (optional, from provider) */
242
+ /** Per-tool duration arrays in milliseconds (optional) */
243
+ readonly toolDurations?: Readonly<Record<string, readonly number[]>>;
244
+ /** Number of LLM calls (assistant messages) */
245
+ readonly llmCallCount?: number;
246
+ }
247
+ /**
248
+ * Combined result of trace computation + execution metrics merge.
249
+ * Returned by computeTraceSummaryWithMetrics().
250
+ */
251
+ interface TraceComputeResult {
252
+ readonly trace: TraceSummary;
245
253
  readonly tokenUsage?: TokenUsage;
246
- /** Total cost in USD (optional, from provider) */
247
254
  readonly costUsd?: number;
248
- /** Total execution duration in milliseconds (optional) */
249
255
  readonly durationMs?: number;
250
- /** Per-tool duration arrays in milliseconds (optional) */
251
- readonly toolDurations?: Readonly<Record<string, readonly number[]>>;
252
- /** ISO 8601 timestamp when execution started (derived from earliest span) */
253
256
  readonly startTime?: string;
254
- /** ISO 8601 timestamp when execution ended (derived from latest span) */
255
257
  readonly endTime?: string;
256
- /** Number of LLM calls (assistant messages) */
257
- readonly llmCallCount?: number;
258
258
  }
259
259
  /**
260
260
  * Argument matching mode for tool_trajectory expected items.
@@ -321,7 +321,7 @@ interface MessageLike {
321
321
  * - toolDurations: per-tool duration arrays (from durationMs or computed from start/end)
322
322
  * - llmCallCount: count of assistant messages
323
323
  */
324
- declare function computeTraceSummary(messages: readonly MessageLike[]): TraceSummary;
324
+ declare function computeTraceSummary(messages: readonly MessageLike[]): TraceComputeResult;
325
325
  /**
326
326
  * Default tool names considered as exploration/read-only operations.
327
327
  * Can be overridden per-evaluation via config.
@@ -343,7 +343,7 @@ declare function explorationRatio(summary: TraceSummary, explorationTools?: read
343
343
  * @param summary - Trace summary with optional token usage
344
344
  * @returns Average tokens per tool call, or undefined
345
345
  */
346
- declare function tokensPerTool(summary: TraceSummary): number | undefined;
346
+ declare function tokensPerTool(summary: TraceSummary, tokenUsage?: TokenUsage): number | undefined;
347
347
  /**
348
348
  * Average tool duration across all tool calls.
349
349
  * Returns undefined if toolDurations is not available or empty.
@@ -365,15 +365,15 @@ interface ExecutionMetrics {
365
365
  readonly endTime?: string;
366
366
  }
367
367
  /**
368
- * Merge execution metrics from provider response into a trace summary.
369
- * Returns a new TraceSummary with metrics fields populated.
368
+ * Merge execution metrics from provider response into a trace compute result.
369
+ * Returns a new TraceComputeResult with metrics fields populated.
370
370
  * Provider-level timing takes precedence over span-derived timing.
371
371
  *
372
- * @param summary - Base trace summary from computeTraceSummary
372
+ * @param computed - Base trace compute result from computeTraceSummary
373
373
  * @param metrics - Optional execution metrics from provider
374
- * @returns TraceSummary with merged metrics
374
+ * @returns TraceComputeResult with merged metrics
375
375
  */
376
- declare function mergeExecutionMetrics(summary: TraceSummary, metrics?: ExecutionMetrics): TraceSummary;
376
+ declare function mergeExecutionMetrics(computed: TraceComputeResult, metrics?: ExecutionMetrics): TraceComputeResult;
377
377
 
378
378
  /**
379
379
  * JSON primitive values appearing in AgentV payloads.
@@ -924,6 +924,16 @@ interface EvaluationResult {
924
924
  readonly answer: string;
925
925
  readonly target: string;
926
926
  readonly reasoning?: string;
927
+ /** Token usage metrics from provider (optional) */
928
+ readonly tokenUsage?: TokenUsage;
929
+ /** Total cost in USD (optional, from provider) */
930
+ readonly costUsd?: number;
931
+ /** Total execution duration in milliseconds (optional) */
932
+ readonly durationMs?: number;
933
+ /** ISO 8601 timestamp when execution started */
934
+ readonly startTime?: string;
935
+ /** ISO 8601 timestamp when execution ended */
936
+ readonly endTime?: string;
927
937
  readonly requests?: {
928
938
  readonly agent?: JsonObject;
929
939
  readonly lm?: JsonObject;
@@ -955,6 +965,8 @@ interface EvaluationResult {
955
965
  readonly aggregation?: TrialAggregation;
956
966
  /** Whether the trial loop was terminated early due to cost limit */
957
967
  readonly costLimited?: boolean;
968
+ /** Whether the evaluation was skipped due to suite-level budget exhaustion */
969
+ readonly budgetExceeded?: boolean;
958
970
  }
959
971
  type EvaluationVerdict = 'pass' | 'fail' | 'borderline';
960
972
  interface EvaluatorResult {
@@ -971,6 +983,8 @@ interface EvaluatorResult {
971
983
  readonly scores?: readonly EvaluatorResult[];
972
984
  /** Optional structured details from code judges (e.g., TP/TN/FP/FN counts). */
973
985
  readonly details?: JsonObject;
986
+ /** Token usage from LLM calls made by this evaluator (optional). */
987
+ readonly tokenUsage?: TokenUsage;
974
988
  }
975
989
  /**
976
990
  * Convenience accessor matching the Python hit_count property.
@@ -1116,6 +1130,8 @@ type EvalSuiteResult = {
1116
1130
  readonly cacheConfig?: CacheConfig;
1117
1131
  /** Suite-level metadata (name, description, version, etc.) */
1118
1132
  readonly metadata?: EvalMetadata;
1133
+ /** Suite-level total cost budget in USD */
1134
+ readonly totalBudgetUsd?: number;
1119
1135
  };
1120
1136
  /**
1121
1137
  * Load tests and suite metadata from a single parse.
@@ -1637,6 +1653,16 @@ interface EvaluationContext {
1637
1653
  readonly output?: readonly Message[];
1638
1654
  /** Lightweight summary of trace events (if available) */
1639
1655
  readonly trace?: TraceSummary;
1656
+ /** Token usage from provider execution (promoted from TraceSummary) */
1657
+ readonly tokenUsage?: TokenUsage;
1658
+ /** Total cost in USD (from provider) */
1659
+ readonly costUsd?: number;
1660
+ /** Execution duration in milliseconds */
1661
+ readonly durationMs?: number;
1662
+ /** ISO 8601 timestamp when execution started */
1663
+ readonly startTime?: string;
1664
+ /** ISO 8601 timestamp when execution ended */
1665
+ readonly endTime?: string;
1640
1666
  /** Resolver for target override in code judges */
1641
1667
  readonly targetResolver?: TargetResolver;
1642
1668
  /** List of available target names for code judges */
@@ -1657,6 +1683,8 @@ interface EvaluationScore {
1657
1683
  readonly scores?: readonly ChildEvaluatorResult[];
1658
1684
  /** Optional structured details from code judges (e.g., TP/TN/FP/FN counts, alignments). */
1659
1685
  readonly details?: JsonObject;
1686
+ /** Token usage from LLM calls made by this evaluator (optional). */
1687
+ readonly tokenUsage?: TokenUsage;
1660
1688
  }
1661
1689
  interface ChildEvaluatorResult {
1662
1690
  readonly name: string;
@@ -1671,6 +1699,8 @@ interface ChildEvaluatorResult {
1671
1699
  readonly scores?: readonly ChildEvaluatorResult[];
1672
1700
  /** Optional structured details from code judges (e.g., TP/TN/FP/FN counts, alignments). */
1673
1701
  readonly details?: JsonObject;
1702
+ /** Token usage from LLM calls made by this evaluator (optional). */
1703
+ readonly tokenUsage?: TokenUsage;
1674
1704
  }
1675
1705
  interface Evaluator {
1676
1706
  readonly kind: string;
@@ -1740,7 +1770,7 @@ interface CostEvaluatorOptions {
1740
1770
  }
1741
1771
  /**
1742
1772
  * Evaluator that checks execution cost against a budget.
1743
- * Uses trace.costUsd from the evaluation context.
1773
+ * Uses costUsd from the evaluation context.
1744
1774
  */
1745
1775
  declare class CostEvaluator implements Evaluator {
1746
1776
  readonly kind = "cost";
@@ -1812,7 +1842,7 @@ interface LatencyEvaluatorOptions {
1812
1842
  }
1813
1843
  /**
1814
1844
  * Evaluator that checks execution duration against a threshold.
1815
- * Uses trace.durationMs from the evaluation context.
1845
+ * Uses durationMs from the evaluation context.
1816
1846
  */
1817
1847
  declare class LatencyEvaluator implements Evaluator {
1818
1848
  readonly kind = "latency";
@@ -1987,7 +2017,7 @@ interface TokenUsageEvaluatorOptions {
1987
2017
  }
1988
2018
  /**
1989
2019
  * Evaluator that checks provider-reported token usage against configured limits.
1990
- * Uses trace.tokenUsage from the evaluation context.
2020
+ * Uses tokenUsage from the evaluation context.
1991
2021
  */
1992
2022
  declare class TokenUsageEvaluator implements Evaluator {
1993
2023
  readonly kind = "token_usage";
@@ -2196,6 +2226,8 @@ interface RunEvaluationOptions {
2196
2226
  readonly trials?: TrialsConfig;
2197
2227
  /** Real-time observability callbacks passed to the provider */
2198
2228
  readonly streamCallbacks?: ProviderStreamCallbacks;
2229
+ /** Suite-level total cost budget in USD (stops dispatching when exceeded) */
2230
+ readonly totalBudgetUsd?: number;
2199
2231
  }
2200
2232
  declare function runEvaluation(options: RunEvaluationOptions): Promise<readonly EvaluationResult[]>;
2201
2233
  declare function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationResult>;
@@ -2886,4 +2918,4 @@ type AgentKernel = {
2886
2918
  };
2887
2919
  declare function createAgentKernel(): AgentKernel;
2888
2920
 
2889
- export { AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, getWorkspacePath, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIsJsonAssertion, runRegexAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
2921
+ export { AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, getWorkspacePath, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIsJsonAssertion, runRegexAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };