@agentv/core 2.8.0-next.1 → 2.9.0-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -239,20 +239,22 @@ interface TraceSummary {
239
239
  readonly toolCallsByName: Readonly<Record<string, number>>;
240
240
  /** Number of error events */
241
241
  readonly errorCount: number;
242
- /** Token usage metrics (optional, from provider) */
242
+ /** Per-tool duration arrays in milliseconds (optional) */
243
+ readonly toolDurations?: Readonly<Record<string, readonly number[]>>;
244
+ /** Number of LLM calls (assistant messages) */
245
+ readonly llmCallCount?: number;
246
+ }
247
+ /**
248
+ * Combined result of trace computation + execution metrics merge.
249
+ * Returned by computeTraceSummary() and mergeExecutionMetrics().
250
+ */
251
+ interface TraceComputeResult {
252
+ readonly trace: TraceSummary;
243
253
  readonly tokenUsage?: TokenUsage;
244
- /** Total cost in USD (optional, from provider) */
245
254
  readonly costUsd?: number;
246
- /** Total execution duration in milliseconds (optional) */
247
255
  readonly durationMs?: number;
248
- /** Per-tool duration arrays in milliseconds (optional) */
249
- readonly toolDurations?: Readonly<Record<string, readonly number[]>>;
250
- /** ISO 8601 timestamp when execution started (derived from earliest span) */
251
256
  readonly startTime?: string;
252
- /** ISO 8601 timestamp when execution ended (derived from latest span) */
253
257
  readonly endTime?: string;
254
- /** Number of LLM calls (assistant messages) */
255
- readonly llmCallCount?: number;
256
258
  }
257
259
  /**
258
260
  * Argument matching mode for tool_trajectory expected items.
@@ -319,7 +321,7 @@ interface MessageLike {
319
321
  * - toolDurations: per-tool duration arrays (from durationMs or computed from start/end)
320
322
  * - llmCallCount: count of assistant messages
321
323
  */
322
- declare function computeTraceSummary(messages: readonly MessageLike[]): TraceSummary;
324
+ declare function computeTraceSummary(messages: readonly MessageLike[]): TraceComputeResult;
323
325
  /**
324
326
  * Default tool names considered as exploration/read-only operations.
325
327
  * Can be overridden per-evaluation via config.
@@ -341,7 +343,7 @@ declare function explorationRatio(summary: TraceSummary, explorationTools?: read
341
343
  * @param summary - Trace summary with optional token usage
342
344
  * @returns Average tokens per tool call, or undefined
343
345
  */
344
- declare function tokensPerTool(summary: TraceSummary): number | undefined;
346
+ declare function tokensPerTool(summary: TraceSummary, tokenUsage?: TokenUsage): number | undefined;
345
347
  /**
346
348
  * Average tool duration across all tool calls.
347
349
  * Returns undefined if toolDurations is not available or empty.
@@ -363,15 +365,15 @@ interface ExecutionMetrics {
363
365
  readonly endTime?: string;
364
366
  }
365
367
  /**
366
- * Merge execution metrics from provider response into a trace summary.
367
- * Returns a new TraceSummary with metrics fields populated.
368
+ * Merge execution metrics from provider response into a trace compute result.
369
+ * Returns a new TraceComputeResult with metrics fields populated.
368
370
  * Provider-level timing takes precedence over span-derived timing.
369
371
  *
370
- * @param summary - Base trace summary from computeTraceSummary
372
+ * @param computed - Base trace compute result from computeTraceSummary
371
373
  * @param metrics - Optional execution metrics from provider
372
- * @returns TraceSummary with merged metrics
374
+ * @returns TraceComputeResult with merged metrics
373
375
  */
374
- declare function mergeExecutionMetrics(summary: TraceSummary, metrics?: ExecutionMetrics): TraceSummary;
376
+ declare function mergeExecutionMetrics(computed: TraceComputeResult, metrics?: ExecutionMetrics): TraceComputeResult;
375
377
 
376
378
  /**
377
379
  * JSON primitive values appearing in AgentV payloads.
@@ -922,6 +924,16 @@ interface EvaluationResult {
922
924
  readonly answer: string;
923
925
  readonly target: string;
924
926
  readonly reasoning?: string;
927
+ /** Token usage metrics from provider (optional) */
928
+ readonly tokenUsage?: TokenUsage;
929
+ /** Total cost in USD (optional, from provider) */
930
+ readonly costUsd?: number;
931
+ /** Total execution duration in milliseconds (optional) */
932
+ readonly durationMs?: number;
933
+ /** ISO 8601 timestamp when execution started */
934
+ readonly startTime?: string;
935
+ /** ISO 8601 timestamp when execution ended */
936
+ readonly endTime?: string;
925
937
  readonly requests?: {
926
938
  readonly agent?: JsonObject;
927
939
  readonly lm?: JsonObject;
@@ -971,6 +983,8 @@ interface EvaluatorResult {
971
983
  readonly scores?: readonly EvaluatorResult[];
972
984
  /** Optional structured details from code judges (e.g., TP/TN/FP/FN counts). */
973
985
  readonly details?: JsonObject;
986
+ /** Token usage from LLM calls made by this evaluator (optional). */
987
+ readonly tokenUsage?: TokenUsage;
974
988
  }
975
989
  /**
976
990
  * Convenience accessor matching the Python hit_count property.
@@ -1639,6 +1653,16 @@ interface EvaluationContext {
1639
1653
  readonly output?: readonly Message[];
1640
1654
  /** Lightweight summary of trace events (if available) */
1641
1655
  readonly trace?: TraceSummary;
1656
+ /** Token usage from provider execution (promoted from TraceSummary) */
1657
+ readonly tokenUsage?: TokenUsage;
1658
+ /** Total cost in USD (from provider) */
1659
+ readonly costUsd?: number;
1660
+ /** Execution duration in milliseconds */
1661
+ readonly durationMs?: number;
1662
+ /** ISO 8601 timestamp when execution started */
1663
+ readonly startTime?: string;
1664
+ /** ISO 8601 timestamp when execution ended */
1665
+ readonly endTime?: string;
1642
1666
  /** Resolver for target override in code judges */
1643
1667
  readonly targetResolver?: TargetResolver;
1644
1668
  /** List of available target names for code judges */
@@ -1659,6 +1683,8 @@ interface EvaluationScore {
1659
1683
  readonly scores?: readonly ChildEvaluatorResult[];
1660
1684
  /** Optional structured details from code judges (e.g., TP/TN/FP/FN counts, alignments). */
1661
1685
  readonly details?: JsonObject;
1686
+ /** Token usage from LLM calls made by this evaluator (optional). */
1687
+ readonly tokenUsage?: TokenUsage;
1662
1688
  }
1663
1689
  interface ChildEvaluatorResult {
1664
1690
  readonly name: string;
@@ -1673,6 +1699,8 @@ interface ChildEvaluatorResult {
1673
1699
  readonly scores?: readonly ChildEvaluatorResult[];
1674
1700
  /** Optional structured details from code judges (e.g., TP/TN/FP/FN counts, alignments). */
1675
1701
  readonly details?: JsonObject;
1702
+ /** Token usage from LLM calls made by this evaluator (optional). */
1703
+ readonly tokenUsage?: TokenUsage;
1676
1704
  }
1677
1705
  interface Evaluator {
1678
1706
  readonly kind: string;
@@ -1742,7 +1770,7 @@ interface CostEvaluatorOptions {
1742
1770
  }
1743
1771
  /**
1744
1772
  * Evaluator that checks execution cost against a budget.
1745
- * Uses trace.costUsd from the evaluation context.
1773
+ * Uses costUsd from the evaluation context.
1746
1774
  */
1747
1775
  declare class CostEvaluator implements Evaluator {
1748
1776
  readonly kind = "cost";
@@ -1814,7 +1842,7 @@ interface LatencyEvaluatorOptions {
1814
1842
  }
1815
1843
  /**
1816
1844
  * Evaluator that checks execution duration against a threshold.
1817
- * Uses trace.durationMs from the evaluation context.
1845
+ * Uses durationMs from the evaluation context.
1818
1846
  */
1819
1847
  declare class LatencyEvaluator implements Evaluator {
1820
1848
  readonly kind = "latency";
@@ -1989,7 +2017,7 @@ interface TokenUsageEvaluatorOptions {
1989
2017
  }
1990
2018
  /**
1991
2019
  * Evaluator that checks provider-reported token usage against configured limits.
1992
- * Uses trace.tokenUsage from the evaluation context.
2020
+ * Uses tokenUsage from the evaluation context.
1993
2021
  */
1994
2022
  declare class TokenUsageEvaluator implements Evaluator {
1995
2023
  readonly kind = "token_usage";
@@ -2890,4 +2918,4 @@ type AgentKernel = {
2890
2918
  };
2891
2919
  declare function createAgentKernel(): AgentKernel;
2892
2920
 
2893
- export { AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type 
LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, 
consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, getWorkspacePath, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIsJsonAssertion, runRegexAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
2921
+ export { AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type 
LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, 
consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, getWorkspacePath, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIsJsonAssertion, runRegexAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
package/dist/index.d.ts CHANGED
@@ -239,20 +239,22 @@ interface TraceSummary {
239
239
  readonly toolCallsByName: Readonly<Record<string, number>>;
240
240
  /** Number of error events */
241
241
  readonly errorCount: number;
242
- /** Token usage metrics (optional, from provider) */
242
+ /** Per-tool duration arrays in milliseconds (optional) */
243
+ readonly toolDurations?: Readonly<Record<string, readonly number[]>>;
244
+ /** Number of LLM calls (assistant messages) */
245
+ readonly llmCallCount?: number;
246
+ }
247
+ /**
248
+ * Combined result of trace computation + execution metrics merge.
249
+ * Returned by computeTraceSummary() and mergeExecutionMetrics().
250
+ */
251
+ interface TraceComputeResult {
252
+ readonly trace: TraceSummary;
243
253
  readonly tokenUsage?: TokenUsage;
244
- /** Total cost in USD (optional, from provider) */
245
254
  readonly costUsd?: number;
246
- /** Total execution duration in milliseconds (optional) */
247
255
  readonly durationMs?: number;
248
- /** Per-tool duration arrays in milliseconds (optional) */
249
- readonly toolDurations?: Readonly<Record<string, readonly number[]>>;
250
- /** ISO 8601 timestamp when execution started (derived from earliest span) */
251
256
  readonly startTime?: string;
252
- /** ISO 8601 timestamp when execution ended (derived from latest span) */
253
257
  readonly endTime?: string;
254
- /** Number of LLM calls (assistant messages) */
255
- readonly llmCallCount?: number;
256
258
  }
257
259
  /**
258
260
  * Argument matching mode for tool_trajectory expected items.
@@ -319,7 +321,7 @@ interface MessageLike {
319
321
  * - toolDurations: per-tool duration arrays (from durationMs or computed from start/end)
320
322
  * - llmCallCount: count of assistant messages
321
323
  */
322
- declare function computeTraceSummary(messages: readonly MessageLike[]): TraceSummary;
324
+ declare function computeTraceSummary(messages: readonly MessageLike[]): TraceComputeResult;
323
325
  /**
324
326
  * Default tool names considered as exploration/read-only operations.
325
327
  * Can be overridden per-evaluation via config.
@@ -341,7 +343,7 @@ declare function explorationRatio(summary: TraceSummary, explorationTools?: read
341
343
  * @param summary - Trace summary with optional token usage
342
344
  * @returns Average tokens per tool call, or undefined
343
345
  */
344
- declare function tokensPerTool(summary: TraceSummary): number | undefined;
346
+ declare function tokensPerTool(summary: TraceSummary, tokenUsage?: TokenUsage): number | undefined;
345
347
  /**
346
348
  * Average tool duration across all tool calls.
347
349
  * Returns undefined if toolDurations is not available or empty.
@@ -363,15 +365,15 @@ interface ExecutionMetrics {
363
365
  readonly endTime?: string;
364
366
  }
365
367
  /**
366
- * Merge execution metrics from provider response into a trace summary.
367
- * Returns a new TraceSummary with metrics fields populated.
368
+ * Merge execution metrics from provider response into a trace compute result.
369
+ * Returns a new TraceComputeResult with metrics fields populated.
368
370
  * Provider-level timing takes precedence over span-derived timing.
369
371
  *
370
- * @param summary - Base trace summary from computeTraceSummary
372
+ * @param computed - Base trace compute result from computeTraceSummary
371
373
  * @param metrics - Optional execution metrics from provider
372
- * @returns TraceSummary with merged metrics
374
+ * @returns TraceComputeResult with merged metrics
373
375
  */
374
- declare function mergeExecutionMetrics(summary: TraceSummary, metrics?: ExecutionMetrics): TraceSummary;
376
+ declare function mergeExecutionMetrics(computed: TraceComputeResult, metrics?: ExecutionMetrics): TraceComputeResult;
375
377
 
376
378
  /**
377
379
  * JSON primitive values appearing in AgentV payloads.
@@ -922,6 +924,16 @@ interface EvaluationResult {
922
924
  readonly answer: string;
923
925
  readonly target: string;
924
926
  readonly reasoning?: string;
927
+ /** Token usage metrics from provider (optional) */
928
+ readonly tokenUsage?: TokenUsage;
929
+ /** Total cost in USD (optional, from provider) */
930
+ readonly costUsd?: number;
931
+ /** Total execution duration in milliseconds (optional) */
932
+ readonly durationMs?: number;
933
+ /** ISO 8601 timestamp when execution started */
934
+ readonly startTime?: string;
935
+ /** ISO 8601 timestamp when execution ended */
936
+ readonly endTime?: string;
925
937
  readonly requests?: {
926
938
  readonly agent?: JsonObject;
927
939
  readonly lm?: JsonObject;
@@ -971,6 +983,8 @@ interface EvaluatorResult {
971
983
  readonly scores?: readonly EvaluatorResult[];
972
984
  /** Optional structured details from code judges (e.g., TP/TN/FP/FN counts). */
973
985
  readonly details?: JsonObject;
986
+ /** Token usage from LLM calls made by this evaluator (optional). */
987
+ readonly tokenUsage?: TokenUsage;
974
988
  }
975
989
  /**
976
990
  * Convenience accessor matching the Python hit_count property.
@@ -1639,6 +1653,16 @@ interface EvaluationContext {
1639
1653
  readonly output?: readonly Message[];
1640
1654
  /** Lightweight summary of trace events (if available) */
1641
1655
  readonly trace?: TraceSummary;
1656
+ /** Token usage from provider execution (promoted from TraceSummary) */
1657
+ readonly tokenUsage?: TokenUsage;
1658
+ /** Total cost in USD (from provider) */
1659
+ readonly costUsd?: number;
1660
+ /** Execution duration in milliseconds */
1661
+ readonly durationMs?: number;
1662
+ /** ISO 8601 timestamp when execution started */
1663
+ readonly startTime?: string;
1664
+ /** ISO 8601 timestamp when execution ended */
1665
+ readonly endTime?: string;
1642
1666
  /** Resolver for target override in code judges */
1643
1667
  readonly targetResolver?: TargetResolver;
1644
1668
  /** List of available target names for code judges */
@@ -1659,6 +1683,8 @@ interface EvaluationScore {
1659
1683
  readonly scores?: readonly ChildEvaluatorResult[];
1660
1684
  /** Optional structured details from code judges (e.g., TP/TN/FP/FN counts, alignments). */
1661
1685
  readonly details?: JsonObject;
1686
+ /** Token usage from LLM calls made by this evaluator (optional). */
1687
+ readonly tokenUsage?: TokenUsage;
1662
1688
  }
1663
1689
  interface ChildEvaluatorResult {
1664
1690
  readonly name: string;
@@ -1673,6 +1699,8 @@ interface ChildEvaluatorResult {
1673
1699
  readonly scores?: readonly ChildEvaluatorResult[];
1674
1700
  /** Optional structured details from code judges (e.g., TP/TN/FP/FN counts, alignments). */
1675
1701
  readonly details?: JsonObject;
1702
+ /** Token usage from LLM calls made by this evaluator (optional). */
1703
+ readonly tokenUsage?: TokenUsage;
1676
1704
  }
1677
1705
  interface Evaluator {
1678
1706
  readonly kind: string;
@@ -1742,7 +1770,7 @@ interface CostEvaluatorOptions {
1742
1770
  }
1743
1771
  /**
1744
1772
  * Evaluator that checks execution cost against a budget.
1745
- * Uses trace.costUsd from the evaluation context.
1773
+ * Uses costUsd from the evaluation context.
1746
1774
  */
1747
1775
  declare class CostEvaluator implements Evaluator {
1748
1776
  readonly kind = "cost";
@@ -1814,7 +1842,7 @@ interface LatencyEvaluatorOptions {
1814
1842
  }
1815
1843
  /**
1816
1844
  * Evaluator that checks execution duration against a threshold.
1817
- * Uses trace.durationMs from the evaluation context.
1845
+ * Uses durationMs from the evaluation context.
1818
1846
  */
1819
1847
  declare class LatencyEvaluator implements Evaluator {
1820
1848
  readonly kind = "latency";
@@ -1989,7 +2017,7 @@ interface TokenUsageEvaluatorOptions {
1989
2017
  }
1990
2018
  /**
1991
2019
  * Evaluator that checks provider-reported token usage against configured limits.
1992
- * Uses trace.tokenUsage from the evaluation context.
2020
+ * Uses tokenUsage from the evaluation context.
1993
2021
  */
1994
2022
  declare class TokenUsageEvaluator implements Evaluator {
1995
2023
  readonly kind = "token_usage";
@@ -2890,4 +2918,4 @@ type AgentKernel = {
2890
2918
  };
2891
2919
  declare function createAgentKernel(): AgentKernel;
2892
2920
 
2893
- export { AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type 
LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, 
consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, getWorkspacePath, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIsJsonAssertion, runRegexAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
2921
+ export { AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type 
LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, 
consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, getWorkspacePath, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIsJsonAssertion, runRegexAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };