@agentv/core 2.8.0-next.1 → 2.9.0-next.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-P2465XAH.js → chunk-7Q4PH265.js} +1 -1
- package/dist/chunk-7Q4PH265.js.map +1 -0
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +163 -72
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +48 -20
- package/dist/index.d.ts +48 -20
- package/dist/index.js +164 -73
- package/dist/index.js.map +1 -1
- package/package.json +6 -4
- package/dist/chunk-P2465XAH.js.map +0 -1
package/dist/index.d.cts
CHANGED
|
@@ -239,20 +239,22 @@ interface TraceSummary {
|
|
|
239
239
|
readonly toolCallsByName: Readonly<Record<string, number>>;
|
|
240
240
|
/** Number of error events */
|
|
241
241
|
readonly errorCount: number;
|
|
242
|
-
/**
|
|
242
|
+
/** Per-tool duration arrays in milliseconds (optional) */
|
|
243
|
+
readonly toolDurations?: Readonly<Record<string, readonly number[]>>;
|
|
244
|
+
/** Number of LLM calls (assistant messages) */
|
|
245
|
+
readonly llmCallCount?: number;
|
|
246
|
+
}
|
|
247
|
+
/**
|
|
248
|
+
* Combined result of trace computation + execution metrics merge.
|
|
249
|
+
* Returned by computeTraceSummary() and mergeExecutionMetrics().
|
|
250
|
+
*/
|
|
251
|
+
interface TraceComputeResult {
|
|
252
|
+
readonly trace: TraceSummary;
|
|
243
253
|
readonly tokenUsage?: TokenUsage;
|
|
244
|
-
/** Total cost in USD (optional, from provider) */
|
|
245
254
|
readonly costUsd?: number;
|
|
246
|
-
/** Total execution duration in milliseconds (optional) */
|
|
247
255
|
readonly durationMs?: number;
|
|
248
|
-
/** Per-tool duration arrays in milliseconds (optional) */
|
|
249
|
-
readonly toolDurations?: Readonly<Record<string, readonly number[]>>;
|
|
250
|
-
/** ISO 8601 timestamp when execution started (derived from earliest span) */
|
|
251
256
|
readonly startTime?: string;
|
|
252
|
-
/** ISO 8601 timestamp when execution ended (derived from latest span) */
|
|
253
257
|
readonly endTime?: string;
|
|
254
|
-
/** Number of LLM calls (assistant messages) */
|
|
255
|
-
readonly llmCallCount?: number;
|
|
256
258
|
}
|
|
257
259
|
/**
|
|
258
260
|
* Argument matching mode for tool_trajectory expected items.
|
|
@@ -319,7 +321,7 @@ interface MessageLike {
|
|
|
319
321
|
* - toolDurations: per-tool duration arrays (from durationMs or computed from start/end)
|
|
320
322
|
* - llmCallCount: count of assistant messages
|
|
321
323
|
*/
|
|
322
|
-
declare function computeTraceSummary(messages: readonly MessageLike[]): TraceSummary;
|
|
324
|
+
declare function computeTraceSummary(messages: readonly MessageLike[]): TraceComputeResult;
|
|
323
325
|
/**
|
|
324
326
|
* Default tool names considered as exploration/read-only operations.
|
|
325
327
|
* Can be overridden per-evaluation via config.
|
|
@@ -341,7 +343,7 @@ declare function explorationRatio(summary: TraceSummary, explorationTools?: read
|
|
|
341
343
|
* @param summary - Trace summary with optional token usage
|
|
342
344
|
* @returns Average tokens per tool call, or undefined
|
|
343
345
|
*/
|
|
344
|
-
declare function tokensPerTool(summary: TraceSummary): number | undefined;
|
|
346
|
+
declare function tokensPerTool(summary: TraceSummary, tokenUsage?: TokenUsage): number | undefined;
|
|
345
347
|
/**
|
|
346
348
|
* Average tool duration across all tool calls.
|
|
347
349
|
* Returns undefined if toolDurations is not available or empty.
|
|
@@ -363,15 +365,15 @@ interface ExecutionMetrics {
|
|
|
363
365
|
readonly endTime?: string;
|
|
364
366
|
}
|
|
365
367
|
/**
|
|
366
|
-
* Merge execution metrics from provider response into a trace summary.
|
|
367
|
-
* Returns a new TraceSummary with metrics fields populated.
|
|
368
|
+
* Merge execution metrics from provider response into a trace compute result.
|
|
369
|
+
* Returns a new TraceComputeResult with metrics fields populated.
|
|
368
370
|
* Provider-level timing takes precedence over span-derived timing.
|
|
369
371
|
*
|
|
370
|
-
* @param summary - Base trace summary from computeTraceSummary
|
|
372
|
+
* @param computed - Base trace compute result from computeTraceSummary
|
|
371
373
|
* @param metrics - Optional execution metrics from provider
|
|
372
|
-
* @returns TraceSummary with merged metrics
|
|
374
|
+
* @returns TraceComputeResult with merged metrics
|
|
373
375
|
*/
|
|
374
|
-
declare function mergeExecutionMetrics(summary: TraceSummary, metrics?: ExecutionMetrics): TraceSummary;
|
|
376
|
+
declare function mergeExecutionMetrics(computed: TraceComputeResult, metrics?: ExecutionMetrics): TraceComputeResult;
|
|
375
377
|
|
|
376
378
|
/**
|
|
377
379
|
* JSON primitive values appearing in AgentV payloads.
|
|
@@ -922,6 +924,16 @@ interface EvaluationResult {
|
|
|
922
924
|
readonly answer: string;
|
|
923
925
|
readonly target: string;
|
|
924
926
|
readonly reasoning?: string;
|
|
927
|
+
/** Token usage metrics from provider (optional) */
|
|
928
|
+
readonly tokenUsage?: TokenUsage;
|
|
929
|
+
/** Total cost in USD (optional, from provider) */
|
|
930
|
+
readonly costUsd?: number;
|
|
931
|
+
/** Total execution duration in milliseconds (optional) */
|
|
932
|
+
readonly durationMs?: number;
|
|
933
|
+
/** ISO 8601 timestamp when execution started */
|
|
934
|
+
readonly startTime?: string;
|
|
935
|
+
/** ISO 8601 timestamp when execution ended */
|
|
936
|
+
readonly endTime?: string;
|
|
925
937
|
readonly requests?: {
|
|
926
938
|
readonly agent?: JsonObject;
|
|
927
939
|
readonly lm?: JsonObject;
|
|
@@ -971,6 +983,8 @@ interface EvaluatorResult {
|
|
|
971
983
|
readonly scores?: readonly EvaluatorResult[];
|
|
972
984
|
/** Optional structured details from code judges (e.g., TP/TN/FP/FN counts). */
|
|
973
985
|
readonly details?: JsonObject;
|
|
986
|
+
/** Token usage from LLM calls made by this evaluator (optional). */
|
|
987
|
+
readonly tokenUsage?: TokenUsage;
|
|
974
988
|
}
|
|
975
989
|
/**
|
|
976
990
|
* Convenience accessor matching the Python hit_count property.
|
|
@@ -1639,6 +1653,16 @@ interface EvaluationContext {
|
|
|
1639
1653
|
readonly output?: readonly Message[];
|
|
1640
1654
|
/** Lightweight summary of trace events (if available) */
|
|
1641
1655
|
readonly trace?: TraceSummary;
|
|
1656
|
+
/** Token usage from provider execution (promoted from TraceSummary) */
|
|
1657
|
+
readonly tokenUsage?: TokenUsage;
|
|
1658
|
+
/** Total cost in USD (from provider) */
|
|
1659
|
+
readonly costUsd?: number;
|
|
1660
|
+
/** Execution duration in milliseconds */
|
|
1661
|
+
readonly durationMs?: number;
|
|
1662
|
+
/** ISO 8601 timestamp when execution started */
|
|
1663
|
+
readonly startTime?: string;
|
|
1664
|
+
/** ISO 8601 timestamp when execution ended */
|
|
1665
|
+
readonly endTime?: string;
|
|
1642
1666
|
/** Resolver for target override in code judges */
|
|
1643
1667
|
readonly targetResolver?: TargetResolver;
|
|
1644
1668
|
/** List of available target names for code judges */
|
|
@@ -1659,6 +1683,8 @@ interface EvaluationScore {
|
|
|
1659
1683
|
readonly scores?: readonly ChildEvaluatorResult[];
|
|
1660
1684
|
/** Optional structured details from code judges (e.g., TP/TN/FP/FN counts, alignments). */
|
|
1661
1685
|
readonly details?: JsonObject;
|
|
1686
|
+
/** Token usage from LLM calls made by this evaluator (optional). */
|
|
1687
|
+
readonly tokenUsage?: TokenUsage;
|
|
1662
1688
|
}
|
|
1663
1689
|
interface ChildEvaluatorResult {
|
|
1664
1690
|
readonly name: string;
|
|
@@ -1673,6 +1699,8 @@ interface ChildEvaluatorResult {
|
|
|
1673
1699
|
readonly scores?: readonly ChildEvaluatorResult[];
|
|
1674
1700
|
/** Optional structured details from code judges (e.g., TP/TN/FP/FN counts, alignments). */
|
|
1675
1701
|
readonly details?: JsonObject;
|
|
1702
|
+
/** Token usage from LLM calls made by this evaluator (optional). */
|
|
1703
|
+
readonly tokenUsage?: TokenUsage;
|
|
1676
1704
|
}
|
|
1677
1705
|
interface Evaluator {
|
|
1678
1706
|
readonly kind: string;
|
|
@@ -1742,7 +1770,7 @@ interface CostEvaluatorOptions {
|
|
|
1742
1770
|
}
|
|
1743
1771
|
/**
|
|
1744
1772
|
* Evaluator that checks execution cost against a budget.
|
|
1745
|
-
* Uses
|
|
1773
|
+
* Uses costUsd from the evaluation context.
|
|
1746
1774
|
*/
|
|
1747
1775
|
declare class CostEvaluator implements Evaluator {
|
|
1748
1776
|
readonly kind = "cost";
|
|
@@ -1814,7 +1842,7 @@ interface LatencyEvaluatorOptions {
|
|
|
1814
1842
|
}
|
|
1815
1843
|
/**
|
|
1816
1844
|
* Evaluator that checks execution duration against a threshold.
|
|
1817
|
-
* Uses
|
|
1845
|
+
* Uses durationMs from the evaluation context.
|
|
1818
1846
|
*/
|
|
1819
1847
|
declare class LatencyEvaluator implements Evaluator {
|
|
1820
1848
|
readonly kind = "latency";
|
|
@@ -1989,7 +2017,7 @@ interface TokenUsageEvaluatorOptions {
|
|
|
1989
2017
|
}
|
|
1990
2018
|
/**
|
|
1991
2019
|
* Evaluator that checks provider-reported token usage against configured limits.
|
|
1992
|
-
* Uses
|
|
2020
|
+
* Uses tokenUsage from the evaluation context.
|
|
1993
2021
|
*/
|
|
1994
2022
|
declare class TokenUsageEvaluator implements Evaluator {
|
|
1995
2023
|
readonly kind = "token_usage";
|
|
@@ -2890,4 +2918,4 @@ type AgentKernel = {
|
|
|
2890
2918
|
};
|
|
2891
2919
|
declare function createAgentKernel(): AgentKernel;
|
|
2892
2920
|
|
|
2893
|
-
export { AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type 
LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, 
consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, getWorkspacePath, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIsJsonAssertion, runRegexAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
|
|
2921
|
+
export { AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type 
LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, 
consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, getWorkspacePath, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIsJsonAssertion, runRegexAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
|
package/dist/index.d.ts
CHANGED
|
@@ -239,20 +239,22 @@ interface TraceSummary {
|
|
|
239
239
|
readonly toolCallsByName: Readonly<Record<string, number>>;
|
|
240
240
|
/** Number of error events */
|
|
241
241
|
readonly errorCount: number;
|
|
242
|
-
/**
|
|
242
|
+
/** Per-tool duration arrays in milliseconds (optional) */
|
|
243
|
+
readonly toolDurations?: Readonly<Record<string, readonly number[]>>;
|
|
244
|
+
/** Number of LLM calls (assistant messages) */
|
|
245
|
+
readonly llmCallCount?: number;
|
|
246
|
+
}
|
|
247
|
+
/**
|
|
248
|
+
* Combined result of trace computation + execution metrics merge.
|
|
249
|
+
* Returned by computeTraceSummary() and mergeExecutionMetrics().
|
|
250
|
+
*/
|
|
251
|
+
interface TraceComputeResult {
|
|
252
|
+
readonly trace: TraceSummary;
|
|
243
253
|
readonly tokenUsage?: TokenUsage;
|
|
244
|
-
/** Total cost in USD (optional, from provider) */
|
|
245
254
|
readonly costUsd?: number;
|
|
246
|
-
/** Total execution duration in milliseconds (optional) */
|
|
247
255
|
readonly durationMs?: number;
|
|
248
|
-
/** Per-tool duration arrays in milliseconds (optional) */
|
|
249
|
-
readonly toolDurations?: Readonly<Record<string, readonly number[]>>;
|
|
250
|
-
/** ISO 8601 timestamp when execution started (derived from earliest span) */
|
|
251
256
|
readonly startTime?: string;
|
|
252
|
-
/** ISO 8601 timestamp when execution ended (derived from latest span) */
|
|
253
257
|
readonly endTime?: string;
|
|
254
|
-
/** Number of LLM calls (assistant messages) */
|
|
255
|
-
readonly llmCallCount?: number;
|
|
256
258
|
}
|
|
257
259
|
/**
|
|
258
260
|
* Argument matching mode for tool_trajectory expected items.
|
|
@@ -319,7 +321,7 @@ interface MessageLike {
|
|
|
319
321
|
* - toolDurations: per-tool duration arrays (from durationMs or computed from start/end)
|
|
320
322
|
* - llmCallCount: count of assistant messages
|
|
321
323
|
*/
|
|
322
|
-
declare function computeTraceSummary(messages: readonly MessageLike[]): TraceSummary;
|
|
324
|
+
declare function computeTraceSummary(messages: readonly MessageLike[]): TraceComputeResult;
|
|
323
325
|
/**
|
|
324
326
|
* Default tool names considered as exploration/read-only operations.
|
|
325
327
|
* Can be overridden per-evaluation via config.
|
|
@@ -341,7 +343,7 @@ declare function explorationRatio(summary: TraceSummary, explorationTools?: read
|
|
|
341
343
|
* @param summary - Trace summary with optional token usage
|
|
342
344
|
* @returns Average tokens per tool call, or undefined
|
|
343
345
|
*/
|
|
344
|
-
declare function tokensPerTool(summary: TraceSummary): number | undefined;
|
|
346
|
+
declare function tokensPerTool(summary: TraceSummary, tokenUsage?: TokenUsage): number | undefined;
|
|
345
347
|
/**
|
|
346
348
|
* Average tool duration across all tool calls.
|
|
347
349
|
* Returns undefined if toolDurations is not available or empty.
|
|
@@ -363,15 +365,15 @@ interface ExecutionMetrics {
|
|
|
363
365
|
readonly endTime?: string;
|
|
364
366
|
}
|
|
365
367
|
/**
|
|
366
|
-
* Merge execution metrics from provider response into a trace summary.
|
|
367
|
-
* Returns a new TraceSummary with metrics fields populated.
|
|
368
|
+
* Merge execution metrics from provider response into a trace compute result.
|
|
369
|
+
* Returns a new TraceComputeResult with metrics fields populated.
|
|
368
370
|
* Provider-level timing takes precedence over span-derived timing.
|
|
369
371
|
*
|
|
370
|
-
* @param summary - Base trace summary from computeTraceSummary
|
|
372
|
+
* @param computed - Base trace compute result from computeTraceSummary
|
|
371
373
|
* @param metrics - Optional execution metrics from provider
|
|
372
|
-
* @returns TraceSummary with merged metrics
|
|
374
|
+
* @returns TraceComputeResult with merged metrics
|
|
373
375
|
*/
|
|
374
|
-
declare function mergeExecutionMetrics(summary: TraceSummary, metrics?: ExecutionMetrics): TraceSummary;
|
|
376
|
+
declare function mergeExecutionMetrics(computed: TraceComputeResult, metrics?: ExecutionMetrics): TraceComputeResult;
|
|
375
377
|
|
|
376
378
|
/**
|
|
377
379
|
* JSON primitive values appearing in AgentV payloads.
|
|
@@ -922,6 +924,16 @@ interface EvaluationResult {
|
|
|
922
924
|
readonly answer: string;
|
|
923
925
|
readonly target: string;
|
|
924
926
|
readonly reasoning?: string;
|
|
927
|
+
/** Token usage metrics from provider (optional) */
|
|
928
|
+
readonly tokenUsage?: TokenUsage;
|
|
929
|
+
/** Total cost in USD (optional, from provider) */
|
|
930
|
+
readonly costUsd?: number;
|
|
931
|
+
/** Total execution duration in milliseconds (optional) */
|
|
932
|
+
readonly durationMs?: number;
|
|
933
|
+
/** ISO 8601 timestamp when execution started */
|
|
934
|
+
readonly startTime?: string;
|
|
935
|
+
/** ISO 8601 timestamp when execution ended */
|
|
936
|
+
readonly endTime?: string;
|
|
925
937
|
readonly requests?: {
|
|
926
938
|
readonly agent?: JsonObject;
|
|
927
939
|
readonly lm?: JsonObject;
|
|
@@ -971,6 +983,8 @@ interface EvaluatorResult {
|
|
|
971
983
|
readonly scores?: readonly EvaluatorResult[];
|
|
972
984
|
/** Optional structured details from code judges (e.g., TP/TN/FP/FN counts). */
|
|
973
985
|
readonly details?: JsonObject;
|
|
986
|
+
/** Token usage from LLM calls made by this evaluator (optional). */
|
|
987
|
+
readonly tokenUsage?: TokenUsage;
|
|
974
988
|
}
|
|
975
989
|
/**
|
|
976
990
|
* Convenience accessor matching the Python hit_count property.
|
|
@@ -1639,6 +1653,16 @@ interface EvaluationContext {
|
|
|
1639
1653
|
readonly output?: readonly Message[];
|
|
1640
1654
|
/** Lightweight summary of trace events (if available) */
|
|
1641
1655
|
readonly trace?: TraceSummary;
|
|
1656
|
+
/** Token usage from provider execution (promoted from TraceSummary) */
|
|
1657
|
+
readonly tokenUsage?: TokenUsage;
|
|
1658
|
+
/** Total cost in USD (from provider) */
|
|
1659
|
+
readonly costUsd?: number;
|
|
1660
|
+
/** Execution duration in milliseconds */
|
|
1661
|
+
readonly durationMs?: number;
|
|
1662
|
+
/** ISO 8601 timestamp when execution started */
|
|
1663
|
+
readonly startTime?: string;
|
|
1664
|
+
/** ISO 8601 timestamp when execution ended */
|
|
1665
|
+
readonly endTime?: string;
|
|
1642
1666
|
/** Resolver for target override in code judges */
|
|
1643
1667
|
readonly targetResolver?: TargetResolver;
|
|
1644
1668
|
/** List of available target names for code judges */
|
|
@@ -1659,6 +1683,8 @@ interface EvaluationScore {
|
|
|
1659
1683
|
readonly scores?: readonly ChildEvaluatorResult[];
|
|
1660
1684
|
/** Optional structured details from code judges (e.g., TP/TN/FP/FN counts, alignments). */
|
|
1661
1685
|
readonly details?: JsonObject;
|
|
1686
|
+
/** Token usage from LLM calls made by this evaluator (optional). */
|
|
1687
|
+
readonly tokenUsage?: TokenUsage;
|
|
1662
1688
|
}
|
|
1663
1689
|
interface ChildEvaluatorResult {
|
|
1664
1690
|
readonly name: string;
|
|
@@ -1673,6 +1699,8 @@ interface ChildEvaluatorResult {
|
|
|
1673
1699
|
readonly scores?: readonly ChildEvaluatorResult[];
|
|
1674
1700
|
/** Optional structured details from code judges (e.g., TP/TN/FP/FN counts, alignments). */
|
|
1675
1701
|
readonly details?: JsonObject;
|
|
1702
|
+
/** Token usage from LLM calls made by this evaluator (optional). */
|
|
1703
|
+
readonly tokenUsage?: TokenUsage;
|
|
1676
1704
|
}
|
|
1677
1705
|
interface Evaluator {
|
|
1678
1706
|
readonly kind: string;
|
|
@@ -1742,7 +1770,7 @@ interface CostEvaluatorOptions {
|
|
|
1742
1770
|
}
|
|
1743
1771
|
/**
|
|
1744
1772
|
* Evaluator that checks execution cost against a budget.
|
|
1745
|
-
* Uses
|
|
1773
|
+
* Uses costUsd from the evaluation context.
|
|
1746
1774
|
*/
|
|
1747
1775
|
declare class CostEvaluator implements Evaluator {
|
|
1748
1776
|
readonly kind = "cost";
|
|
@@ -1814,7 +1842,7 @@ interface LatencyEvaluatorOptions {
|
|
|
1814
1842
|
}
|
|
1815
1843
|
/**
|
|
1816
1844
|
* Evaluator that checks execution duration against a threshold.
|
|
1817
|
-
* Uses
|
|
1845
|
+
* Uses durationMs from the evaluation context.
|
|
1818
1846
|
*/
|
|
1819
1847
|
declare class LatencyEvaluator implements Evaluator {
|
|
1820
1848
|
readonly kind = "latency";
|
|
@@ -1989,7 +2017,7 @@ interface TokenUsageEvaluatorOptions {
|
|
|
1989
2017
|
}
|
|
1990
2018
|
/**
|
|
1991
2019
|
* Evaluator that checks provider-reported token usage against configured limits.
|
|
1992
|
-
* Uses
|
|
2020
|
+
* Uses tokenUsage from the evaluation context.
|
|
1993
2021
|
*/
|
|
1994
2022
|
declare class TokenUsageEvaluator implements Evaluator {
|
|
1995
2023
|
readonly kind = "token_usage";
|
|
@@ -2890,4 +2918,4 @@ type AgentKernel = {
|
|
|
2890
2918
|
};
|
|
2891
2919
|
declare function createAgentKernel(): AgentKernel;
|
|
2892
2920
|
|
|
2893
|
-
export { AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type 
LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, 
consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, getWorkspacePath, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIsJsonAssertion, runRegexAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
|
|
2921
|
+
export { AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type 
LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, 
consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, getWorkspacePath, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIsJsonAssertion, runRegexAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
|