@agentv/core 2.7.1-next.6 → 2.9.0-next.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-5SV2QC6V.js → chunk-7Q4PH265.js} +6 -18
- package/dist/chunk-7Q4PH265.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +4 -11
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +2 -2
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +234 -89
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +54 -22
- package/dist/index.d.ts +54 -22
- package/dist/index.js +230 -73
- package/dist/index.js.map +1 -1
- package/package.json +6 -4
- package/dist/chunk-5SV2QC6V.js.map +0 -1
package/dist/index.d.cts
CHANGED
|
@@ -187,8 +187,6 @@ interface TargetDefinition {
|
|
|
187
187
|
readonly subagentRoot?: string | unknown | undefined;
|
|
188
188
|
readonly workspace_template?: string | unknown | undefined;
|
|
189
189
|
readonly workspaceTemplate?: string | unknown | undefined;
|
|
190
|
-
readonly command_template?: string | unknown | undefined;
|
|
191
|
-
readonly commandTemplate?: string | unknown | undefined;
|
|
192
190
|
readonly files_format?: string | unknown | undefined;
|
|
193
191
|
readonly filesFormat?: string | unknown | undefined;
|
|
194
192
|
readonly attachments_format?: string | unknown | undefined;
|
|
@@ -241,20 +239,22 @@ interface TraceSummary {
|
|
|
241
239
|
readonly toolCallsByName: Readonly<Record<string, number>>;
|
|
242
240
|
/** Number of error events */
|
|
243
241
|
readonly errorCount: number;
|
|
244
|
-
/**
|
|
242
|
+
/** Per-tool duration arrays in milliseconds (optional) */
|
|
243
|
+
readonly toolDurations?: Readonly<Record<string, readonly number[]>>;
|
|
244
|
+
/** Number of LLM calls (assistant messages) */
|
|
245
|
+
readonly llmCallCount?: number;
|
|
246
|
+
}
|
|
247
|
+
/**
|
|
248
|
+
* Combined result of trace computation + execution metrics merge.
|
|
249
|
+
* Returned by computeTraceSummaryWithMetrics().
|
|
250
|
+
*/
|
|
251
|
+
interface TraceComputeResult {
|
|
252
|
+
readonly trace: TraceSummary;
|
|
245
253
|
readonly tokenUsage?: TokenUsage;
|
|
246
|
-
/** Total cost in USD (optional, from provider) */
|
|
247
254
|
readonly costUsd?: number;
|
|
248
|
-
/** Total execution duration in milliseconds (optional) */
|
|
249
255
|
readonly durationMs?: number;
|
|
250
|
-
/** Per-tool duration arrays in milliseconds (optional) */
|
|
251
|
-
readonly toolDurations?: Readonly<Record<string, readonly number[]>>;
|
|
252
|
-
/** ISO 8601 timestamp when execution started (derived from earliest span) */
|
|
253
256
|
readonly startTime?: string;
|
|
254
|
-
/** ISO 8601 timestamp when execution ended (derived from latest span) */
|
|
255
257
|
readonly endTime?: string;
|
|
256
|
-
/** Number of LLM calls (assistant messages) */
|
|
257
|
-
readonly llmCallCount?: number;
|
|
258
258
|
}
|
|
259
259
|
/**
|
|
260
260
|
* Argument matching mode for tool_trajectory expected items.
|
|
@@ -321,7 +321,7 @@ interface MessageLike {
|
|
|
321
321
|
* - toolDurations: per-tool duration arrays (from durationMs or computed from start/end)
|
|
322
322
|
* - llmCallCount: count of assistant messages
|
|
323
323
|
*/
|
|
324
|
-
declare function computeTraceSummary(messages: readonly MessageLike[]):
|
|
324
|
+
declare function computeTraceSummary(messages: readonly MessageLike[]): TraceComputeResult;
|
|
325
325
|
/**
|
|
326
326
|
* Default tool names considered as exploration/read-only operations.
|
|
327
327
|
* Can be overridden per-evaluation via config.
|
|
@@ -343,7 +343,7 @@ declare function explorationRatio(summary: TraceSummary, explorationTools?: read
|
|
|
343
343
|
* @param summary - Trace summary with optional token usage
|
|
344
344
|
* @returns Average tokens per tool call, or undefined
|
|
345
345
|
*/
|
|
346
|
-
declare function tokensPerTool(summary: TraceSummary): number | undefined;
|
|
346
|
+
declare function tokensPerTool(summary: TraceSummary, tokenUsage?: TokenUsage): number | undefined;
|
|
347
347
|
/**
|
|
348
348
|
* Average tool duration across all tool calls.
|
|
349
349
|
* Returns undefined if toolDurations is not available or empty.
|
|
@@ -365,15 +365,15 @@ interface ExecutionMetrics {
|
|
|
365
365
|
readonly endTime?: string;
|
|
366
366
|
}
|
|
367
367
|
/**
|
|
368
|
-
* Merge execution metrics from provider response into a trace
|
|
369
|
-
* Returns a new
|
|
368
|
+
* Merge execution metrics from provider response into a trace compute result.
|
|
369
|
+
* Returns a new TraceComputeResult with metrics fields populated.
|
|
370
370
|
* Provider-level timing takes precedence over span-derived timing.
|
|
371
371
|
*
|
|
372
|
-
* @param
|
|
372
|
+
* @param computed - Base trace compute result from computeTraceSummary
|
|
373
373
|
* @param metrics - Optional execution metrics from provider
|
|
374
|
-
* @returns
|
|
374
|
+
* @returns TraceComputeResult with merged metrics
|
|
375
375
|
*/
|
|
376
|
-
declare function mergeExecutionMetrics(
|
|
376
|
+
declare function mergeExecutionMetrics(computed: TraceComputeResult, metrics?: ExecutionMetrics): TraceComputeResult;
|
|
377
377
|
|
|
378
378
|
/**
|
|
379
379
|
* JSON primitive values appearing in AgentV payloads.
|
|
@@ -924,6 +924,16 @@ interface EvaluationResult {
|
|
|
924
924
|
readonly answer: string;
|
|
925
925
|
readonly target: string;
|
|
926
926
|
readonly reasoning?: string;
|
|
927
|
+
/** Token usage metrics from provider (optional) */
|
|
928
|
+
readonly tokenUsage?: TokenUsage;
|
|
929
|
+
/** Total cost in USD (optional, from provider) */
|
|
930
|
+
readonly costUsd?: number;
|
|
931
|
+
/** Total execution duration in milliseconds (optional) */
|
|
932
|
+
readonly durationMs?: number;
|
|
933
|
+
/** ISO 8601 timestamp when execution started */
|
|
934
|
+
readonly startTime?: string;
|
|
935
|
+
/** ISO 8601 timestamp when execution ended */
|
|
936
|
+
readonly endTime?: string;
|
|
927
937
|
readonly requests?: {
|
|
928
938
|
readonly agent?: JsonObject;
|
|
929
939
|
readonly lm?: JsonObject;
|
|
@@ -955,6 +965,8 @@ interface EvaluationResult {
|
|
|
955
965
|
readonly aggregation?: TrialAggregation;
|
|
956
966
|
/** Whether the trial loop was terminated early due to cost limit */
|
|
957
967
|
readonly costLimited?: boolean;
|
|
968
|
+
/** Whether the evaluation was skipped due to suite-level budget exhaustion */
|
|
969
|
+
readonly budgetExceeded?: boolean;
|
|
958
970
|
}
|
|
959
971
|
type EvaluationVerdict = 'pass' | 'fail' | 'borderline';
|
|
960
972
|
interface EvaluatorResult {
|
|
@@ -971,6 +983,8 @@ interface EvaluatorResult {
|
|
|
971
983
|
readonly scores?: readonly EvaluatorResult[];
|
|
972
984
|
/** Optional structured details from code judges (e.g., TP/TN/FP/FN counts). */
|
|
973
985
|
readonly details?: JsonObject;
|
|
986
|
+
/** Token usage from LLM calls made by this evaluator (optional). */
|
|
987
|
+
readonly tokenUsage?: TokenUsage;
|
|
974
988
|
}
|
|
975
989
|
/**
|
|
976
990
|
* Convenience accessor matching the Python hit_count property.
|
|
@@ -1116,6 +1130,8 @@ type EvalSuiteResult = {
|
|
|
1116
1130
|
readonly cacheConfig?: CacheConfig;
|
|
1117
1131
|
/** Suite-level metadata (name, description, version, etc.) */
|
|
1118
1132
|
readonly metadata?: EvalMetadata;
|
|
1133
|
+
/** Suite-level total cost budget in USD */
|
|
1134
|
+
readonly totalBudgetUsd?: number;
|
|
1119
1135
|
};
|
|
1120
1136
|
/**
|
|
1121
1137
|
* Load tests and suite metadata from a single parse.
|
|
@@ -1637,6 +1653,16 @@ interface EvaluationContext {
|
|
|
1637
1653
|
readonly output?: readonly Message[];
|
|
1638
1654
|
/** Lightweight summary of trace events (if available) */
|
|
1639
1655
|
readonly trace?: TraceSummary;
|
|
1656
|
+
/** Token usage from provider execution (promoted from TraceSummary) */
|
|
1657
|
+
readonly tokenUsage?: TokenUsage;
|
|
1658
|
+
/** Total cost in USD (from provider) */
|
|
1659
|
+
readonly costUsd?: number;
|
|
1660
|
+
/** Execution duration in milliseconds */
|
|
1661
|
+
readonly durationMs?: number;
|
|
1662
|
+
/** ISO 8601 timestamp when execution started */
|
|
1663
|
+
readonly startTime?: string;
|
|
1664
|
+
/** ISO 8601 timestamp when execution ended */
|
|
1665
|
+
readonly endTime?: string;
|
|
1640
1666
|
/** Resolver for target override in code judges */
|
|
1641
1667
|
readonly targetResolver?: TargetResolver;
|
|
1642
1668
|
/** List of available target names for code judges */
|
|
@@ -1657,6 +1683,8 @@ interface EvaluationScore {
|
|
|
1657
1683
|
readonly scores?: readonly ChildEvaluatorResult[];
|
|
1658
1684
|
/** Optional structured details from code judges (e.g., TP/TN/FP/FN counts, alignments). */
|
|
1659
1685
|
readonly details?: JsonObject;
|
|
1686
|
+
/** Token usage from LLM calls made by this evaluator (optional). */
|
|
1687
|
+
readonly tokenUsage?: TokenUsage;
|
|
1660
1688
|
}
|
|
1661
1689
|
interface ChildEvaluatorResult {
|
|
1662
1690
|
readonly name: string;
|
|
@@ -1671,6 +1699,8 @@ interface ChildEvaluatorResult {
|
|
|
1671
1699
|
readonly scores?: readonly ChildEvaluatorResult[];
|
|
1672
1700
|
/** Optional structured details from code judges (e.g., TP/TN/FP/FN counts, alignments). */
|
|
1673
1701
|
readonly details?: JsonObject;
|
|
1702
|
+
/** Token usage from LLM calls made by this evaluator (optional). */
|
|
1703
|
+
readonly tokenUsage?: TokenUsage;
|
|
1674
1704
|
}
|
|
1675
1705
|
interface Evaluator {
|
|
1676
1706
|
readonly kind: string;
|
|
@@ -1740,7 +1770,7 @@ interface CostEvaluatorOptions {
|
|
|
1740
1770
|
}
|
|
1741
1771
|
/**
|
|
1742
1772
|
* Evaluator that checks execution cost against a budget.
|
|
1743
|
-
* Uses
|
|
1773
|
+
* Uses costUsd from the evaluation context.
|
|
1744
1774
|
*/
|
|
1745
1775
|
declare class CostEvaluator implements Evaluator {
|
|
1746
1776
|
readonly kind = "cost";
|
|
@@ -1812,7 +1842,7 @@ interface LatencyEvaluatorOptions {
|
|
|
1812
1842
|
}
|
|
1813
1843
|
/**
|
|
1814
1844
|
* Evaluator that checks execution duration against a threshold.
|
|
1815
|
-
* Uses
|
|
1845
|
+
* Uses durationMs from the evaluation context.
|
|
1816
1846
|
*/
|
|
1817
1847
|
declare class LatencyEvaluator implements Evaluator {
|
|
1818
1848
|
readonly kind = "latency";
|
|
@@ -1987,7 +2017,7 @@ interface TokenUsageEvaluatorOptions {
|
|
|
1987
2017
|
}
|
|
1988
2018
|
/**
|
|
1989
2019
|
* Evaluator that checks provider-reported token usage against configured limits.
|
|
1990
|
-
* Uses
|
|
2020
|
+
* Uses tokenUsage from the evaluation context.
|
|
1991
2021
|
*/
|
|
1992
2022
|
declare class TokenUsageEvaluator implements Evaluator {
|
|
1993
2023
|
readonly kind = "token_usage";
|
|
@@ -2196,6 +2226,8 @@ interface RunEvaluationOptions {
|
|
|
2196
2226
|
readonly trials?: TrialsConfig;
|
|
2197
2227
|
/** Real-time observability callbacks passed to the provider */
|
|
2198
2228
|
readonly streamCallbacks?: ProviderStreamCallbacks;
|
|
2229
|
+
/** Suite-level total cost budget in USD (stops dispatching when exceeded) */
|
|
2230
|
+
readonly totalBudgetUsd?: number;
|
|
2199
2231
|
}
|
|
2200
2232
|
declare function runEvaluation(options: RunEvaluationOptions): Promise<readonly EvaluationResult[]>;
|
|
2201
2233
|
declare function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationResult>;
|
|
@@ -2886,4 +2918,4 @@ type AgentKernel = {
|
|
|
2886
2918
|
};
|
|
2887
2919
|
declare function createAgentKernel(): AgentKernel;
|
|
2888
2920
|
|
|
2889
|
-
export { AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, getWorkspacePath, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIsJsonAssertion, runRegexAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
|
|
2921
|
+
export { AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, getWorkspacePath, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIsJsonAssertion, runRegexAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
|
package/dist/index.d.ts
CHANGED
|
@@ -187,8 +187,6 @@ interface TargetDefinition {
|
|
|
187
187
|
readonly subagentRoot?: string | unknown | undefined;
|
|
188
188
|
readonly workspace_template?: string | unknown | undefined;
|
|
189
189
|
readonly workspaceTemplate?: string | unknown | undefined;
|
|
190
|
-
readonly command_template?: string | unknown | undefined;
|
|
191
|
-
readonly commandTemplate?: string | unknown | undefined;
|
|
192
190
|
readonly files_format?: string | unknown | undefined;
|
|
193
191
|
readonly filesFormat?: string | unknown | undefined;
|
|
194
192
|
readonly attachments_format?: string | unknown | undefined;
|
|
@@ -241,20 +239,22 @@ interface TraceSummary {
|
|
|
241
239
|
readonly toolCallsByName: Readonly<Record<string, number>>;
|
|
242
240
|
/** Number of error events */
|
|
243
241
|
readonly errorCount: number;
|
|
244
|
-
/**
|
|
242
|
+
/** Per-tool duration arrays in milliseconds (optional) */
|
|
243
|
+
readonly toolDurations?: Readonly<Record<string, readonly number[]>>;
|
|
244
|
+
/** Number of LLM calls (assistant messages) */
|
|
245
|
+
readonly llmCallCount?: number;
|
|
246
|
+
}
|
|
247
|
+
/**
|
|
248
|
+
* Combined result of trace computation + execution metrics merge.
|
|
249
|
+
* Returned by computeTraceSummaryWithMetrics().
|
|
250
|
+
*/
|
|
251
|
+
interface TraceComputeResult {
|
|
252
|
+
readonly trace: TraceSummary;
|
|
245
253
|
readonly tokenUsage?: TokenUsage;
|
|
246
|
-
/** Total cost in USD (optional, from provider) */
|
|
247
254
|
readonly costUsd?: number;
|
|
248
|
-
/** Total execution duration in milliseconds (optional) */
|
|
249
255
|
readonly durationMs?: number;
|
|
250
|
-
/** Per-tool duration arrays in milliseconds (optional) */
|
|
251
|
-
readonly toolDurations?: Readonly<Record<string, readonly number[]>>;
|
|
252
|
-
/** ISO 8601 timestamp when execution started (derived from earliest span) */
|
|
253
256
|
readonly startTime?: string;
|
|
254
|
-
/** ISO 8601 timestamp when execution ended (derived from latest span) */
|
|
255
257
|
readonly endTime?: string;
|
|
256
|
-
/** Number of LLM calls (assistant messages) */
|
|
257
|
-
readonly llmCallCount?: number;
|
|
258
258
|
}
|
|
259
259
|
/**
|
|
260
260
|
* Argument matching mode for tool_trajectory expected items.
|
|
@@ -321,7 +321,7 @@ interface MessageLike {
|
|
|
321
321
|
* - toolDurations: per-tool duration arrays (from durationMs or computed from start/end)
|
|
322
322
|
* - llmCallCount: count of assistant messages
|
|
323
323
|
*/
|
|
324
|
-
declare function computeTraceSummary(messages: readonly MessageLike[]):
|
|
324
|
+
declare function computeTraceSummary(messages: readonly MessageLike[]): TraceComputeResult;
|
|
325
325
|
/**
|
|
326
326
|
* Default tool names considered as exploration/read-only operations.
|
|
327
327
|
* Can be overridden per-evaluation via config.
|
|
@@ -343,7 +343,7 @@ declare function explorationRatio(summary: TraceSummary, explorationTools?: read
|
|
|
343
343
|
* @param summary - Trace summary with optional token usage
|
|
344
344
|
* @returns Average tokens per tool call, or undefined
|
|
345
345
|
*/
|
|
346
|
-
declare function tokensPerTool(summary: TraceSummary): number | undefined;
|
|
346
|
+
declare function tokensPerTool(summary: TraceSummary, tokenUsage?: TokenUsage): number | undefined;
|
|
347
347
|
/**
|
|
348
348
|
* Average tool duration across all tool calls.
|
|
349
349
|
* Returns undefined if toolDurations is not available or empty.
|
|
@@ -365,15 +365,15 @@ interface ExecutionMetrics {
|
|
|
365
365
|
readonly endTime?: string;
|
|
366
366
|
}
|
|
367
367
|
/**
|
|
368
|
-
* Merge execution metrics from provider response into a trace
|
|
369
|
-
* Returns a new
|
|
368
|
+
* Merge execution metrics from provider response into a trace compute result.
|
|
369
|
+
* Returns a new TraceComputeResult with metrics fields populated.
|
|
370
370
|
* Provider-level timing takes precedence over span-derived timing.
|
|
371
371
|
*
|
|
372
|
-
* @param
|
|
372
|
+
* @param computed - Base trace compute result from computeTraceSummary
|
|
373
373
|
* @param metrics - Optional execution metrics from provider
|
|
374
|
-
* @returns
|
|
374
|
+
* @returns TraceComputeResult with merged metrics
|
|
375
375
|
*/
|
|
376
|
-
declare function mergeExecutionMetrics(
|
|
376
|
+
declare function mergeExecutionMetrics(computed: TraceComputeResult, metrics?: ExecutionMetrics): TraceComputeResult;
|
|
377
377
|
|
|
378
378
|
/**
|
|
379
379
|
* JSON primitive values appearing in AgentV payloads.
|
|
@@ -924,6 +924,16 @@ interface EvaluationResult {
|
|
|
924
924
|
readonly answer: string;
|
|
925
925
|
readonly target: string;
|
|
926
926
|
readonly reasoning?: string;
|
|
927
|
+
/** Token usage metrics from provider (optional) */
|
|
928
|
+
readonly tokenUsage?: TokenUsage;
|
|
929
|
+
/** Total cost in USD (optional, from provider) */
|
|
930
|
+
readonly costUsd?: number;
|
|
931
|
+
/** Total execution duration in milliseconds (optional) */
|
|
932
|
+
readonly durationMs?: number;
|
|
933
|
+
/** ISO 8601 timestamp when execution started */
|
|
934
|
+
readonly startTime?: string;
|
|
935
|
+
/** ISO 8601 timestamp when execution ended */
|
|
936
|
+
readonly endTime?: string;
|
|
927
937
|
readonly requests?: {
|
|
928
938
|
readonly agent?: JsonObject;
|
|
929
939
|
readonly lm?: JsonObject;
|
|
@@ -955,6 +965,8 @@ interface EvaluationResult {
|
|
|
955
965
|
readonly aggregation?: TrialAggregation;
|
|
956
966
|
/** Whether the trial loop was terminated early due to cost limit */
|
|
957
967
|
readonly costLimited?: boolean;
|
|
968
|
+
/** Whether the evaluation was skipped due to suite-level budget exhaustion */
|
|
969
|
+
readonly budgetExceeded?: boolean;
|
|
958
970
|
}
|
|
959
971
|
type EvaluationVerdict = 'pass' | 'fail' | 'borderline';
|
|
960
972
|
interface EvaluatorResult {
|
|
@@ -971,6 +983,8 @@ interface EvaluatorResult {
|
|
|
971
983
|
readonly scores?: readonly EvaluatorResult[];
|
|
972
984
|
/** Optional structured details from code judges (e.g., TP/TN/FP/FN counts). */
|
|
973
985
|
readonly details?: JsonObject;
|
|
986
|
+
/** Token usage from LLM calls made by this evaluator (optional). */
|
|
987
|
+
readonly tokenUsage?: TokenUsage;
|
|
974
988
|
}
|
|
975
989
|
/**
|
|
976
990
|
* Convenience accessor matching the Python hit_count property.
|
|
@@ -1116,6 +1130,8 @@ type EvalSuiteResult = {
|
|
|
1116
1130
|
readonly cacheConfig?: CacheConfig;
|
|
1117
1131
|
/** Suite-level metadata (name, description, version, etc.) */
|
|
1118
1132
|
readonly metadata?: EvalMetadata;
|
|
1133
|
+
/** Suite-level total cost budget in USD */
|
|
1134
|
+
readonly totalBudgetUsd?: number;
|
|
1119
1135
|
};
|
|
1120
1136
|
/**
|
|
1121
1137
|
* Load tests and suite metadata from a single parse.
|
|
@@ -1637,6 +1653,16 @@ interface EvaluationContext {
|
|
|
1637
1653
|
readonly output?: readonly Message[];
|
|
1638
1654
|
/** Lightweight summary of trace events (if available) */
|
|
1639
1655
|
readonly trace?: TraceSummary;
|
|
1656
|
+
/** Token usage from provider execution (promoted from TraceSummary) */
|
|
1657
|
+
readonly tokenUsage?: TokenUsage;
|
|
1658
|
+
/** Total cost in USD (from provider) */
|
|
1659
|
+
readonly costUsd?: number;
|
|
1660
|
+
/** Execution duration in milliseconds */
|
|
1661
|
+
readonly durationMs?: number;
|
|
1662
|
+
/** ISO 8601 timestamp when execution started */
|
|
1663
|
+
readonly startTime?: string;
|
|
1664
|
+
/** ISO 8601 timestamp when execution ended */
|
|
1665
|
+
readonly endTime?: string;
|
|
1640
1666
|
/** Resolver for target override in code judges */
|
|
1641
1667
|
readonly targetResolver?: TargetResolver;
|
|
1642
1668
|
/** List of available target names for code judges */
|
|
@@ -1657,6 +1683,8 @@ interface EvaluationScore {
|
|
|
1657
1683
|
readonly scores?: readonly ChildEvaluatorResult[];
|
|
1658
1684
|
/** Optional structured details from code judges (e.g., TP/TN/FP/FN counts, alignments). */
|
|
1659
1685
|
readonly details?: JsonObject;
|
|
1686
|
+
/** Token usage from LLM calls made by this evaluator (optional). */
|
|
1687
|
+
readonly tokenUsage?: TokenUsage;
|
|
1660
1688
|
}
|
|
1661
1689
|
interface ChildEvaluatorResult {
|
|
1662
1690
|
readonly name: string;
|
|
@@ -1671,6 +1699,8 @@ interface ChildEvaluatorResult {
|
|
|
1671
1699
|
readonly scores?: readonly ChildEvaluatorResult[];
|
|
1672
1700
|
/** Optional structured details from code judges (e.g., TP/TN/FP/FN counts, alignments). */
|
|
1673
1701
|
readonly details?: JsonObject;
|
|
1702
|
+
/** Token usage from LLM calls made by this evaluator (optional). */
|
|
1703
|
+
readonly tokenUsage?: TokenUsage;
|
|
1674
1704
|
}
|
|
1675
1705
|
interface Evaluator {
|
|
1676
1706
|
readonly kind: string;
|
|
@@ -1740,7 +1770,7 @@ interface CostEvaluatorOptions {
|
|
|
1740
1770
|
}
|
|
1741
1771
|
/**
|
|
1742
1772
|
* Evaluator that checks execution cost against a budget.
|
|
1743
|
-
* Uses
|
|
1773
|
+
* Uses costUsd from the evaluation context.
|
|
1744
1774
|
*/
|
|
1745
1775
|
declare class CostEvaluator implements Evaluator {
|
|
1746
1776
|
readonly kind = "cost";
|
|
@@ -1812,7 +1842,7 @@ interface LatencyEvaluatorOptions {
|
|
|
1812
1842
|
}
|
|
1813
1843
|
/**
|
|
1814
1844
|
* Evaluator that checks execution duration against a threshold.
|
|
1815
|
-
* Uses
|
|
1845
|
+
* Uses durationMs from the evaluation context.
|
|
1816
1846
|
*/
|
|
1817
1847
|
declare class LatencyEvaluator implements Evaluator {
|
|
1818
1848
|
readonly kind = "latency";
|
|
@@ -1987,7 +2017,7 @@ interface TokenUsageEvaluatorOptions {
|
|
|
1987
2017
|
}
|
|
1988
2018
|
/**
|
|
1989
2019
|
* Evaluator that checks provider-reported token usage against configured limits.
|
|
1990
|
-
* Uses
|
|
2020
|
+
* Uses tokenUsage from the evaluation context.
|
|
1991
2021
|
*/
|
|
1992
2022
|
declare class TokenUsageEvaluator implements Evaluator {
|
|
1993
2023
|
readonly kind = "token_usage";
|
|
@@ -2196,6 +2226,8 @@ interface RunEvaluationOptions {
|
|
|
2196
2226
|
readonly trials?: TrialsConfig;
|
|
2197
2227
|
/** Real-time observability callbacks passed to the provider */
|
|
2198
2228
|
readonly streamCallbacks?: ProviderStreamCallbacks;
|
|
2229
|
+
/** Suite-level total cost budget in USD (stops dispatching when exceeded) */
|
|
2230
|
+
readonly totalBudgetUsd?: number;
|
|
2199
2231
|
}
|
|
2200
2232
|
declare function runEvaluation(options: RunEvaluationOptions): Promise<readonly EvaluationResult[]>;
|
|
2201
2233
|
declare function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationResult>;
|
|
@@ -2886,4 +2918,4 @@ type AgentKernel = {
|
|
|
2886
2918
|
};
|
|
2887
2919
|
declare function createAgentKernel(): AgentKernel;
|
|
2888
2920
|
|
|
2889
|
-
export { AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, getWorkspacePath, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIsJsonAssertion, runRegexAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
|
|
2921
|
+
export { AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, getWorkspacePath, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIsJsonAssertion, runRegexAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
|