@agentv/core 4.19.0-next.1 → 4.20.0-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -2647,6 +2647,8 @@ interface EvaluationContext {
2647
2647
  readonly availableTargets?: readonly string[];
2648
2648
  /** Unified diff of file changes from workspace */
2649
2649
  readonly fileChanges?: string;
2650
+ /** Formatted summary of tool calls from agent execution */
2651
+ readonly toolCalls?: string;
2650
2652
  /** Absolute path to the workspace directory */
2651
2653
  readonly workspacePath?: string;
2652
2654
  /** Docker workspace config: when present, code-grader commands run inside a container */
@@ -3024,6 +3026,22 @@ declare function buildScoreRangeOutputSchema(): string;
3024
3026
  */
3025
3027
  declare function extractImageBlocks(messages: readonly Message[]): ContentImage[];
3026
3028
 
3029
+ /**
3030
+ * Formats tool calls from agent output messages into a human-readable summary.
3031
+ *
3032
+ * Used by `{{ tool_calls }}` template variable in LLM grader prompts.
3033
+ * Extracts key input fields per tool to keep the summary compact:
3034
+ * - Skill: `skill` arg
3035
+ * - Read/Write/Edit: `file_path`
3036
+ * - Bash: `command`
3037
+ * - Grep/Glob: `pattern`
3038
+ * - Other tools: first string-valued input field (if any)
3039
+ *
3040
+ * Returns empty string when there are no tool calls (template variable resolves to '').
3041
+ */
3042
+
3043
+ declare function formatToolCalls(output: readonly Message[] | undefined): string;
3044
+
3027
3045
  /**
3028
3046
  * Built-in skill-trigger evaluator.
3029
3047
  *
@@ -3065,6 +3083,7 @@ declare function assembleLlmGraderPrompt(input: {
3065
3083
  evaluatorConfig?: LlmGraderConfig;
3066
3084
  output?: readonly Message[];
3067
3085
  fileChanges?: string;
3086
+ toolCalls?: string;
3068
3087
  graderTemplateOverride?: string;
3069
3088
  }): LlmGraderPromptAssembly;
3070
3089
 
@@ -3226,6 +3245,37 @@ declare class DeterministicAssertionGrader implements Grader {
3226
3245
  evaluate(context: EvaluationContext): EvaluationScore;
3227
3246
  }
3228
3247
 
3248
+ /**
3249
+ * Tracks cumulative cost across all eval files in a single CLI run.
3250
+ *
3251
+ * The per-suite budget (`execution.budget_usd` in YAML) is enforced by the orchestrator
3252
+ * and caps spend within one eval file. This tracker provides a **run-level** cap that
3253
+ * spans all files in a single `agentv run` invocation.
3254
+ *
3255
+ * Usage:
3256
+ * 1. Instantiate with the cap from `--budget-usd`.
3257
+ * 2. Share the tracker with each orchestrator running in the invocation.
3258
+ * 3. After each completed case, call `add()` with that case's total cost.
3259
+ * 4. Before dispatching the next case or file, check `isExceeded()`.
3260
+ *
3261
+ * Thread-safety note: AgentV mutates this tracker from async orchestration code, but all
3262
+ * updates occur on the JavaScript event loop. There is no shared-memory mutation across
3263
+ * threads, so simple cumulative accounting is sufficient here.
3264
+ */
3265
+ declare class RunBudgetTracker {
3266
+ private readonly capUsd;
3267
+ private cumulative;
3268
+ constructor(capUsd: number);
3269
+ /** Accumulate cost from a completed test or file. */
3270
+ add(costUsd: number): void;
3271
+ /** True when cumulative cost meets or exceeds the cap. */
3272
+ isExceeded(): boolean;
3273
+ /** Current accumulated cost. */
3274
+ get currentCostUsd(): number;
3275
+ /** The configured cap. */
3276
+ get budgetCapUsd(): number;
3277
+ }
3278
+
3229
3279
  /**
3230
3280
  * Validation error for a local repo source path that doesn't exist or is unresolved.
3231
3281
  */
@@ -3356,6 +3406,8 @@ interface RunEvaluationOptions {
3356
3406
  readonly streamCallbacks?: ProviderStreamCallbacks;
3357
3407
  /** Suite-level total cost budget in USD (stops dispatching when exceeded) */
3358
3408
  readonly budgetUsd?: number;
3409
+ /** Run-level total cost tracker shared across multiple eval files/targets in one CLI invocation */
3410
+ readonly runBudgetTracker?: RunBudgetTracker;
3359
3411
  /** Execution error tolerance: true halts on first error */
3360
3412
  readonly failOnError?: FailOnError;
3361
3413
  /** Workspace pooling: true (default) enables pool, false disables, undefined defaults to true */
@@ -4596,4 +4648,4 @@ type AgentKernel = {
4596
4648
  };
4597
4649
  declare function createAgentKernel(): AgentKernel;
4598
4650
 
4599
- export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type BenchmarkEntry, type BenchmarkRegistry, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildGraderResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeGrader, type CodeGraderConfig, type CodeGraderOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeGrader, type CompositeGraderConfig, type CompositeGraderOptions, type ConfidenceIntervalAggregation, type ContainsAllGraderConfig, type ContainsAnyGraderConfig, type ContainsGraderConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type ConversationAggregation, type ConversationMode, type ConversationTurn, type ConversationTurnInput, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostGrader, type CostGraderConfig, type CostGraderOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_GRADER_TEMPLATE, DEFAULT_THRESHOLD, type DependencyFailurePolicy, type DependencyResult, type DepsScanResult, DeterministicAssertionGrader, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithGraderConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsGraderConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type 
EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTargetRef, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsGrader, type ExecutionMetricsGraderConfig, type ExecutionMetricsGraderOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyGrader, type FieldAccuracyGraderConfig, type FieldAccuracyGraderOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type Grader, type GraderConfig, type GraderDispatchContext, type GraderFactory, type GraderFactoryFn, type GraderKind, GraderRegistry, type GraderResult, type IcontainsAllGraderConfig, type IcontainsAnyGraderConfig, type IcontainsGraderConfig, type InlineAssertEvaluatorConfig, type IsJsonGraderConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyGrader, type LatencyGraderConfig, type LatencyGraderOptions, LlmGrader, type LlmGraderConfig, type LlmGraderOptions, type LlmGraderPromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type PreparedResultsRepoBranch, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexGraderConfig, type 
RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerGrader, type SkillTriggerGraderConfig, type StartsWithGraderConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TargetHooksConfig, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageGrader, type TokenUsageGraderConfig, type TokenUsageGraderOptions, type ToolCall, type ToolTestMessage, type ToolTrajectoryExpectedItem, ToolTrajectoryGrader, type ToolTrajectoryGraderConfig, type ToolTrajectoryGraderOptions, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptReplayEntry, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type TsEvalResult, type TurnFailurePolicy, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addBenchmark, assembleLlmGraderPrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, 
createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveBenchmarkId, deriveCategory, detectFormat, directorySizeBytes, discoverAssertions, discoverBenchmarks, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetRefsFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvConfigDir, getAgentvHome, getBenchmark, getBenchmarksRegistryPath, getOutputFilenames, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, groupTranscriptJsonLines, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isGraderKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadBenchmarkRegistry, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, loadTsEvalFile, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseJsonFromText, parseJsonSafe, prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeBenchmark, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, resolveTargetDefinition, resolveWorkspaceTemplate, 
rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveBenchmarkRegistry, scanRepoDeps, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLines, tokensPerTool, touchBenchmark, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
4651
+ export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type BenchmarkEntry, type BenchmarkRegistry, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildGraderResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeGrader, type CodeGraderConfig, type CodeGraderOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeGrader, type CompositeGraderConfig, type CompositeGraderOptions, type ConfidenceIntervalAggregation, type ContainsAllGraderConfig, type ContainsAnyGraderConfig, type ContainsGraderConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type ConversationAggregation, type ConversationMode, type ConversationTurn, type ConversationTurnInput, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostGrader, type CostGraderConfig, type CostGraderOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_GRADER_TEMPLATE, DEFAULT_THRESHOLD, type DependencyFailurePolicy, type DependencyResult, type DepsScanResult, DeterministicAssertionGrader, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithGraderConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsGraderConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type 
EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTargetRef, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsGrader, type ExecutionMetricsGraderConfig, type ExecutionMetricsGraderOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyGrader, type FieldAccuracyGraderConfig, type FieldAccuracyGraderOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type Grader, type GraderConfig, type GraderDispatchContext, type GraderFactory, type GraderFactoryFn, type GraderKind, GraderRegistry, type GraderResult, type IcontainsAllGraderConfig, type IcontainsAnyGraderConfig, type IcontainsGraderConfig, type InlineAssertEvaluatorConfig, type IsJsonGraderConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyGrader, type LatencyGraderConfig, type LatencyGraderOptions, LlmGrader, type LlmGraderConfig, type LlmGraderOptions, type LlmGraderPromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type PreparedResultsRepoBranch, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexGraderConfig, type 
RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, RunBudgetTracker, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerGrader, type SkillTriggerGraderConfig, type StartsWithGraderConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TargetHooksConfig, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageGrader, type TokenUsageGraderConfig, type TokenUsageGraderOptions, type ToolCall, type ToolTestMessage, type ToolTrajectoryExpectedItem, ToolTrajectoryGrader, type ToolTrajectoryGraderConfig, type ToolTrajectoryGraderOptions, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptReplayEntry, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type TsEvalResult, type TurnFailurePolicy, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addBenchmark, assembleLlmGraderPrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, 
createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveBenchmarkId, deriveCategory, detectFormat, directorySizeBytes, discoverAssertions, discoverBenchmarks, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetRefsFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, formatToolCalls, freeformEvaluationSchema, generateRubrics, getAgentvConfigDir, getAgentvHome, getBenchmark, getBenchmarksRegistryPath, getOutputFilenames, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, groupTranscriptJsonLines, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isGraderKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadBenchmarkRegistry, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, loadTsEvalFile, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseJsonFromText, parseJsonSafe, prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeBenchmark, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, 
resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveBenchmarkRegistry, scanRepoDeps, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLines, tokensPerTool, touchBenchmark, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
package/dist/index.d.ts CHANGED
@@ -2647,6 +2647,8 @@ interface EvaluationContext {
2647
2647
  readonly availableTargets?: readonly string[];
2648
2648
  /** Unified diff of file changes from workspace */
2649
2649
  readonly fileChanges?: string;
2650
+ /** Formatted summary of tool calls from agent execution */
2651
+ readonly toolCalls?: string;
2650
2652
  /** Absolute path to the workspace directory */
2651
2653
  readonly workspacePath?: string;
2652
2654
  /** Docker workspace config: when present, code-grader commands run inside a container */
@@ -3024,6 +3026,22 @@ declare function buildScoreRangeOutputSchema(): string;
3024
3026
  */
3025
3027
  declare function extractImageBlocks(messages: readonly Message[]): ContentImage[];
3026
3028
 
3029
+ /**
3030
+ * Formats tool calls from agent output messages into a human-readable summary.
3031
+ *
3032
+ * Used by `{{ tool_calls }}` template variable in LLM grader prompts.
3033
+ * Extracts key input fields per tool to keep the summary compact:
3034
+ * - Skill: `skill` arg
3035
+ * - Read/Write/Edit: `file_path`
3036
+ * - Bash: `command`
3037
+ * - Grep/Glob: `pattern`
3038
+ * - Other tools: first string-valued input field (if any)
3039
+ *
3040
+ * Returns empty string when there are no tool calls (template variable resolves to '').
3041
+ */
3042
+
3043
+ declare function formatToolCalls(output: readonly Message[] | undefined): string;
3044
+
3027
3045
  /**
3028
3046
  * Built-in skill-trigger evaluator.
3029
3047
  *
@@ -3065,6 +3083,7 @@ declare function assembleLlmGraderPrompt(input: {
3065
3083
  evaluatorConfig?: LlmGraderConfig;
3066
3084
  output?: readonly Message[];
3067
3085
  fileChanges?: string;
3086
+ toolCalls?: string;
3068
3087
  graderTemplateOverride?: string;
3069
3088
  }): LlmGraderPromptAssembly;
3070
3089
 
@@ -3226,6 +3245,37 @@ declare class DeterministicAssertionGrader implements Grader {
3226
3245
  evaluate(context: EvaluationContext): EvaluationScore;
3227
3246
  }
3228
3247
 
3248
+ /**
3249
+ * Tracks cumulative cost across all eval files in a single CLI run.
3250
+ *
3251
+ * The per-suite budget (`execution.budget_usd` in YAML) is enforced by the orchestrator
3252
+ * and caps spend within one eval file. This tracker provides a **run-level** cap that
3253
+ * spans all files in a single `agentv run` invocation.
3254
+ *
3255
+ * Usage:
3256
+ * 1. Instantiate with the cap from `--budget-usd`.
3257
+ * 2. Share the tracker with each orchestrator running in the invocation.
3258
+ * 3. After each completed case, call `add()` with that case's total cost.
3259
+ * 4. Before dispatching the next case or file, check `isExceeded()`.
3260
+ *
3261
+ * Thread-safety note: AgentV mutates this tracker from async orchestration code, but all
3262
+ * updates occur on the JavaScript event loop. There is no shared-memory mutation across
3263
+ * threads, so simple cumulative accounting is sufficient here.
3264
+ */
3265
+ declare class RunBudgetTracker {
3266
+ private readonly capUsd;
3267
+ private cumulative;
3268
+ constructor(capUsd: number);
3269
+ /** Accumulate cost from a completed test or file. */
3270
+ add(costUsd: number): void;
3271
+ /** True when cumulative cost meets or exceeds the cap. */
3272
+ isExceeded(): boolean;
3273
+ /** Current accumulated cost. */
3274
+ get currentCostUsd(): number;
3275
+ /** The configured cap. */
3276
+ get budgetCapUsd(): number;
3277
+ }
3278
+
3229
3279
  /**
3230
3280
  * Validation error for a local repo source path that doesn't exist or is unresolved.
3231
3281
  */
@@ -3356,6 +3406,8 @@ interface RunEvaluationOptions {
3356
3406
  readonly streamCallbacks?: ProviderStreamCallbacks;
3357
3407
  /** Suite-level total cost budget in USD (stops dispatching when exceeded) */
3358
3408
  readonly budgetUsd?: number;
3409
+ /** Run-level total cost tracker shared across multiple eval files/targets in one CLI invocation */
3410
+ readonly runBudgetTracker?: RunBudgetTracker;
3359
3411
  /** Execution error tolerance: true halts on first error */
3360
3412
  readonly failOnError?: FailOnError;
3361
3413
  /** Workspace pooling: true (default) enables pool, false disables, undefined defaults to true */
@@ -4596,4 +4648,4 @@ type AgentKernel = {
4596
4648
  };
4597
4649
  declare function createAgentKernel(): AgentKernel;
4598
4650
 
4599
- export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type BenchmarkEntry, type BenchmarkRegistry, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildGraderResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeGrader, type CodeGraderConfig, type CodeGraderOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeGrader, type CompositeGraderConfig, type CompositeGraderOptions, type ConfidenceIntervalAggregation, type ContainsAllGraderConfig, type ContainsAnyGraderConfig, type ContainsGraderConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type ConversationAggregation, type ConversationMode, type ConversationTurn, type ConversationTurnInput, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostGrader, type CostGraderConfig, type CostGraderOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_GRADER_TEMPLATE, DEFAULT_THRESHOLD, type DependencyFailurePolicy, type DependencyResult, type DepsScanResult, DeterministicAssertionGrader, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithGraderConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsGraderConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type 
EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTargetRef, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsGrader, type ExecutionMetricsGraderConfig, type ExecutionMetricsGraderOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyGrader, type FieldAccuracyGraderConfig, type FieldAccuracyGraderOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type Grader, type GraderConfig, type GraderDispatchContext, type GraderFactory, type GraderFactoryFn, type GraderKind, GraderRegistry, type GraderResult, type IcontainsAllGraderConfig, type IcontainsAnyGraderConfig, type IcontainsGraderConfig, type InlineAssertEvaluatorConfig, type IsJsonGraderConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyGrader, type LatencyGraderConfig, type LatencyGraderOptions, LlmGrader, type LlmGraderConfig, type LlmGraderOptions, type LlmGraderPromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type PreparedResultsRepoBranch, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexGraderConfig, type 
RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerGrader, type SkillTriggerGraderConfig, type StartsWithGraderConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TargetHooksConfig, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageGrader, type TokenUsageGraderConfig, type TokenUsageGraderOptions, type ToolCall, type ToolTestMessage, type ToolTrajectoryExpectedItem, ToolTrajectoryGrader, type ToolTrajectoryGraderConfig, type ToolTrajectoryGraderOptions, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptReplayEntry, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type TsEvalResult, type TurnFailurePolicy, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addBenchmark, assembleLlmGraderPrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, 
createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveBenchmarkId, deriveCategory, detectFormat, directorySizeBytes, discoverAssertions, discoverBenchmarks, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetRefsFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvConfigDir, getAgentvHome, getBenchmark, getBenchmarksRegistryPath, getOutputFilenames, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, groupTranscriptJsonLines, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isGraderKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadBenchmarkRegistry, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, loadTsEvalFile, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseJsonFromText, parseJsonSafe, prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeBenchmark, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, resolveTargetDefinition, resolveWorkspaceTemplate, 
rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveBenchmarkRegistry, scanRepoDeps, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLines, tokensPerTool, touchBenchmark, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
4651
+ export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type BenchmarkEntry, type BenchmarkRegistry, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildGraderResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeGrader, type CodeGraderConfig, type CodeGraderOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeGrader, type CompositeGraderConfig, type CompositeGraderOptions, type ConfidenceIntervalAggregation, type ContainsAllGraderConfig, type ContainsAnyGraderConfig, type ContainsGraderConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type ConversationAggregation, type ConversationMode, type ConversationTurn, type ConversationTurnInput, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostGrader, type CostGraderConfig, type CostGraderOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_GRADER_TEMPLATE, DEFAULT_THRESHOLD, type DependencyFailurePolicy, type DependencyResult, type DepsScanResult, DeterministicAssertionGrader, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithGraderConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsGraderConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type 
EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTargetRef, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsGrader, type ExecutionMetricsGraderConfig, type ExecutionMetricsGraderOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyGrader, type FieldAccuracyGraderConfig, type FieldAccuracyGraderOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type Grader, type GraderConfig, type GraderDispatchContext, type GraderFactory, type GraderFactoryFn, type GraderKind, GraderRegistry, type GraderResult, type IcontainsAllGraderConfig, type IcontainsAnyGraderConfig, type IcontainsGraderConfig, type InlineAssertEvaluatorConfig, type IsJsonGraderConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyGrader, type LatencyGraderConfig, type LatencyGraderOptions, LlmGrader, type LlmGraderConfig, type LlmGraderOptions, type LlmGraderPromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type PreparedResultsRepoBranch, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexGraderConfig, type 
RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, RunBudgetTracker, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerGrader, type SkillTriggerGraderConfig, type StartsWithGraderConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TargetHooksConfig, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageGrader, type TokenUsageGraderConfig, type TokenUsageGraderOptions, type ToolCall, type ToolTestMessage, type ToolTrajectoryExpectedItem, ToolTrajectoryGrader, type ToolTrajectoryGraderConfig, type ToolTrajectoryGraderOptions, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptReplayEntry, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type TsEvalResult, type TurnFailurePolicy, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addBenchmark, assembleLlmGraderPrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, 
createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveBenchmarkId, deriveCategory, detectFormat, directorySizeBytes, discoverAssertions, discoverBenchmarks, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetRefsFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, formatToolCalls, freeformEvaluationSchema, generateRubrics, getAgentvConfigDir, getAgentvHome, getBenchmark, getBenchmarksRegistryPath, getOutputFilenames, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, groupTranscriptJsonLines, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isGraderKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadBenchmarkRegistry, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, loadTsEvalFile, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseJsonFromText, parseJsonSafe, prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeBenchmark, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, 
resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveBenchmarkRegistry, scanRepoDeps, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLines, tokensPerTool, touchBenchmark, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
package/dist/index.js CHANGED
@@ -69,6 +69,7 @@ import {
69
69
  extractThreshold,
70
70
  extractTrialsConfig,
71
71
  extractWorkersFromSuite,
72
+ formatToolCalls,
72
73
  freeformEvaluationSchema,
73
74
  getAgentvConfigDir,
74
75
  getAgentvHome,
@@ -127,7 +128,7 @@ import {
127
128
  toCamelCaseDeep,
128
129
  toSnakeCaseDeep,
129
130
  tokensPerTool
130
- } from "./chunk-QXX3IBYV.js";
131
+ } from "./chunk-ELF6SQAK.js";
131
132
  import {
132
133
  COMMON_TARGET_SETTINGS,
133
134
  TEST_MESSAGE_ROLES,
@@ -1679,6 +1680,30 @@ function toHrTime(iso) {
1679
1680
  return new Date(iso).getTime();
1680
1681
  }
1681
1682
 
1683
+ // src/evaluation/run-budget-tracker.ts
1684
/**
 * Tracks cumulative spend (in USD) against a fixed cap.
 *
 * Costs are reported through `add()`; `isExceeded()` reports whether the
 * running total has reached the cap supplied at construction time.
 */
var RunBudgetTracker = class {
  /** Running total of all costs accepted by `add()`. */
  cumulative = 0;
  /** The cap passed to the constructor. */
  capUsd;
  /**
   * @param capUsd Budget cap that `isExceeded()` compares the total against.
   */
  constructor(capUsd) {
    this.capUsd = capUsd;
  }
  /**
   * Accumulate cost from a completed test or file.
   *
   * Values that are not numbers, or are NaN, are ignored: adding them would
   * turn the running total into NaN, after which `isExceeded()` could never
   * return true and the cap would silently stop being enforced.
   *
   * @param costUsd Cost to add to the running total.
   */
  add(costUsd) {
    if (typeof costUsd !== "number" || Number.isNaN(costUsd)) {
      return;
    }
    this.cumulative += costUsd;
  }
  /** True when cumulative cost meets or exceeds the cap. */
  isExceeded() {
    return this.cumulative >= this.capUsd;
  }
  /** Current accumulated cost. */
  get currentCostUsd() {
    return this.cumulative;
  }
  /** The configured cap. */
  get budgetCapUsd() {
    return this.capUsd;
  }
};
1706
+
1682
1707
  // src/import/claude-parser.ts
1683
1708
  var SKIPPED_TYPES = /* @__PURE__ */ new Set(["progress", "system", "file-history-snapshot"]);
1684
1709
  function parseClaudeSession(jsonl) {
@@ -2328,6 +2353,7 @@ export {
2328
2353
  ProviderRegistry,
2329
2354
  RepoManager,
2330
2355
  ResponseCache,
2356
+ RunBudgetTracker,
2331
2357
  SkillTriggerGrader,
2332
2358
  TEST_MESSAGE_ROLES,
2333
2359
  TemplateNotDirectoryError,
@@ -2399,6 +2425,7 @@ export {
2399
2425
  extractWorkersFromSuite,
2400
2426
  fileExists,
2401
2427
  findGitRoot,
2428
+ formatToolCalls,
2402
2429
  freeformEvaluationSchema,
2403
2430
  generateRubrics,
2404
2431
  getAgentvConfigDir,