@agentv/core 4.14.0 → 4.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-A3HYVKTI.js → chunk-AOOU6PLC.js} +70 -2
- package/dist/chunk-AOOU6PLC.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +89 -11
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +90 -12
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +774 -189
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +68 -14
- package/dist/index.d.ts +68 -14
- package/dist/index.js +705 -189
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-A3HYVKTI.js.map +0 -1
package/dist/index.d.cts
CHANGED
|
@@ -256,6 +256,8 @@ interface TargetDefinition {
|
|
|
256
256
|
readonly log_directory?: string | unknown | undefined;
|
|
257
257
|
readonly log_format?: string | unknown | undefined;
|
|
258
258
|
readonly log_output_format?: string | unknown | undefined;
|
|
259
|
+
/** New stream_log field — replaces log_format. false=no stream log, 'raw'=per-event, 'summary'=consolidated. */
|
|
260
|
+
readonly stream_log?: string | boolean | unknown | undefined;
|
|
259
261
|
readonly system_prompt?: string | unknown | undefined;
|
|
260
262
|
readonly max_turns?: number | unknown | undefined;
|
|
261
263
|
readonly max_budget_usd?: number | unknown | undefined;
|
|
@@ -1146,6 +1148,37 @@ type InlineAssertEvaluatorConfig = {
|
|
|
1146
1148
|
readonly negate?: boolean;
|
|
1147
1149
|
};
|
|
1148
1150
|
type EvaluatorConfig = CodeEvaluatorConfig | LlmGraderEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig | FieldAccuracyEvaluatorConfig | LatencyEvaluatorConfig | CostEvaluatorConfig | TokenUsageEvaluatorConfig | ExecutionMetricsEvaluatorConfig | SkillTriggerEvaluatorConfig | ContainsEvaluatorConfig | ContainsAnyEvaluatorConfig | ContainsAllEvaluatorConfig | IcontainsEvaluatorConfig | IcontainsAnyEvaluatorConfig | IcontainsAllEvaluatorConfig | StartsWithEvaluatorConfig | EndsWithEvaluatorConfig | RegexEvaluatorConfig | IsJsonEvaluatorConfig | EqualsEvaluatorConfig | RubricsEvaluatorConfig | InlineAssertEvaluatorConfig;
|
|
1151
|
+
/**
|
|
1152
|
+
* A single turn in a multi-turn conversation evaluation.
|
|
1153
|
+
* Each turn is a user message. The runner generates the assistant response.
|
|
1154
|
+
*/
|
|
1155
|
+
interface ConversationTurn {
|
|
1156
|
+
/** User message for this turn */
|
|
1157
|
+
readonly input: TestMessageContent;
|
|
1158
|
+
/** Reference assistant response for grading (NOT carried forward — actual LLM response is used) */
|
|
1159
|
+
readonly expected_output?: TestMessageContent;
|
|
1160
|
+
/** Per-turn assertions. Strings become rubric criteria via shorthand. */
|
|
1161
|
+
readonly assertions?: readonly (string | EvaluatorConfig)[];
|
|
1162
|
+
}
|
|
1163
|
+
/**
|
|
1164
|
+
* Conversation evaluation mode.
|
|
1165
|
+
* - undefined: standard single-response evaluation (default, backward-compatible)
|
|
1166
|
+
* - 'conversation': multi-turn evaluation where the LLM generates each assistant turn
|
|
1167
|
+
*/
|
|
1168
|
+
type ConversationMode = 'conversation';
|
|
1169
|
+
/**
|
|
1170
|
+
* Score aggregation strategy for multi-turn conversation evaluation.
|
|
1171
|
+
* - 'mean': average of all turn scores (default)
|
|
1172
|
+
* - 'min': weakest-link scoring — final score = lowest turn score
|
|
1173
|
+
* - 'max': best turn score
|
|
1174
|
+
*/
|
|
1175
|
+
type ConversationAggregation = 'mean' | 'min' | 'max';
|
|
1176
|
+
/**
|
|
1177
|
+
* Behavior when a turn's assertions fail.
|
|
1178
|
+
* - 'continue': run all remaining turns regardless (default)
|
|
1179
|
+
* - 'stop': skip remaining turns, score them as 0
|
|
1180
|
+
*/
|
|
1181
|
+
type TurnFailurePolicy = 'continue' | 'stop';
|
|
1149
1182
|
/**
|
|
1150
1183
|
* Eval test definition sourced from AgentV specs.
|
|
1151
1184
|
*/
|
|
@@ -1172,6 +1205,16 @@ interface EvalTest {
|
|
|
1172
1205
|
readonly targets?: readonly string[];
|
|
1173
1206
|
/** Per-test score threshold override (0-1). Resolution: CLI > test > suite > DEFAULT_THRESHOLD. */
|
|
1174
1207
|
readonly threshold?: number;
|
|
1208
|
+
/** Conversation evaluation mode. When 'conversation', turns[] drives turn-by-turn LLM evaluation. */
|
|
1209
|
+
readonly mode?: ConversationMode;
|
|
1210
|
+
/** Ordered turns for conversation evaluation. Each turn generates a fresh LLM call. */
|
|
1211
|
+
readonly turns?: readonly ConversationTurn[];
|
|
1212
|
+
/** Score aggregation for conversation turns: mean (default), min (weakest-link), max */
|
|
1213
|
+
readonly aggregation?: ConversationAggregation;
|
|
1214
|
+
/** Behavior on turn assertion failure: continue (default) or stop */
|
|
1215
|
+
readonly on_turn_failure?: TurnFailurePolicy;
|
|
1216
|
+
/** Sliding window size for context passed to per-turn graders. Default: all turns. */
|
|
1217
|
+
readonly window_size?: number;
|
|
1175
1218
|
/** Test IDs this test depends on. Dependent tests wait for all dependencies to complete before running. */
|
|
1176
1219
|
readonly depends_on?: readonly string[];
|
|
1177
1220
|
/** What to do when a dependency fails: skip (default), fail, or run anyway. */
|
|
@@ -1864,6 +1907,8 @@ interface CodexResolvedConfig {
|
|
|
1864
1907
|
readonly timeoutMs?: number;
|
|
1865
1908
|
readonly logDir?: string;
|
|
1866
1909
|
readonly logFormat?: 'summary' | 'json';
|
|
1910
|
+
/** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
|
|
1911
|
+
readonly streamLog?: false | 'raw' | 'summary';
|
|
1867
1912
|
readonly systemPrompt?: string;
|
|
1868
1913
|
}
|
|
1869
1914
|
interface CopilotCliResolvedConfig {
|
|
@@ -1875,6 +1920,8 @@ interface CopilotCliResolvedConfig {
|
|
|
1875
1920
|
readonly timeoutMs?: number;
|
|
1876
1921
|
readonly logDir?: string;
|
|
1877
1922
|
readonly logFormat?: 'summary' | 'json';
|
|
1923
|
+
/** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
|
|
1924
|
+
readonly streamLog?: false | 'raw' | 'summary';
|
|
1878
1925
|
readonly systemPrompt?: string;
|
|
1879
1926
|
}
|
|
1880
1927
|
interface CopilotSdkResolvedConfig {
|
|
@@ -1887,6 +1934,8 @@ interface CopilotSdkResolvedConfig {
|
|
|
1887
1934
|
readonly timeoutMs?: number;
|
|
1888
1935
|
readonly logDir?: string;
|
|
1889
1936
|
readonly logFormat?: 'summary' | 'json';
|
|
1937
|
+
/** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
|
|
1938
|
+
readonly streamLog?: false | 'raw' | 'summary';
|
|
1890
1939
|
readonly systemPrompt?: string;
|
|
1891
1940
|
/** BYOK provider type: "azure", "openai", or "anthropic". */
|
|
1892
1941
|
readonly byokType?: string;
|
|
@@ -1925,6 +1974,8 @@ interface PiCodingAgentResolvedConfig {
|
|
|
1925
1974
|
readonly timeoutMs?: number;
|
|
1926
1975
|
readonly logDir?: string;
|
|
1927
1976
|
readonly logFormat?: 'summary' | 'json';
|
|
1977
|
+
/** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
|
|
1978
|
+
readonly streamLog?: false | 'raw' | 'summary';
|
|
1928
1979
|
readonly systemPrompt?: string;
|
|
1929
1980
|
}
|
|
1930
1981
|
interface PiCliResolvedConfig {
|
|
@@ -1941,6 +1992,8 @@ interface PiCliResolvedConfig {
|
|
|
1941
1992
|
readonly timeoutMs?: number;
|
|
1942
1993
|
readonly logDir?: string;
|
|
1943
1994
|
readonly logFormat?: 'summary' | 'json';
|
|
1995
|
+
/** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
|
|
1996
|
+
readonly streamLog?: false | 'raw' | 'summary';
|
|
1944
1997
|
readonly systemPrompt?: string;
|
|
1945
1998
|
}
|
|
1946
1999
|
interface ClaudeResolvedConfig {
|
|
@@ -1953,6 +2006,8 @@ interface ClaudeResolvedConfig {
|
|
|
1953
2006
|
readonly maxBudgetUsd?: number;
|
|
1954
2007
|
readonly logDir?: string;
|
|
1955
2008
|
readonly logFormat?: 'summary' | 'json';
|
|
2009
|
+
/** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
|
|
2010
|
+
readonly streamLog?: false | 'raw' | 'summary';
|
|
1956
2011
|
}
|
|
1957
2012
|
interface MockResolvedConfig {
|
|
1958
2013
|
readonly response?: string;
|
|
@@ -2658,30 +2713,29 @@ declare function extractImageBlocks(messages: readonly Message[]): ContentImage[
|
|
|
2658
2713
|
/**
|
|
2659
2714
|
* Built-in skill-trigger evaluator.
|
|
2660
2715
|
*
|
|
2661
|
-
* Detects whether the agent invoked a named skill
|
|
2662
|
-
*
|
|
2663
|
-
*
|
|
2716
|
+
* Detects whether the agent invoked a named skill during a session.
|
|
2717
|
+
* Works with canonical tool names produced by normalizeToolCall() — no
|
|
2718
|
+
* provider-specific matching logic needed.
|
|
2664
2719
|
*
|
|
2665
2720
|
* Detection logic:
|
|
2666
|
-
* -
|
|
2667
|
-
* - Skill tool: checks input.
|
|
2668
|
-
* - Read tool: checks input.
|
|
2669
|
-
* -
|
|
2721
|
+
* - Scans ALL tool calls (not just the first) for skill invocation evidence.
|
|
2722
|
+
* - Skill tool: checks `tool === 'Skill'` and `input.skill` contains the skill name.
|
|
2723
|
+
* - Read tool: checks `tool === 'Read'` and `input.file_path` contains a skills/ path.
|
|
2724
|
+
* - Fallback: checks tool output for skill file path references.
|
|
2670
2725
|
* - Supports negative cases via should_trigger: false.
|
|
2671
2726
|
*
|
|
2672
|
-
*
|
|
2673
|
-
*
|
|
2674
|
-
*
|
|
2675
|
-
*
|
|
2727
|
+
* Prerequisites:
|
|
2728
|
+
* All providers and import parsers must call normalizeToolCall() when
|
|
2729
|
+
* constructing ToolCall objects. This ensures canonical tool names
|
|
2730
|
+
* ("Skill", "Read", "Write", "Edit", "Bash") and canonical input field
|
|
2731
|
+
* names (input.skill, input.file_path) regardless of provider.
|
|
2676
2732
|
*/
|
|
2677
2733
|
|
|
2678
2734
|
declare class SkillTriggerEvaluator implements Evaluator {
|
|
2679
2735
|
readonly kind = "skill-trigger";
|
|
2680
2736
|
private readonly config;
|
|
2681
2737
|
constructor(config: SkillTriggerEvaluatorConfig);
|
|
2682
|
-
private resolveMatcher;
|
|
2683
2738
|
evaluate(context: EvaluationContext): EvaluationScore;
|
|
2684
|
-
private readPathFromInput;
|
|
2685
2739
|
}
|
|
2686
2740
|
|
|
2687
2741
|
interface LlmGraderPromptAssembly {
|
|
@@ -4413,4 +4467,4 @@ type AgentKernel = {
|
|
|
4413
4467
|
};
|
|
4414
4468
|
declare function createAgentKernel(): AgentKernel;
|
|
4415
4469
|
|
|
4416
|
-
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type BenchmarkEntry, type BenchmarkRegistry, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildEvaluatorResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_THRESHOLD, type DependencyFailurePolicy, type DependencyResult, type DepsScanResult, DeterministicAssertionEvaluator, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type PreparedResultsRepoBranch, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addBenchmark, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveBenchmarkId, deriveCategory, detectFormat, directorySizeBytes, discoverAssertions, discoverBenchmarks, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getBenchmark, getBenchmarksRegistryPath, getOutputFilenames, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadBenchmarkRegistry, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseJsonFromText, parseJsonSafe, prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeBenchmark, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveBenchmarkRegistry, scanRepoDeps, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLine, tokensPerTool, touchBenchmark, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
|
|
4470
|
+
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type BenchmarkEntry, type BenchmarkRegistry, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildEvaluatorResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type ConversationAggregation, type ConversationMode, type ConversationTurn, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_THRESHOLD, type DependencyFailurePolicy, type DependencyResult, type DepsScanResult, DeterministicAssertionEvaluator, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type PreparedResultsRepoBranch, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type TurnFailurePolicy, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addBenchmark, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveBenchmarkId, deriveCategory, detectFormat, directorySizeBytes, discoverAssertions, discoverBenchmarks, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getBenchmark, getBenchmarksRegistryPath, getOutputFilenames, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadBenchmarkRegistry, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseJsonFromText, parseJsonSafe, prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeBenchmark, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveBenchmarkRegistry, scanRepoDeps, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLine, tokensPerTool, touchBenchmark, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
|
package/dist/index.d.ts
CHANGED
|
@@ -256,6 +256,8 @@ interface TargetDefinition {
|
|
|
256
256
|
readonly log_directory?: string | unknown | undefined;
|
|
257
257
|
readonly log_format?: string | unknown | undefined;
|
|
258
258
|
readonly log_output_format?: string | unknown | undefined;
|
|
259
|
+
/** New stream_log field — replaces log_format. false=no stream log, 'raw'=per-event, 'summary'=consolidated. */
|
|
260
|
+
readonly stream_log?: string | boolean | unknown | undefined;
|
|
259
261
|
readonly system_prompt?: string | unknown | undefined;
|
|
260
262
|
readonly max_turns?: number | unknown | undefined;
|
|
261
263
|
readonly max_budget_usd?: number | unknown | undefined;
|
|
@@ -1146,6 +1148,37 @@ type InlineAssertEvaluatorConfig = {
|
|
|
1146
1148
|
readonly negate?: boolean;
|
|
1147
1149
|
};
|
|
1148
1150
|
type EvaluatorConfig = CodeEvaluatorConfig | LlmGraderEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig | FieldAccuracyEvaluatorConfig | LatencyEvaluatorConfig | CostEvaluatorConfig | TokenUsageEvaluatorConfig | ExecutionMetricsEvaluatorConfig | SkillTriggerEvaluatorConfig | ContainsEvaluatorConfig | ContainsAnyEvaluatorConfig | ContainsAllEvaluatorConfig | IcontainsEvaluatorConfig | IcontainsAnyEvaluatorConfig | IcontainsAllEvaluatorConfig | StartsWithEvaluatorConfig | EndsWithEvaluatorConfig | RegexEvaluatorConfig | IsJsonEvaluatorConfig | EqualsEvaluatorConfig | RubricsEvaluatorConfig | InlineAssertEvaluatorConfig;
|
|
1151
|
+
/**
|
|
1152
|
+
* A single turn in a multi-turn conversation evaluation.
|
|
1153
|
+
* Each turn is a user message. The runner generates the assistant response.
|
|
1154
|
+
*/
|
|
1155
|
+
interface ConversationTurn {
|
|
1156
|
+
/** User message for this turn */
|
|
1157
|
+
readonly input: TestMessageContent;
|
|
1158
|
+
/** Reference assistant response for grading (NOT carried forward — actual LLM response is used) */
|
|
1159
|
+
readonly expected_output?: TestMessageContent;
|
|
1160
|
+
/** Per-turn assertions. Strings become rubric criteria via shorthand. */
|
|
1161
|
+
readonly assertions?: readonly (string | EvaluatorConfig)[];
|
|
1162
|
+
}
|
|
1163
|
+
/**
|
|
1164
|
+
* Conversation evaluation mode.
|
|
1165
|
+
* - undefined: standard single-response evaluation (default, backward-compatible)
|
|
1166
|
+
* - 'conversation': multi-turn evaluation where the LLM generates each assistant turn
|
|
1167
|
+
*/
|
|
1168
|
+
type ConversationMode = 'conversation';
|
|
1169
|
+
/**
|
|
1170
|
+
* Score aggregation strategy for multi-turn conversation evaluation.
|
|
1171
|
+
* - 'mean': average of all turn scores (default)
|
|
1172
|
+
* - 'min': weakest-link scoring — final score = lowest turn score
|
|
1173
|
+
* - 'max': best turn score
|
|
1174
|
+
*/
|
|
1175
|
+
type ConversationAggregation = 'mean' | 'min' | 'max';
|
|
1176
|
+
/**
|
|
1177
|
+
* Behavior when a turn's assertions fail.
|
|
1178
|
+
* - 'continue': run all remaining turns regardless (default)
|
|
1179
|
+
* - 'stop': skip remaining turns, score them as 0
|
|
1180
|
+
*/
|
|
1181
|
+
type TurnFailurePolicy = 'continue' | 'stop';
|
|
1149
1182
|
/**
|
|
1150
1183
|
* Eval test definition sourced from AgentV specs.
|
|
1151
1184
|
*/
|
|
@@ -1172,6 +1205,16 @@ interface EvalTest {
|
|
|
1172
1205
|
readonly targets?: readonly string[];
|
|
1173
1206
|
/** Per-test score threshold override (0-1). Resolution: CLI > test > suite > DEFAULT_THRESHOLD. */
|
|
1174
1207
|
readonly threshold?: number;
|
|
1208
|
+
/** Conversation evaluation mode. When 'conversation', turns[] drives turn-by-turn LLM evaluation. */
|
|
1209
|
+
readonly mode?: ConversationMode;
|
|
1210
|
+
/** Ordered turns for conversation evaluation. Each turn generates a fresh LLM call. */
|
|
1211
|
+
readonly turns?: readonly ConversationTurn[];
|
|
1212
|
+
/** Score aggregation for conversation turns: mean (default), min (weakest-link), max */
|
|
1213
|
+
readonly aggregation?: ConversationAggregation;
|
|
1214
|
+
/** Behavior on turn assertion failure: continue (default) or stop */
|
|
1215
|
+
readonly on_turn_failure?: TurnFailurePolicy;
|
|
1216
|
+
/** Sliding window size for context passed to per-turn graders. Default: all turns. */
|
|
1217
|
+
readonly window_size?: number;
|
|
1175
1218
|
/** Test IDs this test depends on. Dependent tests wait for all dependencies to complete before running. */
|
|
1176
1219
|
readonly depends_on?: readonly string[];
|
|
1177
1220
|
/** What to do when a dependency fails: skip (default), fail, or run anyway. */
|
|
@@ -1864,6 +1907,8 @@ interface CodexResolvedConfig {
|
|
|
1864
1907
|
readonly timeoutMs?: number;
|
|
1865
1908
|
readonly logDir?: string;
|
|
1866
1909
|
readonly logFormat?: 'summary' | 'json';
|
|
1910
|
+
/** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
|
|
1911
|
+
readonly streamLog?: false | 'raw' | 'summary';
|
|
1867
1912
|
readonly systemPrompt?: string;
|
|
1868
1913
|
}
|
|
1869
1914
|
interface CopilotCliResolvedConfig {
|
|
@@ -1875,6 +1920,8 @@ interface CopilotCliResolvedConfig {
|
|
|
1875
1920
|
readonly timeoutMs?: number;
|
|
1876
1921
|
readonly logDir?: string;
|
|
1877
1922
|
readonly logFormat?: 'summary' | 'json';
|
|
1923
|
+
/** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
|
|
1924
|
+
readonly streamLog?: false | 'raw' | 'summary';
|
|
1878
1925
|
readonly systemPrompt?: string;
|
|
1879
1926
|
}
|
|
1880
1927
|
interface CopilotSdkResolvedConfig {
|
|
@@ -1887,6 +1934,8 @@ interface CopilotSdkResolvedConfig {
|
|
|
1887
1934
|
readonly timeoutMs?: number;
|
|
1888
1935
|
readonly logDir?: string;
|
|
1889
1936
|
readonly logFormat?: 'summary' | 'json';
|
|
1937
|
+
/** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
|
|
1938
|
+
readonly streamLog?: false | 'raw' | 'summary';
|
|
1890
1939
|
readonly systemPrompt?: string;
|
|
1891
1940
|
/** BYOK provider type: "azure", "openai", or "anthropic". */
|
|
1892
1941
|
readonly byokType?: string;
|
|
@@ -1925,6 +1974,8 @@ interface PiCodingAgentResolvedConfig {
|
|
|
1925
1974
|
readonly timeoutMs?: number;
|
|
1926
1975
|
readonly logDir?: string;
|
|
1927
1976
|
readonly logFormat?: 'summary' | 'json';
|
|
1977
|
+
/** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
|
|
1978
|
+
readonly streamLog?: false | 'raw' | 'summary';
|
|
1928
1979
|
readonly systemPrompt?: string;
|
|
1929
1980
|
}
|
|
1930
1981
|
interface PiCliResolvedConfig {
|
|
@@ -1941,6 +1992,8 @@ interface PiCliResolvedConfig {
|
|
|
1941
1992
|
readonly timeoutMs?: number;
|
|
1942
1993
|
readonly logDir?: string;
|
|
1943
1994
|
readonly logFormat?: 'summary' | 'json';
|
|
1995
|
+
/** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
|
|
1996
|
+
readonly streamLog?: false | 'raw' | 'summary';
|
|
1944
1997
|
readonly systemPrompt?: string;
|
|
1945
1998
|
}
|
|
1946
1999
|
interface ClaudeResolvedConfig {
|
|
@@ -1953,6 +2006,8 @@ interface ClaudeResolvedConfig {
|
|
|
1953
2006
|
readonly maxBudgetUsd?: number;
|
|
1954
2007
|
readonly logDir?: string;
|
|
1955
2008
|
readonly logFormat?: 'summary' | 'json';
|
|
2009
|
+
/** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
|
|
2010
|
+
readonly streamLog?: false | 'raw' | 'summary';
|
|
1956
2011
|
}
|
|
1957
2012
|
interface MockResolvedConfig {
|
|
1958
2013
|
readonly response?: string;
|
|
@@ -2658,30 +2713,29 @@ declare function extractImageBlocks(messages: readonly Message[]): ContentImage[
|
|
|
2658
2713
|
/**
|
|
2659
2714
|
* Built-in skill-trigger evaluator.
|
|
2660
2715
|
*
|
|
2661
|
-
* Detects whether the agent invoked a named skill
|
|
2662
|
-
*
|
|
2663
|
-
*
|
|
2716
|
+
* Detects whether the agent invoked a named skill during a session.
|
|
2717
|
+
* Works with canonical tool names produced by normalizeToolCall() — no
|
|
2718
|
+
* provider-specific matching logic needed.
|
|
2664
2719
|
*
|
|
2665
2720
|
* Detection logic:
|
|
2666
|
-
* -
|
|
2667
|
-
* - Skill tool: checks input.
|
|
2668
|
-
* - Read tool: checks input.
|
|
2669
|
-
* -
|
|
2721
|
+
* - Scans ALL tool calls (not just the first) for skill invocation evidence.
|
|
2722
|
+
* - Skill tool: checks `tool === 'Skill'` and `input.skill` contains the skill name.
|
|
2723
|
+
* - Read tool: checks `tool === 'Read'` and `input.file_path` contains a skills/ path.
|
|
2724
|
+
* - Fallback: checks tool output for skill file path references.
|
|
2670
2725
|
* - Supports negative cases via should_trigger: false.
|
|
2671
2726
|
*
|
|
2672
|
-
*
|
|
2673
|
-
*
|
|
2674
|
-
*
|
|
2675
|
-
*
|
|
2727
|
+
* Prerequisites:
|
|
2728
|
+
* All providers and import parsers must call normalizeToolCall() when
|
|
2729
|
+
* constructing ToolCall objects. This ensures canonical tool names
|
|
2730
|
+
* ("Skill", "Read", "Write", "Edit", "Bash") and canonical input field
|
|
2731
|
+
* names (input.skill, input.file_path) regardless of provider.
|
|
2676
2732
|
*/
|
|
2677
2733
|
|
|
2678
2734
|
declare class SkillTriggerEvaluator implements Evaluator {
|
|
2679
2735
|
readonly kind = "skill-trigger";
|
|
2680
2736
|
private readonly config;
|
|
2681
2737
|
constructor(config: SkillTriggerEvaluatorConfig);
|
|
2682
|
-
private resolveMatcher;
|
|
2683
2738
|
evaluate(context: EvaluationContext): EvaluationScore;
|
|
2684
|
-
private readPathFromInput;
|
|
2685
2739
|
}
|
|
2686
2740
|
|
|
2687
2741
|
interface LlmGraderPromptAssembly {
|
|
@@ -4413,4 +4467,4 @@ type AgentKernel = {
|
|
|
4413
4467
|
};
|
|
4414
4468
|
declare function createAgentKernel(): AgentKernel;
|
|
4415
4469
|
|
|
4416
|
-
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type BenchmarkEntry, type BenchmarkRegistry, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildEvaluatorResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_THRESHOLD, type DependencyFailurePolicy, type DependencyResult, type DepsScanResult, DeterministicAssertionEvaluator, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type PreparedResultsRepoBranch, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addBenchmark, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveBenchmarkId, deriveCategory, detectFormat, directorySizeBytes, discoverAssertions, discoverBenchmarks, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getBenchmark, getBenchmarksRegistryPath, getOutputFilenames, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadBenchmarkRegistry, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseJsonFromText, parseJsonSafe, prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeBenchmark, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveBenchmarkRegistry, scanRepoDeps, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLine, tokensPerTool, touchBenchmark, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
|
|
4470
|
+
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type BenchmarkEntry, type BenchmarkRegistry, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildEvaluatorResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type ConversationAggregation, type ConversationMode, type ConversationTurn, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_THRESHOLD, type DependencyFailurePolicy, type DependencyResult, type DepsScanResult, DeterministicAssertionEvaluator, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type PreparedResultsRepoBranch, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type TurnFailurePolicy, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addBenchmark, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveBenchmarkId, deriveCategory, detectFormat, directorySizeBytes, discoverAssertions, discoverBenchmarks, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getBenchmark, getBenchmarksRegistryPath, getOutputFilenames, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadBenchmarkRegistry, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseJsonFromText, parseJsonSafe, prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeBenchmark, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveBenchmarkRegistry, scanRepoDeps, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLine, tokensPerTool, touchBenchmark, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
|