@agentv/core 3.7.0 → 3.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-2IZOTQ25.js → chunk-PC5TLJF6.js} +143 -4
- package/dist/chunk-PC5TLJF6.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +228 -72
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +85 -37
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +519 -778
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +11 -23
- package/dist/index.d.ts +11 -23
- package/dist/index.js +450 -841
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-2IZOTQ25.js.map +0 -1
package/dist/index.d.cts
CHANGED
|
@@ -23,8 +23,6 @@ interface ProviderStreamCallbacks {
|
|
|
23
23
|
interface ProviderRequest {
|
|
24
24
|
readonly question: string;
|
|
25
25
|
readonly systemPrompt?: string;
|
|
26
|
-
readonly guidelines?: string;
|
|
27
|
-
readonly guideline_patterns?: readonly string[];
|
|
28
26
|
readonly chatPrompt?: ChatPrompt;
|
|
29
27
|
readonly inputFiles?: readonly string[];
|
|
30
28
|
readonly evalCaseId?: string;
|
|
@@ -1016,15 +1014,13 @@ type EvaluatorConfig = CodeEvaluatorConfig | LlmGraderEvaluatorConfig | Composit
|
|
|
1016
1014
|
*/
|
|
1017
1015
|
interface EvalTest {
|
|
1018
1016
|
readonly id: string;
|
|
1019
|
-
readonly
|
|
1017
|
+
readonly eval_set?: string;
|
|
1020
1018
|
readonly conversation_id?: string;
|
|
1021
1019
|
readonly question: string;
|
|
1022
1020
|
readonly input: readonly TestMessage[];
|
|
1023
1021
|
readonly input_segments: readonly JsonObject[];
|
|
1024
1022
|
readonly expected_output: readonly JsonObject[];
|
|
1025
1023
|
readonly reference_answer?: string;
|
|
1026
|
-
readonly guideline_paths: readonly string[];
|
|
1027
|
-
readonly guideline_patterns?: readonly string[];
|
|
1028
1024
|
readonly file_paths: readonly string[];
|
|
1029
1025
|
readonly criteria: string;
|
|
1030
1026
|
readonly evaluator?: EvaluatorKind;
|
|
@@ -1128,11 +1124,10 @@ type FailOnError = boolean;
|
|
|
1128
1124
|
interface EvaluationResult {
|
|
1129
1125
|
readonly timestamp: string;
|
|
1130
1126
|
readonly testId: string;
|
|
1131
|
-
readonly
|
|
1127
|
+
readonly eval_set?: string;
|
|
1132
1128
|
readonly conversationId?: string;
|
|
1133
1129
|
readonly score: number;
|
|
1134
1130
|
readonly assertions: readonly AssertionEntry[];
|
|
1135
|
-
readonly outputText: string;
|
|
1136
1131
|
readonly target: string;
|
|
1137
1132
|
/** Token usage metrics from provider (optional) */
|
|
1138
1133
|
readonly tokenUsage?: TokenUsage;
|
|
@@ -1155,10 +1150,10 @@ interface EvaluationResult {
|
|
|
1155
1150
|
readonly trace?: TraceSummary;
|
|
1156
1151
|
/** Path to the temporary workspace directory (included on failure for debugging) */
|
|
1157
1152
|
readonly workspacePath?: string;
|
|
1158
|
-
/** Input messages
|
|
1159
|
-
readonly input?: readonly Message[]
|
|
1160
|
-
/**
|
|
1161
|
-
readonly output
|
|
1153
|
+
/** Input messages sent to the agent. Always Message[] for consistent shape with output. */
|
|
1154
|
+
readonly input?: readonly Message[];
|
|
1155
|
+
/** Output messages from agent execution. Always present — at minimum contains the final assistant message. */
|
|
1156
|
+
readonly output: readonly Message[];
|
|
1162
1157
|
/** Captured output from workspace before_all script */
|
|
1163
1158
|
readonly beforeAllOutput?: string;
|
|
1164
1159
|
/** Captured output from workspace before_each script */
|
|
@@ -1200,7 +1195,7 @@ interface EvaluatorResult {
|
|
|
1200
1195
|
readonly verdict?: EvaluationVerdict;
|
|
1201
1196
|
readonly assertions: readonly AssertionEntry[];
|
|
1202
1197
|
readonly rawRequest?: JsonObject;
|
|
1203
|
-
readonly
|
|
1198
|
+
readonly input?: JsonObject;
|
|
1204
1199
|
readonly scores?: readonly EvaluatorResult[];
|
|
1205
1200
|
/** Optional structured details from code graders (e.g., TP/TN/FP/FN counts). */
|
|
1206
1201
|
readonly details?: JsonObject;
|
|
@@ -1262,7 +1257,6 @@ type ExecutionDefaults = {
|
|
|
1262
1257
|
};
|
|
1263
1258
|
type AgentVConfig$1 = {
|
|
1264
1259
|
readonly required_version?: string;
|
|
1265
|
-
readonly guideline_patterns?: readonly string[];
|
|
1266
1260
|
readonly eval_patterns?: readonly string[];
|
|
1267
1261
|
readonly execution?: ExecutionDefaults;
|
|
1268
1262
|
};
|
|
@@ -1271,10 +1265,6 @@ type AgentVConfig$1 = {
|
|
|
1271
1265
|
* Searches from eval file directory up to repo root.
|
|
1272
1266
|
*/
|
|
1273
1267
|
declare function loadConfig(evalFilePath: string, repoRoot: string): Promise<AgentVConfig$1 | null>;
|
|
1274
|
-
/**
|
|
1275
|
-
* Determine whether a path references guideline content (instructions or prompts).
|
|
1276
|
-
*/
|
|
1277
|
-
declare function isGuidelineFile(filePath: string, patterns?: readonly string[]): boolean;
|
|
1278
1268
|
/**
|
|
1279
1269
|
* Extract target name from parsed eval suite (checks execution.target then falls back to root-level target).
|
|
1280
1270
|
*/
|
|
@@ -1321,16 +1311,15 @@ declare function extractFailOnError(suite: JsonObject): FailOnError | undefined;
|
|
|
1321
1311
|
type FormattingMode = 'agent' | 'lm';
|
|
1322
1312
|
|
|
1323
1313
|
/**
|
|
1324
|
-
* Build prompt inputs by consolidating user request context
|
|
1314
|
+
* Build prompt inputs by consolidating user request context.
|
|
1325
1315
|
*/
|
|
1326
1316
|
interface PromptInputs {
|
|
1327
1317
|
readonly question: string;
|
|
1328
|
-
readonly guidelines: string;
|
|
1329
1318
|
readonly chatPrompt?: ChatPrompt;
|
|
1330
1319
|
readonly systemMessage?: string;
|
|
1331
1320
|
}
|
|
1332
1321
|
/**
|
|
1333
|
-
* Build prompt inputs by consolidating user request context
|
|
1322
|
+
* Build prompt inputs by consolidating user request context.
|
|
1334
1323
|
*
|
|
1335
1324
|
* @param testCase - The evaluation test case
|
|
1336
1325
|
* @param mode - Formatting mode: 'agent' for file references, 'lm' for embedded content (default: 'lm')
|
|
@@ -2019,7 +2008,6 @@ interface EvaluationContext {
|
|
|
2019
2008
|
readonly attempt: number;
|
|
2020
2009
|
readonly promptInputs: {
|
|
2021
2010
|
readonly question: string;
|
|
2022
|
-
readonly guidelines: string;
|
|
2023
2011
|
readonly systemMessage?: string;
|
|
2024
2012
|
readonly chatPrompt?: ChatPrompt;
|
|
2025
2013
|
};
|
|
@@ -3500,7 +3488,7 @@ declare class OtelStreamingObserver {
|
|
|
3500
3488
|
private rootCtx;
|
|
3501
3489
|
constructor(tracer: Tracer, api: OtelApi, captureContent: boolean, parentCtx?: any | undefined);
|
|
3502
3490
|
/** Create root eval span immediately (visible in backend right away) */
|
|
3503
|
-
startEvalCase(testId: string, target: string,
|
|
3491
|
+
startEvalCase(testId: string, target: string, evalSet?: string): void;
|
|
3504
3492
|
/** Create and immediately export a tool span */
|
|
3505
3493
|
onToolCall(name: string, input: unknown, output: unknown, _durationMs: number, toolCallId?: string): void;
|
|
3506
3494
|
/** Create and immediately export an LLM span */
|
|
@@ -3613,4 +3601,4 @@ type AgentKernel = {
|
|
|
3613
3601
|
};
|
|
3614
3602
|
declare function createAgentKernel(): AgentKernel;
|
|
3615
3603
|
|
|
3616
|
-
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getSubagentsRoot, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isEvaluatorKind,
|
|
3604
|
+
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getSubagentsRoot, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseAgentSkillsEvals, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
|
package/dist/index.d.ts
CHANGED
|
@@ -23,8 +23,6 @@ interface ProviderStreamCallbacks {
|
|
|
23
23
|
interface ProviderRequest {
|
|
24
24
|
readonly question: string;
|
|
25
25
|
readonly systemPrompt?: string;
|
|
26
|
-
readonly guidelines?: string;
|
|
27
|
-
readonly guideline_patterns?: readonly string[];
|
|
28
26
|
readonly chatPrompt?: ChatPrompt;
|
|
29
27
|
readonly inputFiles?: readonly string[];
|
|
30
28
|
readonly evalCaseId?: string;
|
|
@@ -1016,15 +1014,13 @@ type EvaluatorConfig = CodeEvaluatorConfig | LlmGraderEvaluatorConfig | Composit
|
|
|
1016
1014
|
*/
|
|
1017
1015
|
interface EvalTest {
|
|
1018
1016
|
readonly id: string;
|
|
1019
|
-
readonly
|
|
1017
|
+
readonly eval_set?: string;
|
|
1020
1018
|
readonly conversation_id?: string;
|
|
1021
1019
|
readonly question: string;
|
|
1022
1020
|
readonly input: readonly TestMessage[];
|
|
1023
1021
|
readonly input_segments: readonly JsonObject[];
|
|
1024
1022
|
readonly expected_output: readonly JsonObject[];
|
|
1025
1023
|
readonly reference_answer?: string;
|
|
1026
|
-
readonly guideline_paths: readonly string[];
|
|
1027
|
-
readonly guideline_patterns?: readonly string[];
|
|
1028
1024
|
readonly file_paths: readonly string[];
|
|
1029
1025
|
readonly criteria: string;
|
|
1030
1026
|
readonly evaluator?: EvaluatorKind;
|
|
@@ -1128,11 +1124,10 @@ type FailOnError = boolean;
|
|
|
1128
1124
|
interface EvaluationResult {
|
|
1129
1125
|
readonly timestamp: string;
|
|
1130
1126
|
readonly testId: string;
|
|
1131
|
-
readonly
|
|
1127
|
+
readonly eval_set?: string;
|
|
1132
1128
|
readonly conversationId?: string;
|
|
1133
1129
|
readonly score: number;
|
|
1134
1130
|
readonly assertions: readonly AssertionEntry[];
|
|
1135
|
-
readonly outputText: string;
|
|
1136
1131
|
readonly target: string;
|
|
1137
1132
|
/** Token usage metrics from provider (optional) */
|
|
1138
1133
|
readonly tokenUsage?: TokenUsage;
|
|
@@ -1155,10 +1150,10 @@ interface EvaluationResult {
|
|
|
1155
1150
|
readonly trace?: TraceSummary;
|
|
1156
1151
|
/** Path to the temporary workspace directory (included on failure for debugging) */
|
|
1157
1152
|
readonly workspacePath?: string;
|
|
1158
|
-
/** Input messages
|
|
1159
|
-
readonly input?: readonly Message[]
|
|
1160
|
-
/**
|
|
1161
|
-
readonly output
|
|
1153
|
+
/** Input messages sent to the agent. Always Message[] for consistent shape with output. */
|
|
1154
|
+
readonly input?: readonly Message[];
|
|
1155
|
+
/** Output messages from agent execution. Always present — at minimum contains the final assistant message. */
|
|
1156
|
+
readonly output: readonly Message[];
|
|
1162
1157
|
/** Captured output from workspace before_all script */
|
|
1163
1158
|
readonly beforeAllOutput?: string;
|
|
1164
1159
|
/** Captured output from workspace before_each script */
|
|
@@ -1200,7 +1195,7 @@ interface EvaluatorResult {
|
|
|
1200
1195
|
readonly verdict?: EvaluationVerdict;
|
|
1201
1196
|
readonly assertions: readonly AssertionEntry[];
|
|
1202
1197
|
readonly rawRequest?: JsonObject;
|
|
1203
|
-
readonly
|
|
1198
|
+
readonly input?: JsonObject;
|
|
1204
1199
|
readonly scores?: readonly EvaluatorResult[];
|
|
1205
1200
|
/** Optional structured details from code graders (e.g., TP/TN/FP/FN counts). */
|
|
1206
1201
|
readonly details?: JsonObject;
|
|
@@ -1262,7 +1257,6 @@ type ExecutionDefaults = {
|
|
|
1262
1257
|
};
|
|
1263
1258
|
type AgentVConfig$1 = {
|
|
1264
1259
|
readonly required_version?: string;
|
|
1265
|
-
readonly guideline_patterns?: readonly string[];
|
|
1266
1260
|
readonly eval_patterns?: readonly string[];
|
|
1267
1261
|
readonly execution?: ExecutionDefaults;
|
|
1268
1262
|
};
|
|
@@ -1271,10 +1265,6 @@ type AgentVConfig$1 = {
|
|
|
1271
1265
|
* Searches from eval file directory up to repo root.
|
|
1272
1266
|
*/
|
|
1273
1267
|
declare function loadConfig(evalFilePath: string, repoRoot: string): Promise<AgentVConfig$1 | null>;
|
|
1274
|
-
/**
|
|
1275
|
-
* Determine whether a path references guideline content (instructions or prompts).
|
|
1276
|
-
*/
|
|
1277
|
-
declare function isGuidelineFile(filePath: string, patterns?: readonly string[]): boolean;
|
|
1278
1268
|
/**
|
|
1279
1269
|
* Extract target name from parsed eval suite (checks execution.target then falls back to root-level target).
|
|
1280
1270
|
*/
|
|
@@ -1321,16 +1311,15 @@ declare function extractFailOnError(suite: JsonObject): FailOnError | undefined;
|
|
|
1321
1311
|
type FormattingMode = 'agent' | 'lm';
|
|
1322
1312
|
|
|
1323
1313
|
/**
|
|
1324
|
-
* Build prompt inputs by consolidating user request context
|
|
1314
|
+
* Build prompt inputs by consolidating user request context.
|
|
1325
1315
|
*/
|
|
1326
1316
|
interface PromptInputs {
|
|
1327
1317
|
readonly question: string;
|
|
1328
|
-
readonly guidelines: string;
|
|
1329
1318
|
readonly chatPrompt?: ChatPrompt;
|
|
1330
1319
|
readonly systemMessage?: string;
|
|
1331
1320
|
}
|
|
1332
1321
|
/**
|
|
1333
|
-
* Build prompt inputs by consolidating user request context
|
|
1322
|
+
* Build prompt inputs by consolidating user request context.
|
|
1334
1323
|
*
|
|
1335
1324
|
* @param testCase - The evaluation test case
|
|
1336
1325
|
* @param mode - Formatting mode: 'agent' for file references, 'lm' for embedded content (default: 'lm')
|
|
@@ -2019,7 +2008,6 @@ interface EvaluationContext {
|
|
|
2019
2008
|
readonly attempt: number;
|
|
2020
2009
|
readonly promptInputs: {
|
|
2021
2010
|
readonly question: string;
|
|
2022
|
-
readonly guidelines: string;
|
|
2023
2011
|
readonly systemMessage?: string;
|
|
2024
2012
|
readonly chatPrompt?: ChatPrompt;
|
|
2025
2013
|
};
|
|
@@ -3500,7 +3488,7 @@ declare class OtelStreamingObserver {
|
|
|
3500
3488
|
private rootCtx;
|
|
3501
3489
|
constructor(tracer: Tracer, api: OtelApi, captureContent: boolean, parentCtx?: any | undefined);
|
|
3502
3490
|
/** Create root eval span immediately (visible in backend right away) */
|
|
3503
|
-
startEvalCase(testId: string, target: string,
|
|
3491
|
+
startEvalCase(testId: string, target: string, evalSet?: string): void;
|
|
3504
3492
|
/** Create and immediately export a tool span */
|
|
3505
3493
|
onToolCall(name: string, input: unknown, output: unknown, _durationMs: number, toolCallId?: string): void;
|
|
3506
3494
|
/** Create and immediately export an LLM span */
|
|
@@ -3613,4 +3601,4 @@ type AgentKernel = {
|
|
|
3613
3601
|
};
|
|
3614
3602
|
declare function createAgentKernel(): AgentKernel;
|
|
3615
3603
|
|
|
3616
|
-
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getSubagentsRoot, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isEvaluatorKind,
|
|
3604
|
+
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getSubagentsRoot, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseAgentSkillsEvals, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
|