@agentv/core 3.7.0 → 3.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -23,8 +23,6 @@ interface ProviderStreamCallbacks {
23
23
  interface ProviderRequest {
24
24
  readonly question: string;
25
25
  readonly systemPrompt?: string;
26
- readonly guidelines?: string;
27
- readonly guideline_patterns?: readonly string[];
28
26
  readonly chatPrompt?: ChatPrompt;
29
27
  readonly inputFiles?: readonly string[];
30
28
  readonly evalCaseId?: string;
@@ -1016,15 +1014,13 @@ type EvaluatorConfig = CodeEvaluatorConfig | LlmGraderEvaluatorConfig | Composit
1016
1014
  */
1017
1015
  interface EvalTest {
1018
1016
  readonly id: string;
1019
- readonly dataset?: string;
1017
+ readonly eval_set?: string;
1020
1018
  readonly conversation_id?: string;
1021
1019
  readonly question: string;
1022
1020
  readonly input: readonly TestMessage[];
1023
1021
  readonly input_segments: readonly JsonObject[];
1024
1022
  readonly expected_output: readonly JsonObject[];
1025
1023
  readonly reference_answer?: string;
1026
- readonly guideline_paths: readonly string[];
1027
- readonly guideline_patterns?: readonly string[];
1028
1024
  readonly file_paths: readonly string[];
1029
1025
  readonly criteria: string;
1030
1026
  readonly evaluator?: EvaluatorKind;
@@ -1128,11 +1124,10 @@ type FailOnError = boolean;
1128
1124
  interface EvaluationResult {
1129
1125
  readonly timestamp: string;
1130
1126
  readonly testId: string;
1131
- readonly dataset?: string;
1127
+ readonly eval_set?: string;
1132
1128
  readonly conversationId?: string;
1133
1129
  readonly score: number;
1134
1130
  readonly assertions: readonly AssertionEntry[];
1135
- readonly outputText: string;
1136
1131
  readonly target: string;
1137
1132
  /** Token usage metrics from provider (optional) */
1138
1133
  readonly tokenUsage?: TokenUsage;
@@ -1155,10 +1150,10 @@ interface EvaluationResult {
1155
1150
  readonly trace?: TraceSummary;
1156
1151
  /** Path to the temporary workspace directory (included on failure for debugging) */
1157
1152
  readonly workspacePath?: string;
1158
- /** Input messages or prompt string sent to the agent */
1159
- readonly input?: readonly Message[] | string;
1160
- /** Full output messages from agent execution (only included when --trace flag is set) */
1161
- readonly output?: readonly Message[];
1153
+ /** Input messages sent to the agent. Always Message[] for consistent shape with output. */
1154
+ readonly input?: readonly Message[];
1155
+ /** Output messages from agent execution. Always present at minimum contains the final assistant message. */
1156
+ readonly output: readonly Message[];
1162
1157
  /** Captured output from workspace before_all script */
1163
1158
  readonly beforeAllOutput?: string;
1164
1159
  /** Captured output from workspace before_each script */
@@ -1200,7 +1195,7 @@ interface EvaluatorResult {
1200
1195
  readonly verdict?: EvaluationVerdict;
1201
1196
  readonly assertions: readonly AssertionEntry[];
1202
1197
  readonly rawRequest?: JsonObject;
1203
- readonly evaluatorProviderRequest?: JsonObject;
1198
+ readonly input?: JsonObject;
1204
1199
  readonly scores?: readonly EvaluatorResult[];
1205
1200
  /** Optional structured details from code graders (e.g., TP/TN/FP/FN counts). */
1206
1201
  readonly details?: JsonObject;
@@ -1262,7 +1257,6 @@ type ExecutionDefaults = {
1262
1257
  };
1263
1258
  type AgentVConfig$1 = {
1264
1259
  readonly required_version?: string;
1265
- readonly guideline_patterns?: readonly string[];
1266
1260
  readonly eval_patterns?: readonly string[];
1267
1261
  readonly execution?: ExecutionDefaults;
1268
1262
  };
@@ -1271,10 +1265,6 @@ type AgentVConfig$1 = {
1271
1265
  * Searches from eval file directory up to repo root.
1272
1266
  */
1273
1267
  declare function loadConfig(evalFilePath: string, repoRoot: string): Promise<AgentVConfig$1 | null>;
1274
- /**
1275
- * Determine whether a path references guideline content (instructions or prompts).
1276
- */
1277
- declare function isGuidelineFile(filePath: string, patterns?: readonly string[]): boolean;
1278
1268
  /**
1279
1269
  * Extract target name from parsed eval suite (checks execution.target then falls back to root-level target).
1280
1270
  */
@@ -1321,16 +1311,15 @@ declare function extractFailOnError(suite: JsonObject): FailOnError | undefined;
1321
1311
  type FormattingMode = 'agent' | 'lm';
1322
1312
 
1323
1313
  /**
1324
- * Build prompt inputs by consolidating user request context and guideline content.
1314
+ * Build prompt inputs by consolidating user request context.
1325
1315
  */
1326
1316
  interface PromptInputs {
1327
1317
  readonly question: string;
1328
- readonly guidelines: string;
1329
1318
  readonly chatPrompt?: ChatPrompt;
1330
1319
  readonly systemMessage?: string;
1331
1320
  }
1332
1321
  /**
1333
- * Build prompt inputs by consolidating user request context and guideline content.
1322
+ * Build prompt inputs by consolidating user request context.
1334
1323
  *
1335
1324
  * @param testCase - The evaluation test case
1336
1325
  * @param mode - Formatting mode: 'agent' for file references, 'lm' for embedded content (default: 'lm')
@@ -2019,7 +2008,6 @@ interface EvaluationContext {
2019
2008
  readonly attempt: number;
2020
2009
  readonly promptInputs: {
2021
2010
  readonly question: string;
2022
- readonly guidelines: string;
2023
2011
  readonly systemMessage?: string;
2024
2012
  readonly chatPrompt?: ChatPrompt;
2025
2013
  };
@@ -3500,7 +3488,7 @@ declare class OtelStreamingObserver {
3500
3488
  private rootCtx;
3501
3489
  constructor(tracer: Tracer, api: OtelApi, captureContent: boolean, parentCtx?: any | undefined);
3502
3490
  /** Create root eval span immediately (visible in backend right away) */
3503
- startEvalCase(testId: string, target: string, dataset?: string): void;
3491
+ startEvalCase(testId: string, target: string, evalSet?: string): void;
3504
3492
  /** Create and immediately export a tool span */
3505
3493
  onToolCall(name: string, input: unknown, output: unknown, _durationMs: number, toolCallId?: string): void;
3506
3494
  /** Create and immediately export an LLM span */
@@ -3613,4 +3601,4 @@ type AgentKernel = {
3613
3601
  };
3614
3602
  declare function createAgentKernel(): AgentKernel;
3615
3603
 
3616
- export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getSubagentsRoot, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseAgentSkillsEvals, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
3604
+ export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getSubagentsRoot, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseAgentSkillsEvals, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
package/dist/index.d.ts CHANGED
@@ -23,8 +23,6 @@ interface ProviderStreamCallbacks {
23
23
  interface ProviderRequest {
24
24
  readonly question: string;
25
25
  readonly systemPrompt?: string;
26
- readonly guidelines?: string;
27
- readonly guideline_patterns?: readonly string[];
28
26
  readonly chatPrompt?: ChatPrompt;
29
27
  readonly inputFiles?: readonly string[];
30
28
  readonly evalCaseId?: string;
@@ -1016,15 +1014,13 @@ type EvaluatorConfig = CodeEvaluatorConfig | LlmGraderEvaluatorConfig | Composit
1016
1014
  */
1017
1015
  interface EvalTest {
1018
1016
  readonly id: string;
1019
- readonly dataset?: string;
1017
+ readonly eval_set?: string;
1020
1018
  readonly conversation_id?: string;
1021
1019
  readonly question: string;
1022
1020
  readonly input: readonly TestMessage[];
1023
1021
  readonly input_segments: readonly JsonObject[];
1024
1022
  readonly expected_output: readonly JsonObject[];
1025
1023
  readonly reference_answer?: string;
1026
- readonly guideline_paths: readonly string[];
1027
- readonly guideline_patterns?: readonly string[];
1028
1024
  readonly file_paths: readonly string[];
1029
1025
  readonly criteria: string;
1030
1026
  readonly evaluator?: EvaluatorKind;
@@ -1128,11 +1124,10 @@ type FailOnError = boolean;
1128
1124
  interface EvaluationResult {
1129
1125
  readonly timestamp: string;
1130
1126
  readonly testId: string;
1131
- readonly dataset?: string;
1127
+ readonly eval_set?: string;
1132
1128
  readonly conversationId?: string;
1133
1129
  readonly score: number;
1134
1130
  readonly assertions: readonly AssertionEntry[];
1135
- readonly outputText: string;
1136
1131
  readonly target: string;
1137
1132
  /** Token usage metrics from provider (optional) */
1138
1133
  readonly tokenUsage?: TokenUsage;
@@ -1155,10 +1150,10 @@ interface EvaluationResult {
1155
1150
  readonly trace?: TraceSummary;
1156
1151
  /** Path to the temporary workspace directory (included on failure for debugging) */
1157
1152
  readonly workspacePath?: string;
1158
- /** Input messages or prompt string sent to the agent */
1159
- readonly input?: readonly Message[] | string;
1160
- /** Full output messages from agent execution (only included when --trace flag is set) */
1161
- readonly output?: readonly Message[];
1153
+ /** Input messages sent to the agent. Always Message[] for consistent shape with output. */
1154
+ readonly input?: readonly Message[];
1155
+ /** Output messages from agent execution. Always present at minimum contains the final assistant message. */
1156
+ readonly output: readonly Message[];
1162
1157
  /** Captured output from workspace before_all script */
1163
1158
  readonly beforeAllOutput?: string;
1164
1159
  /** Captured output from workspace before_each script */
@@ -1200,7 +1195,7 @@ interface EvaluatorResult {
1200
1195
  readonly verdict?: EvaluationVerdict;
1201
1196
  readonly assertions: readonly AssertionEntry[];
1202
1197
  readonly rawRequest?: JsonObject;
1203
- readonly evaluatorProviderRequest?: JsonObject;
1198
+ readonly input?: JsonObject;
1204
1199
  readonly scores?: readonly EvaluatorResult[];
1205
1200
  /** Optional structured details from code graders (e.g., TP/TN/FP/FN counts). */
1206
1201
  readonly details?: JsonObject;
@@ -1262,7 +1257,6 @@ type ExecutionDefaults = {
1262
1257
  };
1263
1258
  type AgentVConfig$1 = {
1264
1259
  readonly required_version?: string;
1265
- readonly guideline_patterns?: readonly string[];
1266
1260
  readonly eval_patterns?: readonly string[];
1267
1261
  readonly execution?: ExecutionDefaults;
1268
1262
  };
@@ -1271,10 +1265,6 @@ type AgentVConfig$1 = {
1271
1265
  * Searches from eval file directory up to repo root.
1272
1266
  */
1273
1267
  declare function loadConfig(evalFilePath: string, repoRoot: string): Promise<AgentVConfig$1 | null>;
1274
- /**
1275
- * Determine whether a path references guideline content (instructions or prompts).
1276
- */
1277
- declare function isGuidelineFile(filePath: string, patterns?: readonly string[]): boolean;
1278
1268
  /**
1279
1269
  * Extract target name from parsed eval suite (checks execution.target then falls back to root-level target).
1280
1270
  */
@@ -1321,16 +1311,15 @@ declare function extractFailOnError(suite: JsonObject): FailOnError | undefined;
1321
1311
  type FormattingMode = 'agent' | 'lm';
1322
1312
 
1323
1313
  /**
1324
- * Build prompt inputs by consolidating user request context and guideline content.
1314
+ * Build prompt inputs by consolidating user request context.
1325
1315
  */
1326
1316
  interface PromptInputs {
1327
1317
  readonly question: string;
1328
- readonly guidelines: string;
1329
1318
  readonly chatPrompt?: ChatPrompt;
1330
1319
  readonly systemMessage?: string;
1331
1320
  }
1332
1321
  /**
1333
- * Build prompt inputs by consolidating user request context and guideline content.
1322
+ * Build prompt inputs by consolidating user request context.
1334
1323
  *
1335
1324
  * @param testCase - The evaluation test case
1336
1325
  * @param mode - Formatting mode: 'agent' for file references, 'lm' for embedded content (default: 'lm')
@@ -2019,7 +2008,6 @@ interface EvaluationContext {
2019
2008
  readonly attempt: number;
2020
2009
  readonly promptInputs: {
2021
2010
  readonly question: string;
2022
- readonly guidelines: string;
2023
2011
  readonly systemMessage?: string;
2024
2012
  readonly chatPrompt?: ChatPrompt;
2025
2013
  };
@@ -3500,7 +3488,7 @@ declare class OtelStreamingObserver {
3500
3488
  private rootCtx;
3501
3489
  constructor(tracer: Tracer, api: OtelApi, captureContent: boolean, parentCtx?: any | undefined);
3502
3490
  /** Create root eval span immediately (visible in backend right away) */
3503
- startEvalCase(testId: string, target: string, dataset?: string): void;
3491
+ startEvalCase(testId: string, target: string, evalSet?: string): void;
3504
3492
  /** Create and immediately export a tool span */
3505
3493
  onToolCall(name: string, input: unknown, output: unknown, _durationMs: number, toolCallId?: string): void;
3506
3494
  /** Create and immediately export an LLM span */
@@ -3613,4 +3601,4 @@ type AgentKernel = {
3613
3601
  };
3614
3602
  declare function createAgentKernel(): AgentKernel;
3615
3603
 
3616
- export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getSubagentsRoot, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseAgentSkillsEvals, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
3604
+ export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getSubagentsRoot, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseAgentSkillsEvals, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };