@agentv/core 4.5.2 → 4.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-M65PVDQ5.js → chunk-AIQ5FO4G.js} +27 -5
- package/dist/chunk-AIQ5FO4G.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +15 -6
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +7 -4
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +108 -44
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +25 -3
- package/dist/index.d.ts +25 -3
- package/dist/index.js +83 -41
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-M65PVDQ5.js.map +0 -1
package/dist/index.d.cts
CHANGED
|
@@ -215,7 +215,8 @@ interface Provider {
|
|
|
215
215
|
type EnvLookup = Readonly<Record<string, string | undefined>>;
|
|
216
216
|
interface TargetDefinition {
|
|
217
217
|
readonly name: string;
|
|
218
|
-
readonly provider
|
|
218
|
+
readonly provider?: ProviderKind | string;
|
|
219
|
+
readonly use_target?: string | unknown | undefined;
|
|
219
220
|
readonly grader_target?: string | undefined;
|
|
220
221
|
/** @deprecated Use `grader_target` instead */
|
|
221
222
|
readonly judge_target?: string | undefined;
|
|
@@ -302,6 +303,8 @@ interface TargetDefinition {
|
|
|
302
303
|
readonly retryBackoffFactor?: number | unknown | undefined;
|
|
303
304
|
readonly retry_status_codes?: unknown | undefined;
|
|
304
305
|
readonly retryStatusCodes?: unknown | undefined;
|
|
306
|
+
readonly fallback_targets?: readonly string[] | unknown | undefined;
|
|
307
|
+
readonly fallbackTargets?: readonly string[] | unknown | undefined;
|
|
305
308
|
}
|
|
306
309
|
|
|
307
310
|
/**
|
|
@@ -1200,6 +1203,11 @@ interface EvaluationResult {
|
|
|
1200
1203
|
readonly score: number;
|
|
1201
1204
|
readonly assertions: readonly AssertionEntry[];
|
|
1202
1205
|
readonly target: string;
|
|
1206
|
+
/**
|
|
1207
|
+
* The target that actually served the response, when different from the
|
|
1208
|
+
* primary target. Present only when a fallback target was used.
|
|
1209
|
+
*/
|
|
1210
|
+
readonly targetUsed?: string;
|
|
1203
1211
|
/** Token usage metrics from provider (optional) */
|
|
1204
1212
|
readonly tokenUsage?: TokenUsage;
|
|
1205
1213
|
/** Total cost in USD (optional, from provider) */
|
|
@@ -1683,6 +1691,14 @@ interface RetryConfig {
|
|
|
1683
1691
|
readonly backoffFactor?: number;
|
|
1684
1692
|
readonly retryableStatusCodes?: readonly number[];
|
|
1685
1693
|
}
|
|
1694
|
+
/**
|
|
1695
|
+
* Selects which OpenAI-compatible API endpoint to use.
|
|
1696
|
+
* - "chat" (default): POST /chat/completions — universally supported by all OpenAI-compatible providers.
|
|
1697
|
+
* - "responses": POST /responses — only supported by api.openai.com.
|
|
1698
|
+
*
|
|
1699
|
+
* Maps to Vercel AI SDK methods: "chat" → provider.chat(model), "responses" → provider(model).
|
|
1700
|
+
*/
|
|
1701
|
+
type ApiFormat = 'chat' | 'responses';
|
|
1686
1702
|
/**
|
|
1687
1703
|
* Azure OpenAI settings used by the Vercel AI SDK.
|
|
1688
1704
|
*/
|
|
@@ -1702,6 +1718,7 @@ interface OpenAIResolvedConfig {
|
|
|
1702
1718
|
readonly baseURL: string;
|
|
1703
1719
|
readonly apiKey: string;
|
|
1704
1720
|
readonly model: string;
|
|
1721
|
+
readonly apiFormat?: ApiFormat;
|
|
1705
1722
|
readonly temperature?: number;
|
|
1706
1723
|
readonly maxOutputTokens?: number;
|
|
1707
1724
|
readonly retry?: RetryConfig;
|
|
@@ -1852,6 +1869,11 @@ interface ResolvedTargetBase {
|
|
|
1852
1869
|
* to force CLI invocation even in subagent mode.
|
|
1853
1870
|
*/
|
|
1854
1871
|
readonly subagentModeAllowed?: boolean;
|
|
1872
|
+
/**
|
|
1873
|
+
* Ordered list of target names to try when the primary target fails after
|
|
1874
|
+
* exhausting retries. Each fallback is attempted in order.
|
|
1875
|
+
*/
|
|
1876
|
+
readonly fallbackTargets?: readonly string[];
|
|
1855
1877
|
}
|
|
1856
1878
|
type ResolvedTarget = (ResolvedTargetBase & {
|
|
1857
1879
|
readonly kind: 'openai';
|
|
@@ -1913,7 +1935,7 @@ type ResolvedTarget = (ResolvedTargetBase & {
|
|
|
1913
1935
|
* Exported so the targets validator can reuse the same list — adding a field
|
|
1914
1936
|
* here automatically makes it valid in targets.yaml without a separate update.
|
|
1915
1937
|
*/
|
|
1916
|
-
declare const COMMON_TARGET_SETTINGS: readonly ["provider_batching", "providerBatching", "subagent_mode_allowed", "subagentModeAllowed"];
|
|
1938
|
+
declare const COMMON_TARGET_SETTINGS: readonly ["use_target", "provider_batching", "providerBatching", "subagent_mode_allowed", "subagentModeAllowed", "fallback_targets", "fallbackTargets"];
|
|
1917
1939
|
declare function resolveTargetDefinition(definition: TargetDefinition, env?: EnvLookup, evalFilePath?: string): ResolvedTarget;
|
|
1918
1940
|
|
|
1919
1941
|
/**
|
|
@@ -3887,4 +3909,4 @@ type AgentKernel = {
|
|
|
3887
3909
|
};
|
|
3888
3910
|
declare function createAgentKernel(): AgentKernel;
|
|
3889
3911
|
|
|
3890
|
-
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, COMMON_TARGET_SETTINGS, type CacheConfig, type ChildEvaluatorResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type Content, type ContentFile, type ContentImage, type ContentText, type CopilotCliResolvedConfig, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_CATEGORY, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type 
ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type ProjectEntry, type ProjectRegistry, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type 
SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addProject, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveCategory, deriveProjectId, detectFormat, discoverAssertions, discoverClaudeSessions, discoverCopilotSessions, discoverGraders, discoverGraders as discoverJudges, discoverProjects, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, 
extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getProject, getProjectsRegistryPath, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadProjectRegistry, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseAgentSkillsEvals, parseClaudeSession, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, removeProject, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveProjectRegistry, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, touchProject, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
|
|
3912
|
+
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, COMMON_TARGET_SETTINGS, type CacheConfig, type ChildEvaluatorResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type Content, type ContentFile, type ContentImage, type ContentText, type CopilotCliResolvedConfig, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_CATEGORY, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type 
ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type ProjectEntry, type ProjectRegistry, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type 
SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addProject, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveCategory, deriveProjectId, detectFormat, discoverAssertions, discoverClaudeSessions, discoverCopilotSessions, discoverGraders, discoverGraders as discoverJudges, discoverProjects, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, 
extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getProject, getProjectsRegistryPath, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadProjectRegistry, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseAgentSkillsEvals, parseClaudeSession, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, removeProject, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveProjectRegistry, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, touchProject, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
|
package/dist/index.d.ts
CHANGED
|
@@ -215,7 +215,8 @@ interface Provider {
|
|
|
215
215
|
type EnvLookup = Readonly<Record<string, string | undefined>>;
|
|
216
216
|
interface TargetDefinition {
|
|
217
217
|
readonly name: string;
|
|
218
|
-
readonly provider
|
|
218
|
+
readonly provider?: ProviderKind | string;
|
|
219
|
+
readonly use_target?: string | unknown | undefined;
|
|
219
220
|
readonly grader_target?: string | undefined;
|
|
220
221
|
/** @deprecated Use `grader_target` instead */
|
|
221
222
|
readonly judge_target?: string | undefined;
|
|
@@ -302,6 +303,8 @@ interface TargetDefinition {
|
|
|
302
303
|
readonly retryBackoffFactor?: number | unknown | undefined;
|
|
303
304
|
readonly retry_status_codes?: unknown | undefined;
|
|
304
305
|
readonly retryStatusCodes?: unknown | undefined;
|
|
306
|
+
readonly fallback_targets?: readonly string[] | unknown | undefined;
|
|
307
|
+
readonly fallbackTargets?: readonly string[] | unknown | undefined;
|
|
305
308
|
}
|
|
306
309
|
|
|
307
310
|
/**
|
|
@@ -1200,6 +1203,11 @@ interface EvaluationResult {
|
|
|
1200
1203
|
readonly score: number;
|
|
1201
1204
|
readonly assertions: readonly AssertionEntry[];
|
|
1202
1205
|
readonly target: string;
|
|
1206
|
+
/**
|
|
1207
|
+
* The target that actually served the response, when different from the
|
|
1208
|
+
* primary target. Present only when a fallback target was used.
|
|
1209
|
+
*/
|
|
1210
|
+
readonly targetUsed?: string;
|
|
1203
1211
|
/** Token usage metrics from provider (optional) */
|
|
1204
1212
|
readonly tokenUsage?: TokenUsage;
|
|
1205
1213
|
/** Total cost in USD (optional, from provider) */
|
|
@@ -1683,6 +1691,14 @@ interface RetryConfig {
|
|
|
1683
1691
|
readonly backoffFactor?: number;
|
|
1684
1692
|
readonly retryableStatusCodes?: readonly number[];
|
|
1685
1693
|
}
|
|
1694
|
+
/**
|
|
1695
|
+
* Selects which OpenAI-compatible API endpoint to use.
|
|
1696
|
+
* - "chat" (default): POST /chat/completions — universally supported by all OpenAI-compatible providers.
|
|
1697
|
+
* - "responses": POST /responses — only supported by api.openai.com.
|
|
1698
|
+
*
|
|
1699
|
+
* Maps to Vercel AI SDK methods: "chat" → provider.chat(model), "responses" → provider(model).
|
|
1700
|
+
*/
|
|
1701
|
+
type ApiFormat = 'chat' | 'responses';
|
|
1686
1702
|
/**
|
|
1687
1703
|
* Azure OpenAI settings used by the Vercel AI SDK.
|
|
1688
1704
|
*/
|
|
@@ -1702,6 +1718,7 @@ interface OpenAIResolvedConfig {
|
|
|
1702
1718
|
readonly baseURL: string;
|
|
1703
1719
|
readonly apiKey: string;
|
|
1704
1720
|
readonly model: string;
|
|
1721
|
+
readonly apiFormat?: ApiFormat;
|
|
1705
1722
|
readonly temperature?: number;
|
|
1706
1723
|
readonly maxOutputTokens?: number;
|
|
1707
1724
|
readonly retry?: RetryConfig;
|
|
@@ -1852,6 +1869,11 @@ interface ResolvedTargetBase {
|
|
|
1852
1869
|
* to force CLI invocation even in subagent mode.
|
|
1853
1870
|
*/
|
|
1854
1871
|
readonly subagentModeAllowed?: boolean;
|
|
1872
|
+
/**
|
|
1873
|
+
* Ordered list of target names to try when the primary target fails after
|
|
1874
|
+
* exhausting retries. Each fallback is attempted in order.
|
|
1875
|
+
*/
|
|
1876
|
+
readonly fallbackTargets?: readonly string[];
|
|
1855
1877
|
}
|
|
1856
1878
|
type ResolvedTarget = (ResolvedTargetBase & {
|
|
1857
1879
|
readonly kind: 'openai';
|
|
@@ -1913,7 +1935,7 @@ type ResolvedTarget = (ResolvedTargetBase & {
|
|
|
1913
1935
|
* Exported so the targets validator can reuse the same list — adding a field
|
|
1914
1936
|
* here automatically makes it valid in targets.yaml without a separate update.
|
|
1915
1937
|
*/
|
|
1916
|
-
declare const COMMON_TARGET_SETTINGS: readonly ["provider_batching", "providerBatching", "subagent_mode_allowed", "subagentModeAllowed"];
|
|
1938
|
+
declare const COMMON_TARGET_SETTINGS: readonly ["use_target", "provider_batching", "providerBatching", "subagent_mode_allowed", "subagentModeAllowed", "fallback_targets", "fallbackTargets"];
|
|
1917
1939
|
declare function resolveTargetDefinition(definition: TargetDefinition, env?: EnvLookup, evalFilePath?: string): ResolvedTarget;
|
|
1918
1940
|
|
|
1919
1941
|
/**
|
|
@@ -3887,4 +3909,4 @@ type AgentKernel = {
|
|
|
3887
3909
|
};
|
|
3888
3910
|
declare function createAgentKernel(): AgentKernel;
|
|
3889
3911
|
|
|
3890
|
-
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, COMMON_TARGET_SETTINGS, type CacheConfig, type ChildEvaluatorResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type Content, type ContentFile, type ContentImage, type ContentText, type CopilotCliResolvedConfig, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_CATEGORY, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type 
ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type ProjectEntry, type ProjectRegistry, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type 
SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addProject, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveCategory, deriveProjectId, detectFormat, discoverAssertions, discoverClaudeSessions, discoverCopilotSessions, discoverGraders, discoverGraders as discoverJudges, discoverProjects, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, 
extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getProject, getProjectsRegistryPath, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadProjectRegistry, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseAgentSkillsEvals, parseClaudeSession, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, removeProject, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveProjectRegistry, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, touchProject, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
|
|
3912
|
+
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, COMMON_TARGET_SETTINGS, type CacheConfig, type ChildEvaluatorResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type Content, type ContentFile, type ContentImage, type ContentText, type CopilotCliResolvedConfig, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_CATEGORY, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type 
ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type ProjectEntry, type ProjectRegistry, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type 
SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addProject, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveCategory, deriveProjectId, detectFormat, discoverAssertions, discoverClaudeSessions, discoverCopilotSessions, discoverGraders, discoverGraders as discoverJudges, discoverProjects, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, 
extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getProject, getProjectsRegistryPath, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadProjectRegistry, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseAgentSkillsEvals, parseClaudeSession, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, removeProject, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveProjectRegistry, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, touchProject, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
|
package/dist/index.js
CHANGED
|
@@ -23,7 +23,7 @@ import {
|
|
|
23
23
|
readTextFile,
|
|
24
24
|
resolveFileReference,
|
|
25
25
|
resolveTargetDefinition
|
|
26
|
-
} from "./chunk-M65PVDQ5.js";
|
|
26
|
+
} from "./chunk-AIQ5FO4G.js";
|
|
27
27
|
import {
|
|
28
28
|
AgentvProvider
|
|
29
29
|
} from "./chunk-PRNXHNLF.js";
|
|
@@ -766,6 +766,7 @@ function validateTemplateVariables(content, source) {
|
|
|
766
766
|
// src/evaluation/loaders/evaluator-parser.ts
|
|
767
767
|
var ANSI_YELLOW3 = "\x1B[33m";
|
|
768
768
|
var ANSI_RESET4 = "\x1B[0m";
|
|
769
|
+
var PROMPT_FILE_PREFIX = "file://";
|
|
769
770
|
function normalizeEvaluatorType(type) {
|
|
770
771
|
return type.replace(/_/g, "-");
|
|
771
772
|
}
|
|
@@ -1064,12 +1065,23 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1064
1065
|
threshold: thresholdValue
|
|
1065
1066
|
};
|
|
1066
1067
|
} else {
|
|
1067
|
-
const
|
|
1068
|
+
const rawAggPrompt = asString(rawAggregator.prompt);
|
|
1069
|
+
let aggregatorPrompt;
|
|
1068
1070
|
let promptPath2;
|
|
1069
|
-
if (
|
|
1070
|
-
|
|
1071
|
-
|
|
1072
|
-
|
|
1071
|
+
if (rawAggPrompt) {
|
|
1072
|
+
if (rawAggPrompt.startsWith(PROMPT_FILE_PREFIX)) {
|
|
1073
|
+
const fileRef = rawAggPrompt.slice(PROMPT_FILE_PREFIX.length);
|
|
1074
|
+
aggregatorPrompt = fileRef;
|
|
1075
|
+
const resolved = await resolveFileReference2(fileRef, searchRoots);
|
|
1076
|
+
if (resolved.resolvedPath) {
|
|
1077
|
+
promptPath2 = path4.resolve(resolved.resolvedPath);
|
|
1078
|
+
} else {
|
|
1079
|
+
throw new Error(
|
|
1080
|
+
`Composite aggregator in '${evalId}': prompt file not found: ${resolved.displayPath}`
|
|
1081
|
+
);
|
|
1082
|
+
}
|
|
1083
|
+
} else {
|
|
1084
|
+
aggregatorPrompt = rawAggPrompt;
|
|
1073
1085
|
}
|
|
1074
1086
|
}
|
|
1075
1087
|
aggregator = {
|
|
@@ -1649,21 +1661,25 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1649
1661
|
promptScriptConfig = rawPrompt.config;
|
|
1650
1662
|
}
|
|
1651
1663
|
} else if (typeof rawPrompt === "string") {
|
|
1652
|
-
|
|
1653
|
-
|
|
1654
|
-
|
|
1655
|
-
|
|
1656
|
-
|
|
1657
|
-
|
|
1658
|
-
|
|
1659
|
-
|
|
1660
|
-
|
|
1664
|
+
if (rawPrompt.startsWith(PROMPT_FILE_PREFIX)) {
|
|
1665
|
+
const fileRef = rawPrompt.slice(PROMPT_FILE_PREFIX.length);
|
|
1666
|
+
prompt = fileRef;
|
|
1667
|
+
const resolved = await resolveFileReference2(fileRef, searchRoots);
|
|
1668
|
+
if (resolved.resolvedPath) {
|
|
1669
|
+
promptPath = path4.resolve(resolved.resolvedPath);
|
|
1670
|
+
try {
|
|
1671
|
+
await validateCustomPromptContent(promptPath);
|
|
1672
|
+
} catch (error) {
|
|
1673
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
1674
|
+
throw new Error(`Evaluator '${name}' template (${promptPath}): ${message}`);
|
|
1675
|
+
}
|
|
1676
|
+
} else {
|
|
1677
|
+
throw new Error(
|
|
1678
|
+
`Evaluator '${name}' in '${evalId}': prompt file not found: ${resolved.displayPath}`
|
|
1679
|
+
);
|
|
1661
1680
|
}
|
|
1662
1681
|
} else {
|
|
1663
|
-
|
|
1664
|
-
`Inline prompt used for evaluator '${name}' in '${evalId}' (file not found: ${resolved.displayPath})`,
|
|
1665
|
-
resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
|
|
1666
|
-
);
|
|
1682
|
+
prompt = rawPrompt;
|
|
1667
1683
|
}
|
|
1668
1684
|
}
|
|
1669
1685
|
const _model = asString(rawEvaluator.model);
|
|
@@ -3572,7 +3588,7 @@ var OpenAIProvider = class {
|
|
|
3572
3588
|
apiKey: config.apiKey,
|
|
3573
3589
|
baseURL: config.baseURL
|
|
3574
3590
|
});
|
|
3575
|
-
this.model = openai(config.model);
|
|
3591
|
+
this.model = config.apiFormat === "responses" ? openai(config.model) : openai.chat(config.model);
|
|
3576
3592
|
}
|
|
3577
3593
|
id;
|
|
3578
3594
|
kind = "openai";
|
|
@@ -10082,8 +10098,11 @@ function assertTargetDefinition(value, index, filePath) {
|
|
|
10082
10098
|
`targets.yaml entry at index ${index} in ${filePath} is missing a valid 'name'`
|
|
10083
10099
|
);
|
|
10084
10100
|
}
|
|
10085
|
-
|
|
10086
|
-
|
|
10101
|
+
const hasUseTarget = typeof value.use_target === "string" && value.use_target.trim().length > 0;
|
|
10102
|
+
if (!hasUseTarget && (typeof provider !== "string" || provider.trim().length === 0)) {
|
|
10103
|
+
throw new Error(
|
|
10104
|
+
`targets.yaml entry '${name}' in ${filePath} is missing a valid 'provider' (or use use_target for delegation)`
|
|
10105
|
+
);
|
|
10087
10106
|
}
|
|
10088
10107
|
return value;
|
|
10089
10108
|
}
|
|
@@ -15490,10 +15509,20 @@ async function runEvaluation(options) {
|
|
|
15490
15509
|
if (resolvedTargetsByName.has(name)) {
|
|
15491
15510
|
return resolvedTargetsByName.get(name);
|
|
15492
15511
|
}
|
|
15493
|
-
|
|
15512
|
+
let definition = targetDefinitions.get(name);
|
|
15494
15513
|
if (!definition) {
|
|
15495
15514
|
return void 0;
|
|
15496
15515
|
}
|
|
15516
|
+
for (let depth = 0; depth < 5; depth++) {
|
|
15517
|
+
const useTarget = definition.use_target;
|
|
15518
|
+
if (typeof useTarget !== "string" || useTarget.trim().length === 0) break;
|
|
15519
|
+
const envMatch = useTarget.trim().match(/^\$\{\{\s*([A-Z0-9_]+)\s*\}\}$/i);
|
|
15520
|
+
const resolvedName = envMatch ? envLookup[envMatch[1]] ?? "" : useTarget.trim();
|
|
15521
|
+
if (resolvedName.length === 0) break;
|
|
15522
|
+
const next = targetDefinitions.get(resolvedName);
|
|
15523
|
+
if (!next) break;
|
|
15524
|
+
definition = next;
|
|
15525
|
+
}
|
|
15497
15526
|
const resolved = resolveTargetDefinition(definition, envLookup, evalFilePath);
|
|
15498
15527
|
resolvedTargetsByName.set(name, resolved);
|
|
15499
15528
|
return resolved;
|
|
@@ -16498,6 +16527,7 @@ async function runEvalCase(options) {
|
|
|
16498
16527
|
let attempt = 0;
|
|
16499
16528
|
let providerResponse = cachedResponse;
|
|
16500
16529
|
let lastError;
|
|
16530
|
+
let targetUsed;
|
|
16501
16531
|
while (!providerResponse && attempt < attemptBudget) {
|
|
16502
16532
|
try {
|
|
16503
16533
|
providerResponse = await invokeProvider(provider, {
|
|
@@ -16520,25 +16550,33 @@ async function runEvalCase(options) {
|
|
|
16520
16550
|
attempt += 1;
|
|
16521
16551
|
continue;
|
|
16522
16552
|
}
|
|
16523
|
-
|
|
16524
|
-
|
|
16525
|
-
|
|
16526
|
-
|
|
16527
|
-
|
|
16528
|
-
|
|
16529
|
-
|
|
16530
|
-
|
|
16531
|
-
|
|
16532
|
-
|
|
16533
|
-
|
|
16534
|
-
|
|
16535
|
-
|
|
16536
|
-
|
|
16537
|
-
|
|
16538
|
-
|
|
16539
|
-
|
|
16553
|
+
break;
|
|
16554
|
+
}
|
|
16555
|
+
}
|
|
16556
|
+
if (!providerResponse && target.fallbackTargets?.length && targetResolver) {
|
|
16557
|
+
for (const fallbackName of target.fallbackTargets) {
|
|
16558
|
+
const fallbackProvider = targetResolver(fallbackName);
|
|
16559
|
+
if (!fallbackProvider) {
|
|
16560
|
+
continue;
|
|
16561
|
+
}
|
|
16562
|
+
try {
|
|
16563
|
+
providerResponse = await invokeProvider(fallbackProvider, {
|
|
16564
|
+
evalCase,
|
|
16565
|
+
target,
|
|
16566
|
+
promptInputs,
|
|
16567
|
+
attempt: 0,
|
|
16568
|
+
agentTimeoutMs,
|
|
16569
|
+
signal,
|
|
16570
|
+
cwd: workspacePath,
|
|
16571
|
+
workspaceFile: caseWorkspaceFile ?? suiteWorkspaceFile,
|
|
16572
|
+
captureFileChanges: !!baselineCommit,
|
|
16573
|
+
streamCallbacks: options.streamCallbacks
|
|
16574
|
+
});
|
|
16575
|
+
targetUsed = fallbackName;
|
|
16576
|
+
break;
|
|
16577
|
+
} catch (error) {
|
|
16578
|
+
lastError = error;
|
|
16540
16579
|
}
|
|
16541
|
-
return errorResult;
|
|
16542
16580
|
}
|
|
16543
16581
|
}
|
|
16544
16582
|
if (!providerResponse) {
|
|
@@ -16664,8 +16702,10 @@ async function runEvalCase(options) {
|
|
|
16664
16702
|
};
|
|
16665
16703
|
const skippedEvaluatorError = buildSkippedEvaluatorError(result.scores);
|
|
16666
16704
|
const executionStatus = providerError || skippedEvaluatorError ? "execution_error" : classifyQualityStatus(result.score, caseThreshold);
|
|
16705
|
+
const targetUsedField = targetUsed ? { targetUsed } : {};
|
|
16667
16706
|
const finalResult = providerError ? {
|
|
16668
16707
|
...result,
|
|
16708
|
+
...targetUsedField,
|
|
16669
16709
|
evalRun,
|
|
16670
16710
|
error: providerError,
|
|
16671
16711
|
executionStatus,
|
|
@@ -16677,6 +16717,7 @@ async function runEvalCase(options) {
|
|
|
16677
16717
|
afterEachOutput
|
|
16678
16718
|
} : skippedEvaluatorError ? {
|
|
16679
16719
|
...result,
|
|
16720
|
+
...targetUsedField,
|
|
16680
16721
|
score: 0,
|
|
16681
16722
|
evalRun,
|
|
16682
16723
|
error: skippedEvaluatorError,
|
|
@@ -16689,6 +16730,7 @@ async function runEvalCase(options) {
|
|
|
16689
16730
|
afterEachOutput
|
|
16690
16731
|
} : {
|
|
16691
16732
|
...result,
|
|
16733
|
+
...targetUsedField,
|
|
16692
16734
|
evalRun,
|
|
16693
16735
|
executionStatus,
|
|
16694
16736
|
beforeAllOutput,
|