@agentv/core 4.6.0 → 4.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-AIQ5FO4G.js → chunk-ZK4GG7PR.js} +61 -1
- package/dist/chunk-ZK4GG7PR.js.map +1 -0
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +225 -60
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +4 -1
- package/dist/index.d.ts +4 -1
- package/dist/index.js +174 -69
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-AIQ5FO4G.js.map +0 -1
package/dist/index.d.cts
CHANGED
|
@@ -1804,6 +1804,7 @@ interface PiCodingAgentResolvedConfig {
|
|
|
1804
1804
|
readonly subprovider?: string;
|
|
1805
1805
|
readonly model?: string;
|
|
1806
1806
|
readonly apiKey?: string;
|
|
1807
|
+
readonly baseUrl?: string;
|
|
1807
1808
|
readonly tools?: string;
|
|
1808
1809
|
readonly thinking?: string;
|
|
1809
1810
|
readonly cwd?: string;
|
|
@@ -1818,6 +1819,7 @@ interface PiCliResolvedConfig {
|
|
|
1818
1819
|
readonly subprovider?: string;
|
|
1819
1820
|
readonly model?: string;
|
|
1820
1821
|
readonly apiKey?: string;
|
|
1822
|
+
readonly baseUrl?: string;
|
|
1821
1823
|
readonly tools?: string;
|
|
1822
1824
|
readonly thinking?: string;
|
|
1823
1825
|
readonly args?: readonly string[];
|
|
@@ -1936,6 +1938,7 @@ type ResolvedTarget = (ResolvedTargetBase & {
|
|
|
1936
1938
|
* here automatically makes it valid in targets.yaml without a separate update.
|
|
1937
1939
|
*/
|
|
1938
1940
|
declare const COMMON_TARGET_SETTINGS: readonly ["use_target", "provider_batching", "providerBatching", "subagent_mode_allowed", "subagentModeAllowed", "fallback_targets", "fallbackTargets"];
|
|
1941
|
+
declare function resolveDelegatedTargetDefinition(name: string, definitions: ReadonlyMap<string, TargetDefinition>, env?: EnvLookup): TargetDefinition | undefined;
|
|
1939
1942
|
declare function resolveTargetDefinition(definition: TargetDefinition, env?: EnvLookup, evalFilePath?: string): ResolvedTarget;
|
|
1940
1943
|
|
|
1941
1944
|
/**
|
|
@@ -3909,4 +3912,4 @@ type AgentKernel = {
|
|
|
3909
3912
|
};
|
|
3910
3913
|
declare function createAgentKernel(): AgentKernel;
|
|
3911
3914
|
|
|
3912
|
-
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, COMMON_TARGET_SETTINGS, type CacheConfig, type ChildEvaluatorResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type Content, type ContentFile, type ContentImage, type ContentText, type CopilotCliResolvedConfig, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_CATEGORY, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type ProjectEntry, type ProjectRegistry, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addProject, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveCategory, deriveProjectId, detectFormat, discoverAssertions, discoverClaudeSessions, discoverCopilotSessions, discoverGraders, discoverGraders as discoverJudges, discoverProjects, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getProject, getProjectsRegistryPath, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadProjectRegistry, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseAgentSkillsEvals, parseClaudeSession, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, removeProject, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveProjectRegistry, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, touchProject, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
|
|
3915
|
+
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, COMMON_TARGET_SETTINGS, type CacheConfig, type ChildEvaluatorResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type Content, type ContentFile, type ContentImage, type ContentText, type CopilotCliResolvedConfig, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_CATEGORY, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type ProjectEntry, type ProjectRegistry, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addProject, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveCategory, deriveProjectId, detectFormat, discoverAssertions, discoverClaudeSessions, discoverCopilotSessions, discoverGraders, discoverGraders as discoverJudges, discoverProjects, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getProject, getProjectsRegistryPath, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadProjectRegistry, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseAgentSkillsEvals, parseClaudeSession, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, removeProject, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveProjectRegistry, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, touchProject, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
|
package/dist/index.d.ts
CHANGED
|
@@ -1804,6 +1804,7 @@ interface PiCodingAgentResolvedConfig {
|
|
|
1804
1804
|
readonly subprovider?: string;
|
|
1805
1805
|
readonly model?: string;
|
|
1806
1806
|
readonly apiKey?: string;
|
|
1807
|
+
readonly baseUrl?: string;
|
|
1807
1808
|
readonly tools?: string;
|
|
1808
1809
|
readonly thinking?: string;
|
|
1809
1810
|
readonly cwd?: string;
|
|
@@ -1818,6 +1819,7 @@ interface PiCliResolvedConfig {
|
|
|
1818
1819
|
readonly subprovider?: string;
|
|
1819
1820
|
readonly model?: string;
|
|
1820
1821
|
readonly apiKey?: string;
|
|
1822
|
+
readonly baseUrl?: string;
|
|
1821
1823
|
readonly tools?: string;
|
|
1822
1824
|
readonly thinking?: string;
|
|
1823
1825
|
readonly args?: readonly string[];
|
|
@@ -1936,6 +1938,7 @@ type ResolvedTarget = (ResolvedTargetBase & {
|
|
|
1936
1938
|
* here automatically makes it valid in targets.yaml without a separate update.
|
|
1937
1939
|
*/
|
|
1938
1940
|
declare const COMMON_TARGET_SETTINGS: readonly ["use_target", "provider_batching", "providerBatching", "subagent_mode_allowed", "subagentModeAllowed", "fallback_targets", "fallbackTargets"];
|
|
1941
|
+
declare function resolveDelegatedTargetDefinition(name: string, definitions: ReadonlyMap<string, TargetDefinition>, env?: EnvLookup): TargetDefinition | undefined;
|
|
1939
1942
|
declare function resolveTargetDefinition(definition: TargetDefinition, env?: EnvLookup, evalFilePath?: string): ResolvedTarget;
|
|
1940
1943
|
|
|
1941
1944
|
/**
|
|
@@ -3909,4 +3912,4 @@ type AgentKernel = {
|
|
|
3909
3912
|
};
|
|
3910
3913
|
declare function createAgentKernel(): AgentKernel;
|
|
3911
3914
|
|
|
3912
|
-
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, COMMON_TARGET_SETTINGS, type CacheConfig, type ChildEvaluatorResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type Content, type ContentFile, type ContentImage, type ContentText, type CopilotCliResolvedConfig, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_CATEGORY, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type ProjectEntry, type ProjectRegistry, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addProject, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveCategory, deriveProjectId, detectFormat, discoverAssertions, discoverClaudeSessions, discoverCopilotSessions, discoverGraders, discoverGraders as discoverJudges, discoverProjects, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getProject, getProjectsRegistryPath, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadProjectRegistry, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseAgentSkillsEvals, parseClaudeSession, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, removeProject, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveProjectRegistry, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, touchProject, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
|
|
3915
|
+
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, COMMON_TARGET_SETTINGS, type CacheConfig, type ChildEvaluatorResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type Content, type ContentFile, type ContentImage, type ContentText, type CopilotCliResolvedConfig, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_CATEGORY, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type ProjectEntry, type ProjectRegistry, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addProject, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveCategory, deriveProjectId, detectFormat, discoverAssertions, discoverClaudeSessions, discoverCopilotSessions, discoverGraders, discoverGraders as discoverJudges, discoverProjects, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getProject, getProjectsRegistryPath, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadProjectRegistry, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseAgentSkillsEvals, parseClaudeSession, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, removeProject, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveProjectRegistry, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, touchProject, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
|
package/dist/index.js
CHANGED
|
@@ -21,9 +21,10 @@ import {
|
|
|
21
21
|
normalizeLineEndings,
|
|
22
22
|
readJsonFile,
|
|
23
23
|
readTextFile,
|
|
24
|
+
resolveDelegatedTargetDefinition,
|
|
24
25
|
resolveFileReference,
|
|
25
26
|
resolveTargetDefinition
|
|
26
|
-
} from "./chunk-
|
|
27
|
+
} from "./chunk-ZK4GG7PR.js";
|
|
27
28
|
import {
|
|
28
29
|
AgentvProvider
|
|
29
30
|
} from "./chunk-PRNXHNLF.js";
|
|
@@ -5216,15 +5217,16 @@ var CliProvider = class {
|
|
|
5216
5217
|
outputFilePath
|
|
5217
5218
|
);
|
|
5218
5219
|
const renderedCommand = renderTemplate(this.config.command, templateValues);
|
|
5220
|
+
const effectiveCwd = requests[0]?.cwd ?? this.config.cwd;
|
|
5219
5221
|
if (this.verbose) {
|
|
5220
5222
|
console.log(
|
|
5221
|
-
`[cli-provider:${this.targetName}] (batch size=${requests.length}) cwd=${
|
|
5223
|
+
`[cli-provider:${this.targetName}] (batch size=${requests.length}) cwd=${effectiveCwd ?? ""} command=${renderedCommand}`
|
|
5222
5224
|
);
|
|
5223
5225
|
}
|
|
5224
5226
|
try {
|
|
5225
5227
|
const startTime = Date.now();
|
|
5226
5228
|
const result = await this.runCommand(renderedCommand, {
|
|
5227
|
-
cwd:
|
|
5229
|
+
cwd: effectiveCwd,
|
|
5228
5230
|
env: process.env,
|
|
5229
5231
|
timeoutMs: this.config.timeoutMs,
|
|
5230
5232
|
signal: controller.signal
|
|
@@ -5257,7 +5259,7 @@ var CliProvider = class {
|
|
|
5257
5259
|
command: renderedCommand,
|
|
5258
5260
|
stderr: result.stderr,
|
|
5259
5261
|
exitCode: result.exitCode ?? 0,
|
|
5260
|
-
cwd:
|
|
5262
|
+
cwd: effectiveCwd,
|
|
5261
5263
|
outputFile: outputFilePath
|
|
5262
5264
|
}
|
|
5263
5265
|
};
|
|
@@ -5275,7 +5277,7 @@ var CliProvider = class {
|
|
|
5275
5277
|
command: renderedCommand,
|
|
5276
5278
|
stderr: result.stderr,
|
|
5277
5279
|
exitCode: result.exitCode ?? 0,
|
|
5278
|
-
cwd:
|
|
5280
|
+
cwd: effectiveCwd,
|
|
5279
5281
|
outputFile: outputFilePath,
|
|
5280
5282
|
error: errorMessage
|
|
5281
5283
|
}
|
|
@@ -5290,7 +5292,7 @@ var CliProvider = class {
|
|
|
5290
5292
|
command: renderedCommand,
|
|
5291
5293
|
stderr: result.stderr,
|
|
5292
5294
|
exitCode: result.exitCode ?? 0,
|
|
5293
|
-
cwd:
|
|
5295
|
+
cwd: effectiveCwd,
|
|
5294
5296
|
outputFile: outputFilePath,
|
|
5295
5297
|
recordId: evalCaseId
|
|
5296
5298
|
}
|
|
@@ -7240,9 +7242,9 @@ var MockProvider = class {
|
|
|
7240
7242
|
};
|
|
7241
7243
|
|
|
7242
7244
|
// src/evaluation/providers/pi-cli.ts
|
|
7243
|
-
import { spawn as spawn3 } from "node:child_process";
|
|
7245
|
+
import { execSync, spawn as spawn3 } from "node:child_process";
|
|
7244
7246
|
import { randomUUID as randomUUID7 } from "node:crypto";
|
|
7245
|
-
import { createWriteStream as createWriteStream5 } from "node:fs";
|
|
7247
|
+
import { accessSync, createWriteStream as createWriteStream5, readFileSync as readFileSync2 } from "node:fs";
|
|
7246
7248
|
import { mkdir as mkdir6, mkdtemp, rm, writeFile } from "node:fs/promises";
|
|
7247
7249
|
import { tmpdir } from "node:os";
|
|
7248
7250
|
import path19 from "node:path";
|
|
@@ -7300,6 +7302,59 @@ function subscribeToPiLogEntries(listener) {
|
|
|
7300
7302
|
};
|
|
7301
7303
|
}
|
|
7302
7304
|
|
|
7305
|
+
// src/evaluation/providers/pi-provider-aliases.ts
|
|
7306
|
+
var SUBPROVIDER_ALIASES = {
|
|
7307
|
+
azure: "azure-openai-responses"
|
|
7308
|
+
};
|
|
7309
|
+
var SUBPROVIDER_ALIASES_WITH_BASE_URL = {
|
|
7310
|
+
// Azure v1 endpoints are OpenAI-compatible; use the standard client
|
|
7311
|
+
// to avoid AzureOpenAI adding api-version query params.
|
|
7312
|
+
azure: "openai-responses"
|
|
7313
|
+
};
|
|
7314
|
+
var ENV_KEY_MAP = {
|
|
7315
|
+
google: "GEMINI_API_KEY",
|
|
7316
|
+
gemini: "GEMINI_API_KEY",
|
|
7317
|
+
anthropic: "ANTHROPIC_API_KEY",
|
|
7318
|
+
openai: "OPENAI_API_KEY",
|
|
7319
|
+
groq: "GROQ_API_KEY",
|
|
7320
|
+
xai: "XAI_API_KEY",
|
|
7321
|
+
openrouter: "OPENROUTER_API_KEY",
|
|
7322
|
+
azure: "AZURE_OPENAI_API_KEY"
|
|
7323
|
+
};
|
|
7324
|
+
var ENV_BASE_URL_MAP = {
|
|
7325
|
+
openai: "OPENAI_BASE_URL",
|
|
7326
|
+
azure: "AZURE_OPENAI_BASE_URL",
|
|
7327
|
+
openrouter: "OPENROUTER_BASE_URL"
|
|
7328
|
+
};
|
|
7329
|
+
function resolveSubprovider(name, hasBaseUrl = false) {
|
|
7330
|
+
const lower = name.toLowerCase();
|
|
7331
|
+
if (hasBaseUrl) {
|
|
7332
|
+
const alias = SUBPROVIDER_ALIASES_WITH_BASE_URL[lower];
|
|
7333
|
+
if (alias) return alias;
|
|
7334
|
+
}
|
|
7335
|
+
return SUBPROVIDER_ALIASES[lower] ?? name;
|
|
7336
|
+
}
|
|
7337
|
+
function resolveCliProvider(name) {
|
|
7338
|
+
const lower = name.toLowerCase();
|
|
7339
|
+
if (lower === "azure") return "azure-openai-responses";
|
|
7340
|
+
return name;
|
|
7341
|
+
}
|
|
7342
|
+
function resolveEnvKeyName(provider, hasBaseUrl = false) {
|
|
7343
|
+
const lower = provider.toLowerCase();
|
|
7344
|
+
if (hasBaseUrl && lower === "azure") return "OPENAI_API_KEY";
|
|
7345
|
+
return ENV_KEY_MAP[lower];
|
|
7346
|
+
}
|
|
7347
|
+
function resolveEnvBaseUrlName(provider, hasBaseUrl = false) {
|
|
7348
|
+
const lower = provider.toLowerCase();
|
|
7349
|
+
if (hasBaseUrl && lower === "azure") return "OPENAI_BASE_URL";
|
|
7350
|
+
return ENV_BASE_URL_MAP[lower];
|
|
7351
|
+
}
|
|
7352
|
+
function extractAzureResourceName(baseUrl) {
|
|
7353
|
+
const urlMatch = baseUrl.match(/^https?:\/\/([^./]+)/);
|
|
7354
|
+
if (urlMatch) return urlMatch[1];
|
|
7355
|
+
return baseUrl;
|
|
7356
|
+
}
|
|
7357
|
+
|
|
7303
7358
|
// src/evaluation/providers/pi-utils.ts
|
|
7304
7359
|
function extractPiTextContent(content) {
|
|
7305
7360
|
if (typeof content === "string") {
|
|
@@ -7458,12 +7513,12 @@ var PiCliProvider = class {
|
|
|
7458
7513
|
buildPiArgs(prompt, inputFiles) {
|
|
7459
7514
|
const args = [];
|
|
7460
7515
|
if (this.config.subprovider) {
|
|
7461
|
-
args.push("--provider", this.config.subprovider);
|
|
7516
|
+
args.push("--provider", resolveCliProvider(this.config.subprovider));
|
|
7462
7517
|
}
|
|
7463
7518
|
if (this.config.model) {
|
|
7464
7519
|
args.push("--model", this.config.model);
|
|
7465
7520
|
}
|
|
7466
|
-
if (this.config.apiKey) {
|
|
7521
|
+
if (this.config.apiKey && this.config.subprovider?.toLowerCase() !== "azure") {
|
|
7467
7522
|
args.push("--api-key", this.config.apiKey);
|
|
7468
7523
|
}
|
|
7469
7524
|
args.push("--mode", "json");
|
|
@@ -7515,35 +7570,35 @@ ${prompt}` : prompt;
|
|
|
7515
7570
|
}
|
|
7516
7571
|
buildEnv() {
|
|
7517
7572
|
const env = { ...process.env };
|
|
7518
|
-
|
|
7519
|
-
|
|
7520
|
-
|
|
7521
|
-
|
|
7522
|
-
|
|
7523
|
-
|
|
7524
|
-
|
|
7525
|
-
|
|
7526
|
-
|
|
7527
|
-
|
|
7528
|
-
|
|
7529
|
-
|
|
7530
|
-
|
|
7531
|
-
|
|
7573
|
+
const provider = this.config.subprovider?.toLowerCase() ?? "google";
|
|
7574
|
+
if (provider === "azure") {
|
|
7575
|
+
if (this.config.apiKey) {
|
|
7576
|
+
env.AZURE_OPENAI_API_KEY = this.config.apiKey;
|
|
7577
|
+
}
|
|
7578
|
+
if (this.config.baseUrl) {
|
|
7579
|
+
env.AZURE_OPENAI_RESOURCE_NAME = extractAzureResourceName(this.config.baseUrl);
|
|
7580
|
+
}
|
|
7581
|
+
} else {
|
|
7582
|
+
if (this.config.apiKey) {
|
|
7583
|
+
const envKey = resolveEnvKeyName(provider);
|
|
7584
|
+
if (envKey) {
|
|
7585
|
+
env[envKey] = this.config.apiKey;
|
|
7586
|
+
}
|
|
7532
7587
|
}
|
|
7533
7588
|
}
|
|
7534
7589
|
if (this.config.subprovider) {
|
|
7535
|
-
const
|
|
7590
|
+
const resolvedProvider = resolveCliProvider(this.config.subprovider);
|
|
7536
7591
|
const PROVIDER_OWN_PREFIXES = {
|
|
7537
7592
|
openrouter: ["OPENROUTER_"],
|
|
7538
7593
|
anthropic: ["ANTHROPIC_"],
|
|
7539
7594
|
openai: ["OPENAI_"],
|
|
7540
|
-
azure: ["AZURE_OPENAI_"],
|
|
7595
|
+
"azure-openai-responses": ["AZURE_OPENAI_"],
|
|
7541
7596
|
google: ["GEMINI_", "GOOGLE_GENERATIVE_AI_"],
|
|
7542
7597
|
gemini: ["GEMINI_", "GOOGLE_GENERATIVE_AI_"],
|
|
7543
7598
|
groq: ["GROQ_"],
|
|
7544
7599
|
xai: ["XAI_"]
|
|
7545
7600
|
};
|
|
7546
|
-
const ownPrefixes = PROVIDER_OWN_PREFIXES[
|
|
7601
|
+
const ownPrefixes = PROVIDER_OWN_PREFIXES[resolvedProvider] ?? [];
|
|
7547
7602
|
const allOtherPrefixes = Object.entries(PROVIDER_OWN_PREFIXES).filter(([key]) => key !== provider).flatMap(([, prefixes]) => prefixes);
|
|
7548
7603
|
for (const key of Object.keys(env)) {
|
|
7549
7604
|
if (allOtherPrefixes.some((prefix) => key.startsWith(prefix)) && !ownPrefixes.some((prefix) => key.startsWith(prefix))) {
|
|
@@ -7834,6 +7889,24 @@ function extractMessages(events) {
|
|
|
7834
7889
|
}
|
|
7835
7890
|
}
|
|
7836
7891
|
}
|
|
7892
|
+
if (messages) {
|
|
7893
|
+
for (let i = messages.length - 1; i >= 0; i--) {
|
|
7894
|
+
if (messages[i].role === "assistant" && !messages[i].content) {
|
|
7895
|
+
for (let j = events.length - 1; j >= 0; j--) {
|
|
7896
|
+
const evt = events[j];
|
|
7897
|
+
if (!evt || evt.type !== "message_end") continue;
|
|
7898
|
+
const msg = evt.message;
|
|
7899
|
+
if (msg?.role !== "assistant") continue;
|
|
7900
|
+
const text = extractPiTextContent(msg.content);
|
|
7901
|
+
if (text) {
|
|
7902
|
+
messages[i] = { ...messages[i], content: text };
|
|
7903
|
+
break;
|
|
7904
|
+
}
|
|
7905
|
+
}
|
|
7906
|
+
break;
|
|
7907
|
+
}
|
|
7908
|
+
}
|
|
7909
|
+
}
|
|
7837
7910
|
const eventToolCalls = extractToolCallsFromEvents(events);
|
|
7838
7911
|
if (eventToolCalls.length > 0) {
|
|
7839
7912
|
injectEventToolCalls(messages, eventToolCalls);
|
|
@@ -8018,17 +8091,43 @@ function formatTimeoutSuffix3(timeoutMs) {
|
|
|
8018
8091
|
if (!timeoutMs || timeoutMs <= 0) return "";
|
|
8019
8092
|
return ` after ${Math.ceil(timeoutMs / 1e3)}s`;
|
|
8020
8093
|
}
|
|
8094
|
+
function resolveWindowsCmd(executable) {
|
|
8095
|
+
if (process.platform !== "win32") return [executable, []];
|
|
8096
|
+
const lower = executable.toLowerCase();
|
|
8097
|
+
if (lower.endsWith(".js") || lower.endsWith(".exe")) return [executable, []];
|
|
8098
|
+
let fullPath;
|
|
8099
|
+
try {
|
|
8100
|
+
fullPath = execSync(`where ${executable}`, { encoding: "utf-8" }).trim().split(/\r?\n/)[0].trim();
|
|
8101
|
+
} catch {
|
|
8102
|
+
return [executable, []];
|
|
8103
|
+
}
|
|
8104
|
+
const cmdPath = fullPath.endsWith(".cmd") ? fullPath : `${fullPath}.cmd`;
|
|
8105
|
+
try {
|
|
8106
|
+
const content = readFileSync2(cmdPath, "utf-8");
|
|
8107
|
+
const match = content.match(/"?%_prog%"?\s+"([^"]+\.js)"/);
|
|
8108
|
+
if (match) {
|
|
8109
|
+
const dp0 = path19.dirname(path19.resolve(cmdPath));
|
|
8110
|
+
const scriptPath = match[1].replace(/%dp0%[/\\]?/gi, `${dp0}${path19.sep}`);
|
|
8111
|
+
try {
|
|
8112
|
+
accessSync(scriptPath);
|
|
8113
|
+
return ["node", [scriptPath]];
|
|
8114
|
+
} catch {
|
|
8115
|
+
}
|
|
8116
|
+
}
|
|
8117
|
+
} catch {
|
|
8118
|
+
}
|
|
8119
|
+
return [executable, []];
|
|
8120
|
+
}
|
|
8021
8121
|
async function defaultPiRunner(options) {
|
|
8022
8122
|
return await new Promise((resolve, reject) => {
|
|
8023
8123
|
const parts = options.executable.split(/\s+/);
|
|
8024
|
-
const
|
|
8025
|
-
const executableArgs = parts.slice(1);
|
|
8124
|
+
const [resolvedExe, prefixArgs] = resolveWindowsCmd(parts[0]);
|
|
8125
|
+
const executableArgs = [...prefixArgs, ...parts.slice(1)];
|
|
8026
8126
|
const allArgs = [...executableArgs, ...options.args];
|
|
8027
|
-
const child = spawn3(
|
|
8127
|
+
const child = spawn3(resolvedExe, allArgs, {
|
|
8028
8128
|
cwd: options.cwd,
|
|
8029
8129
|
env: options.env,
|
|
8030
|
-
stdio: ["pipe", "pipe", "pipe"]
|
|
8031
|
-
shell: false
|
|
8130
|
+
stdio: ["pipe", "pipe", "pipe"]
|
|
8032
8131
|
});
|
|
8033
8132
|
let stdout = "";
|
|
8034
8133
|
let stderr = "";
|
|
@@ -8083,9 +8182,9 @@ async function defaultPiRunner(options) {
|
|
|
8083
8182
|
}
|
|
8084
8183
|
|
|
8085
8184
|
// src/evaluation/providers/pi-coding-agent.ts
|
|
8086
|
-
import { execSync } from "node:child_process";
|
|
8185
|
+
import { execSync as execSync2 } from "node:child_process";
|
|
8087
8186
|
import { randomUUID as randomUUID8 } from "node:crypto";
|
|
8088
|
-
import { accessSync, createWriteStream as createWriteStream6 } from "node:fs";
|
|
8187
|
+
import { accessSync as accessSync2, createWriteStream as createWriteStream6 } from "node:fs";
|
|
8089
8188
|
import { mkdir as mkdir7 } from "node:fs/promises";
|
|
8090
8189
|
import path20 from "node:path";
|
|
8091
8190
|
import { createInterface } from "node:readline";
|
|
@@ -8113,7 +8212,7 @@ function findAgentvRoot() {
|
|
|
8113
8212
|
for (let i = 0; i < 10; i++) {
|
|
8114
8213
|
try {
|
|
8115
8214
|
const pkg = path20.join(dir, "package.json");
|
|
8116
|
-
|
|
8215
|
+
accessSync2(pkg);
|
|
8117
8216
|
return dir;
|
|
8118
8217
|
} catch {
|
|
8119
8218
|
const parent = path20.dirname(dir);
|
|
@@ -8133,7 +8232,7 @@ async function doLoadSdkModules() {
|
|
|
8133
8232
|
if (await promptInstall()) {
|
|
8134
8233
|
const installDir = findAgentvRoot();
|
|
8135
8234
|
console.error(`Installing @mariozechner/pi-coding-agent into ${installDir}...`);
|
|
8136
|
-
|
|
8235
|
+
execSync2("bun add @mariozechner/pi-coding-agent", {
|
|
8137
8236
|
cwd: installDir,
|
|
8138
8237
|
stdio: "inherit"
|
|
8139
8238
|
});
|
|
@@ -8174,7 +8273,9 @@ async function loadSdkModules() {
|
|
|
8174
8273
|
codingTools: piSdk.codingTools,
|
|
8175
8274
|
toolMap,
|
|
8176
8275
|
SessionManager: piSdk.SessionManager,
|
|
8177
|
-
getModel: piAi.getModel
|
|
8276
|
+
getModel: piAi.getModel,
|
|
8277
|
+
// biome-ignore lint/suspicious/noExplicitAny: registerBuiltInApiProviders exists at runtime but not in type defs
|
|
8278
|
+
registerBuiltInApiProviders: piAi.registerBuiltInApiProviders
|
|
8178
8279
|
};
|
|
8179
8280
|
}
|
|
8180
8281
|
var PiCodingAgentProvider = class {
|
|
@@ -8196,17 +8297,31 @@ var PiCodingAgentProvider = class {
|
|
|
8196
8297
|
const startTime = (/* @__PURE__ */ new Date()).toISOString();
|
|
8197
8298
|
const startMs = Date.now();
|
|
8198
8299
|
const sdk = await loadSdkModules();
|
|
8300
|
+
sdk.registerBuiltInApiProviders();
|
|
8199
8301
|
const logger = await this.createStreamLogger(request).catch(() => void 0);
|
|
8200
8302
|
try {
|
|
8201
8303
|
const cwd = this.resolveCwd(request.cwd);
|
|
8202
|
-
const
|
|
8304
|
+
const rawProvider = this.config.subprovider ?? "google";
|
|
8305
|
+
const hasBaseUrl = !!this.config.baseUrl;
|
|
8306
|
+
const providerName = resolveSubprovider(rawProvider, hasBaseUrl);
|
|
8203
8307
|
const modelId = this.config.model ?? "gemini-2.5-flash";
|
|
8204
|
-
this.setApiKeyEnv(
|
|
8205
|
-
|
|
8308
|
+
this.setApiKeyEnv(rawProvider, hasBaseUrl);
|
|
8309
|
+
this.setBaseUrlEnv(rawProvider, hasBaseUrl);
|
|
8310
|
+
let model = sdk.getModel(providerName, modelId);
|
|
8206
8311
|
if (!model) {
|
|
8207
|
-
|
|
8208
|
-
|
|
8209
|
-
|
|
8312
|
+
const envProvider = providerName.replace(/-responses$/, "");
|
|
8313
|
+
model = {
|
|
8314
|
+
id: modelId,
|
|
8315
|
+
name: modelId,
|
|
8316
|
+
api: providerName,
|
|
8317
|
+
provider: envProvider,
|
|
8318
|
+
baseUrl: this.config.baseUrl ?? "",
|
|
8319
|
+
reasoning: false,
|
|
8320
|
+
input: ["text"],
|
|
8321
|
+
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
|
|
8322
|
+
contextWindow: 128e3,
|
|
8323
|
+
maxTokens: 16384
|
|
8324
|
+
};
|
|
8210
8325
|
}
|
|
8211
8326
|
const tools = this.resolveTools(sdk);
|
|
8212
8327
|
const { session } = await sdk.createAgentSession({
|
|
@@ -8359,22 +8474,21 @@ ${fileList}`;
|
|
|
8359
8474
|
}
|
|
8360
8475
|
}
|
|
8361
8476
|
/** Maps config apiKey to the provider-specific env var the SDK reads. */
|
|
8362
|
-
setApiKeyEnv(providerName) {
|
|
8477
|
+
setApiKeyEnv(providerName, hasBaseUrl = false) {
|
|
8363
8478
|
if (!this.config.apiKey) return;
|
|
8364
|
-
const
|
|
8365
|
-
google: "GEMINI_API_KEY",
|
|
8366
|
-
gemini: "GEMINI_API_KEY",
|
|
8367
|
-
anthropic: "ANTHROPIC_API_KEY",
|
|
8368
|
-
openai: "OPENAI_API_KEY",
|
|
8369
|
-
groq: "GROQ_API_KEY",
|
|
8370
|
-
xai: "XAI_API_KEY",
|
|
8371
|
-
openrouter: "OPENROUTER_API_KEY"
|
|
8372
|
-
};
|
|
8373
|
-
const envKey = ENV_KEY_MAP[providerName.toLowerCase()];
|
|
8479
|
+
const envKey = resolveEnvKeyName(providerName, hasBaseUrl);
|
|
8374
8480
|
if (envKey) {
|
|
8375
8481
|
process.env[envKey] = this.config.apiKey;
|
|
8376
8482
|
}
|
|
8377
8483
|
}
|
|
8484
|
+
/** Maps config baseUrl to the provider-specific env var the SDK reads. */
|
|
8485
|
+
setBaseUrlEnv(providerName, hasBaseUrl = false) {
|
|
8486
|
+
if (!this.config.baseUrl) return;
|
|
8487
|
+
const envKey = resolveEnvBaseUrlName(providerName, hasBaseUrl);
|
|
8488
|
+
if (envKey) {
|
|
8489
|
+
process.env[envKey] = this.config.baseUrl;
|
|
8490
|
+
}
|
|
8491
|
+
}
|
|
8378
8492
|
resolveCwd(cwdOverride) {
|
|
8379
8493
|
if (cwdOverride) {
|
|
8380
8494
|
return path20.resolve(cwdOverride);
|
|
@@ -15509,20 +15623,10 @@ async function runEvaluation(options) {
|
|
|
15509
15623
|
if (resolvedTargetsByName.has(name)) {
|
|
15510
15624
|
return resolvedTargetsByName.get(name);
|
|
15511
15625
|
}
|
|
15512
|
-
|
|
15626
|
+
const definition = resolveDelegatedTargetDefinition(name, targetDefinitions, envLookup);
|
|
15513
15627
|
if (!definition) {
|
|
15514
15628
|
return void 0;
|
|
15515
15629
|
}
|
|
15516
|
-
for (let depth = 0; depth < 5; depth++) {
|
|
15517
|
-
const useTarget = definition.use_target;
|
|
15518
|
-
if (typeof useTarget !== "string" || useTarget.trim().length === 0) break;
|
|
15519
|
-
const envMatch = useTarget.trim().match(/^\$\{\{\s*([A-Z0-9_]+)\s*\}\}$/i);
|
|
15520
|
-
const resolvedName = envMatch ? envLookup[envMatch[1]] ?? "" : useTarget.trim();
|
|
15521
|
-
if (resolvedName.length === 0) break;
|
|
15522
|
-
const next = targetDefinitions.get(resolvedName);
|
|
15523
|
-
if (!next) break;
|
|
15524
|
-
definition = next;
|
|
15525
|
-
}
|
|
15526
15630
|
const resolved = resolveTargetDefinition(definition, envLookup, evalFilePath);
|
|
15527
15631
|
resolvedTargetsByName.set(name, resolved);
|
|
15528
15632
|
return resolved;
|
|
@@ -17608,7 +17712,7 @@ async function discoverDefaultTarget(repoRoot) {
|
|
|
17608
17712
|
return null;
|
|
17609
17713
|
}
|
|
17610
17714
|
async function loadEnvHierarchy(repoRoot, startPath) {
|
|
17611
|
-
const { readFileSync:
|
|
17715
|
+
const { readFileSync: readFileSync4 } = await import("node:fs");
|
|
17612
17716
|
const chain = buildDirectoryChain(startPath, repoRoot);
|
|
17613
17717
|
const envFiles = [];
|
|
17614
17718
|
for (const dir of chain) {
|
|
@@ -17617,7 +17721,7 @@ async function loadEnvHierarchy(repoRoot, startPath) {
|
|
|
17617
17721
|
}
|
|
17618
17722
|
for (let i = 0; i < envFiles.length; i++) {
|
|
17619
17723
|
try {
|
|
17620
|
-
const content =
|
|
17724
|
+
const content = readFileSync4(envFiles[i], "utf8");
|
|
17621
17725
|
for (const line of content.split("\n")) {
|
|
17622
17726
|
const trimmed = line.trim();
|
|
17623
17727
|
if (!trimmed || trimmed.startsWith("#")) continue;
|
|
@@ -17832,7 +17936,7 @@ function shouldSkipCacheForTemperature(targetConfig) {
|
|
|
17832
17936
|
}
|
|
17833
17937
|
|
|
17834
17938
|
// src/projects.ts
|
|
17835
|
-
import { existsSync as existsSync6, mkdirSync, readFileSync as
|
|
17939
|
+
import { existsSync as existsSync6, mkdirSync, readFileSync as readFileSync3, readdirSync as readdirSync3, statSync as statSync2, writeFileSync } from "node:fs";
|
|
17836
17940
|
import path47 from "node:path";
|
|
17837
17941
|
import { parse as parseYaml3, stringify as stringifyYaml } from "yaml";
|
|
17838
17942
|
function getProjectsRegistryPath() {
|
|
@@ -17844,7 +17948,7 @@ function loadProjectRegistry() {
|
|
|
17844
17948
|
return { projects: [] };
|
|
17845
17949
|
}
|
|
17846
17950
|
try {
|
|
17847
|
-
const raw =
|
|
17951
|
+
const raw = readFileSync3(registryPath, "utf-8");
|
|
17848
17952
|
const parsed = parseYaml3(raw);
|
|
17849
17953
|
if (!parsed || !Array.isArray(parsed.projects)) {
|
|
17850
17954
|
return { projects: [] };
|
|
@@ -18881,6 +18985,7 @@ export {
|
|
|
18881
18985
|
readTranscriptFile,
|
|
18882
18986
|
removeProject,
|
|
18883
18987
|
resolveAndCreateProvider,
|
|
18988
|
+
resolveDelegatedTargetDefinition,
|
|
18884
18989
|
resolveFileReference,
|
|
18885
18990
|
resolveTargetDefinition,
|
|
18886
18991
|
resolveWorkspaceTemplate,
|