@agentv/core 4.25.1 → 4.25.2-next.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agentv-provider-MUIGGIP3.js +7 -0
- package/dist/chunk-5XV3FAAD.js +616 -0
- package/dist/chunk-5XV3FAAD.js.map +1 -0
- package/dist/{chunk-6HLBKYE2.js → chunk-CALQDF2Y.js} +1 -1
- package/dist/chunk-CALQDF2Y.js.map +1 -0
- package/dist/{chunk-IXTJEXWN.js → chunk-F234XBWV.js} +185 -551
- package/dist/chunk-F234XBWV.js.map +1 -0
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +589 -419
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +78 -8
- package/dist/index.d.ts +78 -8
- package/dist/index.js +7 -12
- package/dist/index.js.map +1 -1
- package/dist/ts-eval-loader-5JMF2N65.js +12 -0
- package/package.json +2 -7
- package/dist/agentv-provider-TXM4UEUT.js +0 -7
- package/dist/chunk-6HLBKYE2.js.map +0 -1
- package/dist/chunk-IXTJEXWN.js.map +0 -1
- package/dist/chunk-PRNXHNLF.js +0 -65
- package/dist/chunk-PRNXHNLF.js.map +0 -1
- package/dist/ts-eval-loader-4CFPGHGT.js +0 -12
- /package/dist/{agentv-provider-TXM4UEUT.js.map → agentv-provider-MUIGGIP3.js.map} +0 -0
- /package/dist/{ts-eval-loader-4CFPGHGT.js.map → ts-eval-loader-5JMF2N65.js.map} +0 -0
package/dist/index.d.cts
CHANGED
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
import * as ai from 'ai';
|
|
2
1
|
import { z } from 'zod';
|
|
3
2
|
|
|
4
3
|
/**
|
|
@@ -85,6 +84,25 @@ interface ProviderStreamCallbacks {
|
|
|
85
84
|
rootSpanId: string;
|
|
86
85
|
} | null;
|
|
87
86
|
}
|
|
87
|
+
/**
|
|
88
|
+
* A tool the model may call during multi-step provider execution. Pi-ai-shaped:
|
|
89
|
+
* the parameter shape is JSON Schema (provider-library-neutral wire format),
|
|
90
|
+
* and execute() is invoked by the provider once the model emits a tool call.
|
|
91
|
+
*/
|
|
92
|
+
interface ProviderTool {
|
|
93
|
+
/** Tool name as shown to the model. */
|
|
94
|
+
readonly name: string;
|
|
95
|
+
/** Tool description as shown to the model. */
|
|
96
|
+
readonly description: string;
|
|
97
|
+
/** JSON Schema for the tool's input. */
|
|
98
|
+
readonly parameters: JsonObject;
|
|
99
|
+
/**
|
|
100
|
+
* Executes the tool. Receives the parsed input the model produced. Errors
|
|
101
|
+
* are caught and surfaced to the model as tool-error results; the loop
|
|
102
|
+
* continues unless `maxSteps` is reached.
|
|
103
|
+
*/
|
|
104
|
+
execute(input: unknown): Promise<unknown> | unknown;
|
|
105
|
+
}
|
|
88
106
|
interface ProviderRequest {
|
|
89
107
|
readonly question: string;
|
|
90
108
|
readonly systemPrompt?: string;
|
|
@@ -109,6 +127,24 @@ interface ProviderRequest {
|
|
|
109
127
|
readonly parentSpanId: string;
|
|
110
128
|
readonly rootSpanId: string;
|
|
111
129
|
};
|
|
130
|
+
/**
|
|
131
|
+
* Tools the model may call. When provided, the provider runs the agent loop:
|
|
132
|
+
* model turn → tool execution → model turn, repeated until the model returns
|
|
133
|
+
* no further tool calls or `maxSteps` is reached. Required for built-in agent
|
|
134
|
+
* grader mode (filesystem-introspection rubrics).
|
|
135
|
+
*/
|
|
136
|
+
readonly tools?: readonly ProviderTool[];
|
|
137
|
+
/**
|
|
138
|
+
* Maximum number of agent loop iterations (model turn + tool execution = one
|
|
139
|
+
* step). Required when `tools` is non-empty. Ignored otherwise.
|
|
140
|
+
*/
|
|
141
|
+
readonly maxSteps?: number;
|
|
142
|
+
/**
|
|
143
|
+
* Image inputs appended to the last user turn. Used by graders that judge
|
|
144
|
+
* screenshot/image content (e.g. red-team UI evals). Providers that do not
|
|
145
|
+
* support multimodal input should drop these gracefully.
|
|
146
|
+
*/
|
|
147
|
+
readonly images?: readonly ContentImage[];
|
|
112
148
|
}
|
|
113
149
|
/**
|
|
114
150
|
* A tool call within an output message.
|
|
@@ -169,6 +205,16 @@ interface ProviderTokenUsage {
|
|
|
169
205
|
/** Reasoning/thinking tokens (optional, provider-specific) */
|
|
170
206
|
readonly reasoning?: number;
|
|
171
207
|
}
|
|
208
|
+
/**
|
|
209
|
+
* Per-step trace summary for tool-using provider calls. Populated only when
|
|
210
|
+
* the request had `tools`. Single-shot calls leave `steps` undefined.
|
|
211
|
+
*/
|
|
212
|
+
interface ProviderStepInfo {
|
|
213
|
+
/** Number of agent loop steps executed (1 = single model turn, no tool calls). */
|
|
214
|
+
readonly count: number;
|
|
215
|
+
/** Total tool calls across all steps. */
|
|
216
|
+
readonly toolCallCount: number;
|
|
217
|
+
}
|
|
172
218
|
interface ProviderResponse {
|
|
173
219
|
readonly raw?: unknown;
|
|
174
220
|
readonly usage?: JsonObject;
|
|
@@ -184,6 +230,8 @@ interface ProviderResponse {
|
|
|
184
230
|
readonly startTime?: string;
|
|
185
231
|
/** ISO 8601 timestamp when execution ended (optional) */
|
|
186
232
|
readonly endTime?: string;
|
|
233
|
+
/** Multi-step trace summary; populated only when the request used `tools`. */
|
|
234
|
+
readonly steps?: ProviderStepInfo;
|
|
187
235
|
/**
|
|
188
236
|
* Synthetic unified diff of files generated by the provider outside the
|
|
189
237
|
* eval workspace_path (e.g. copilot session-state artifacts in
|
|
@@ -216,11 +264,6 @@ interface Provider {
|
|
|
216
264
|
* the orchestrator may send multiple requests in a single provider session.
|
|
217
265
|
*/
|
|
218
266
|
invokeBatch?(requests: readonly ProviderRequest[]): Promise<readonly ProviderResponse[]>;
|
|
219
|
-
/**
|
|
220
|
-
* Optional method to get a Vercel AI SDK LanguageModel instance for structured output generation.
|
|
221
|
-
* Used by evaluators that need generateObject/generateText from the AI SDK.
|
|
222
|
-
*/
|
|
223
|
-
asLanguageModel?(): ai.LanguageModel;
|
|
224
267
|
}
|
|
225
268
|
type EnvLookup = Readonly<Record<string, string | undefined>>;
|
|
226
269
|
interface TargetDefinition {
|
|
@@ -676,6 +719,24 @@ type DockerWorkspaceConfig = {
|
|
|
676
719
|
/** CPU limit (e.g. 2, 0.5) */
|
|
677
720
|
readonly cpus?: number;
|
|
678
721
|
};
|
|
722
|
+
/**
|
|
723
|
+
* Preflight environment requirements for the workspace.
|
|
724
|
+
* Checked once before before_all hooks run. Fails fast if anything is missing.
|
|
725
|
+
*
|
|
726
|
+
* @example
|
|
727
|
+
* ```yaml
|
|
728
|
+
* workspace:
|
|
729
|
+
* env:
|
|
730
|
+
* required_commands: [ffmpeg, pandoc]
|
|
731
|
+
* required_python_modules: [PIL, openai]
|
|
732
|
+
* ```
|
|
733
|
+
*/
|
|
734
|
+
type WorkspaceEnvConfig = {
|
|
735
|
+
/** Shell commands that must be present in PATH (checked via `command -v`) */
|
|
736
|
+
readonly required_commands?: readonly string[];
|
|
737
|
+
/** Python modules that must be importable (checked via `python3 -c "import <module>"`) */
|
|
738
|
+
readonly required_python_modules?: readonly string[];
|
|
739
|
+
};
|
|
679
740
|
type WorkspaceConfig = {
|
|
680
741
|
/** Template directory or .code-workspace file. Directories are copied to temp workspace.
|
|
681
742
|
* .code-workspace files are used by VS Code providers; CLI providers use the parent directory. */
|
|
@@ -696,6 +757,8 @@ type WorkspaceConfig = {
|
|
|
696
757
|
* Used as default cwd for hook commands so that file-referenced templates resolve
|
|
697
758
|
* relative paths from their own directory, not the eval file's directory. */
|
|
698
759
|
readonly workspaceFileDir?: string;
|
|
760
|
+
/** Preflight environment requirements. Checked before before_all hooks run. */
|
|
761
|
+
readonly env?: WorkspaceEnvConfig;
|
|
699
762
|
};
|
|
700
763
|
type CodeGraderConfig = {
|
|
701
764
|
readonly name: string;
|
|
@@ -3006,7 +3069,10 @@ declare class LlmGrader implements Grader {
|
|
|
3006
3069
|
*/
|
|
3007
3070
|
private evaluateWithScoreRanges;
|
|
3008
3071
|
/**
|
|
3009
|
-
* Built-in mode:
|
|
3072
|
+
* Built-in mode: drives the grader through provider.invoke() with the
|
|
3073
|
+
* sandboxed filesystem tools and a step budget. The pi-ai-backed agentv
|
|
3074
|
+
* provider runs the agent loop (tool call → tool execute → next model
|
|
3075
|
+
* turn) until the model stops requesting tools or maxSteps is hit.
|
|
3010
3076
|
*/
|
|
3011
3077
|
private evaluateBuiltIn;
|
|
3012
3078
|
/**
|
|
@@ -3699,6 +3765,10 @@ interface GenerateRubricsOptions {
|
|
|
3699
3765
|
}
|
|
3700
3766
|
/**
|
|
3701
3767
|
* Generate rubrics from expected outcome using an LLM.
|
|
3768
|
+
*
|
|
3769
|
+
* Calls the provider through `Provider.invoke()` — the LLM call itself is
|
|
3770
|
+
* a single non-streaming, non-tool-using completion. JSON output is parsed
|
|
3771
|
+
* with up to 3 retries to absorb model formatting variance.
|
|
3702
3772
|
*/
|
|
3703
3773
|
declare function generateRubrics(options: GenerateRubricsOptions): Promise<readonly RubricItem[]>;
|
|
3704
3774
|
|
|
@@ -4798,4 +4868,4 @@ type AgentKernel = {
|
|
|
4798
4868
|
};
|
|
4799
4869
|
declare function createAgentKernel(): AgentKernel;
|
|
4800
4870
|
|
|
4801
|
-
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type BenchmarkEntry, type BenchmarkRegistry, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildGraderResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeGrader, type CodeGraderConfig, type CodeGraderOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeGrader, type CompositeGraderConfig, type CompositeGraderOptions, type ConfidenceIntervalAggregation, type ContainsAllGraderConfig, type ContainsAnyGraderConfig, type ContainsGraderConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type ConversationAggregation, type ConversationMode, type ConversationTurn, type ConversationTurnInput, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostGrader, type CostGraderConfig, type CostGraderOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_GRADER_TEMPLATE, DEFAULT_THRESHOLD, type DependencyFailurePolicy, type DependencyResult, type DepsScanResult, DeterministicAssertionGrader, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithGraderConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsGraderConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type 
EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTargetRef, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsGrader, type ExecutionMetricsGraderConfig, type ExecutionMetricsGraderOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyGrader, type FieldAccuracyGraderConfig, type FieldAccuracyGraderOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type Grader, type GraderConfig, type GraderDispatchContext, type GraderFactory, type GraderFactoryFn, type GraderKind, GraderRegistry, type GraderResult, type IcontainsAllGraderConfig, type IcontainsAnyGraderConfig, type IcontainsGraderConfig, type InlineAssertEvaluatorConfig, type IsJsonGraderConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyGrader, type LatencyGraderConfig, type LatencyGraderOptions, LlmGrader, type LlmGraderConfig, type LlmGraderOptions, type LlmGraderPromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type PreparedResultsRepoBranch, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexGraderConfig, type 
RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, RunBudgetTracker, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerGrader, type SkillTriggerGraderConfig, type StartsWithGraderConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TargetHooksConfig, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageGrader, type TokenUsageGraderConfig, type TokenUsageGraderOptions, type ToolCall, type ToolTestMessage, type ToolTrajectoryExpectedItem, ToolTrajectoryGrader, type ToolTrajectoryGraderConfig, type ToolTrajectoryGraderOptions, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptReplayEntry, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type TsEvalResult, type TurnFailurePolicy, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addBenchmark, assembleLlmGraderPrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, 
createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveBenchmarkId, deriveCategory, detectFormat, directPushResults, directorySizeBytes, discoverAssertions, discoverBenchmarks, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetRefsFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, formatToolCalls, freeformEvaluationSchema, generateRubrics, getAgentvConfigDir, getAgentvHome, getBenchmark, getBenchmarksRegistryPath, getOutputFilenames, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, groupTranscriptJsonLines, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isGraderKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadBenchmarkRegistry, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, loadTsEvalFile, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseEnvOutput, parseJsonFromText, parseJsonSafe, parseYamlValue, prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeBenchmark, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, 
resolveResultsRepoRunsDir, resolveResultsRepoUrl, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runBeforeSessionHook, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveBenchmarkRegistry, scanRepoDeps, scoreRangeEvaluationSchema, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLines, tokensPerTool, touchBenchmark, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
|
|
4871
|
+
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type BenchmarkEntry, type BenchmarkRegistry, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildGraderResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeGrader, type CodeGraderConfig, type CodeGraderOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeGrader, type CompositeGraderConfig, type CompositeGraderOptions, type ConfidenceIntervalAggregation, type ContainsAllGraderConfig, type ContainsAnyGraderConfig, type ContainsGraderConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type ConversationAggregation, type ConversationMode, type ConversationTurn, type ConversationTurnInput, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostGrader, type CostGraderConfig, type CostGraderOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_GRADER_TEMPLATE, DEFAULT_THRESHOLD, type DependencyFailurePolicy, type DependencyResult, type DepsScanResult, DeterministicAssertionGrader, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithGraderConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsGraderConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type 
EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTargetRef, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsGrader, type ExecutionMetricsGraderConfig, type ExecutionMetricsGraderOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyGrader, type FieldAccuracyGraderConfig, type FieldAccuracyGraderOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type Grader, type GraderConfig, type GraderDispatchContext, type GraderFactory, type GraderFactoryFn, type GraderKind, GraderRegistry, type GraderResult, type IcontainsAllGraderConfig, type IcontainsAnyGraderConfig, type IcontainsGraderConfig, type InlineAssertEvaluatorConfig, type IsJsonGraderConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyGrader, type LatencyGraderConfig, type LatencyGraderOptions, LlmGrader, type LlmGraderConfig, type LlmGraderOptions, type LlmGraderPromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type PreparedResultsRepoBranch, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexGraderConfig, type 
RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, RunBudgetTracker, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerGrader, type SkillTriggerGraderConfig, type StartsWithGraderConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TargetHooksConfig, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageGrader, type TokenUsageGraderConfig, type TokenUsageGraderOptions, type ToolCall, type ToolTestMessage, type ToolTrajectoryExpectedItem, ToolTrajectoryGrader, type ToolTrajectoryGraderConfig, type ToolTrajectoryGraderOptions, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptReplayEntry, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type TsEvalResult, type TurnFailurePolicy, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceEnvConfig, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addBenchmark, assembleLlmGraderPrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, 
consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveBenchmarkId, deriveCategory, detectFormat, directPushResults, directorySizeBytes, discoverAssertions, discoverBenchmarks, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetRefsFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, formatToolCalls, freeformEvaluationSchema, generateRubrics, getAgentvConfigDir, getAgentvHome, getBenchmark, getBenchmarksRegistryPath, getOutputFilenames, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, groupTranscriptJsonLines, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isGraderKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadBenchmarkRegistry, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, loadTsEvalFile, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseEnvOutput, parseJsonFromText, parseJsonSafe, parseYamlValue, prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeBenchmark, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, 
resolveResultsRepoRunsDir, resolveResultsRepoUrl, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runBeforeSessionHook, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveBenchmarkRegistry, scanRepoDeps, scoreRangeEvaluationSchema, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLines, tokensPerTool, touchBenchmark, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
|
package/dist/index.d.ts
CHANGED
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
import * as ai from 'ai';
|
|
2
1
|
import { z } from 'zod';
|
|
3
2
|
|
|
4
3
|
/**
|
|
@@ -85,6 +84,25 @@ interface ProviderStreamCallbacks {
|
|
|
85
84
|
rootSpanId: string;
|
|
86
85
|
} | null;
|
|
87
86
|
}
|
|
87
|
+
/**
|
|
88
|
+
* A tool the model may call during multi-step provider execution. Pi-ai-shaped:
|
|
89
|
+
* the parameter shape is JSON Schema (provider-library-neutral wire format),
|
|
90
|
+
* and execute() is invoked by the provider once the model emits a tool call.
|
|
91
|
+
*/
|
|
92
|
+
interface ProviderTool {
|
|
93
|
+
/** Tool name as shown to the model. */
|
|
94
|
+
readonly name: string;
|
|
95
|
+
/** Tool description as shown to the model. */
|
|
96
|
+
readonly description: string;
|
|
97
|
+
/** JSON Schema for the tool's input. */
|
|
98
|
+
readonly parameters: JsonObject;
|
|
99
|
+
/**
|
|
100
|
+
* Executes the tool. Receives the parsed input the model produced. Errors
|
|
101
|
+
* are caught and surfaced to the model as tool-error results; the loop
|
|
102
|
+
* continues unless `maxSteps` is reached.
|
|
103
|
+
*/
|
|
104
|
+
execute(input: unknown): Promise<unknown> | unknown;
|
|
105
|
+
}
|
|
88
106
|
interface ProviderRequest {
|
|
89
107
|
readonly question: string;
|
|
90
108
|
readonly systemPrompt?: string;
|
|
@@ -109,6 +127,24 @@ interface ProviderRequest {
|
|
|
109
127
|
readonly parentSpanId: string;
|
|
110
128
|
readonly rootSpanId: string;
|
|
111
129
|
};
|
|
130
|
+
/**
|
|
131
|
+
* Tools the model may call. When provided, the provider runs the agent loop:
|
|
132
|
+
* model turn → tool execution → model turn, repeated until the model returns
|
|
133
|
+
* no further tool calls or `maxSteps` is reached. Required for built-in agent
|
|
134
|
+
* grader mode (filesystem-introspection rubrics).
|
|
135
|
+
*/
|
|
136
|
+
readonly tools?: readonly ProviderTool[];
|
|
137
|
+
/**
|
|
138
|
+
* Maximum number of agent loop iterations (model turn + tool execution = one
|
|
139
|
+
* step). Required when `tools` is non-empty. Ignored otherwise.
|
|
140
|
+
*/
|
|
141
|
+
readonly maxSteps?: number;
|
|
142
|
+
/**
|
|
143
|
+
* Image inputs appended to the last user turn. Used by graders that judge
|
|
144
|
+
* screenshot/image content (e.g. red-team UI evals). Providers that do not
|
|
145
|
+
* support multimodal input should drop these gracefully.
|
|
146
|
+
*/
|
|
147
|
+
readonly images?: readonly ContentImage[];
|
|
112
148
|
}
|
|
113
149
|
/**
|
|
114
150
|
* A tool call within an output message.
|
|
@@ -169,6 +205,16 @@ interface ProviderTokenUsage {
|
|
|
169
205
|
/** Reasoning/thinking tokens (optional, provider-specific) */
|
|
170
206
|
readonly reasoning?: number;
|
|
171
207
|
}
|
|
208
|
+
/**
|
|
209
|
+
* Per-step trace summary for tool-using provider calls. Populated only when
|
|
210
|
+
* the request had `tools`. Single-shot calls leave `steps` undefined.
|
|
211
|
+
*/
|
|
212
|
+
interface ProviderStepInfo {
|
|
213
|
+
/** Number of agent loop steps executed (1 = single model turn, no tool calls). */
|
|
214
|
+
readonly count: number;
|
|
215
|
+
/** Total tool calls across all steps. */
|
|
216
|
+
readonly toolCallCount: number;
|
|
217
|
+
}
|
|
172
218
|
interface ProviderResponse {
|
|
173
219
|
readonly raw?: unknown;
|
|
174
220
|
readonly usage?: JsonObject;
|
|
@@ -184,6 +230,8 @@ interface ProviderResponse {
|
|
|
184
230
|
readonly startTime?: string;
|
|
185
231
|
/** ISO 8601 timestamp when execution ended (optional) */
|
|
186
232
|
readonly endTime?: string;
|
|
233
|
+
/** Multi-step trace summary; populated only when the request used `tools`. */
|
|
234
|
+
readonly steps?: ProviderStepInfo;
|
|
187
235
|
/**
|
|
188
236
|
* Synthetic unified diff of files generated by the provider outside the
|
|
189
237
|
* eval workspace_path (e.g. copilot session-state artifacts in
|
|
@@ -216,11 +264,6 @@ interface Provider {
|
|
|
216
264
|
* the orchestrator may send multiple requests in a single provider session.
|
|
217
265
|
*/
|
|
218
266
|
invokeBatch?(requests: readonly ProviderRequest[]): Promise<readonly ProviderResponse[]>;
|
|
219
|
-
/**
|
|
220
|
-
* Optional method to get a Vercel AI SDK LanguageModel instance for structured output generation.
|
|
221
|
-
* Used by evaluators that need generateObject/generateText from the AI SDK.
|
|
222
|
-
*/
|
|
223
|
-
asLanguageModel?(): ai.LanguageModel;
|
|
224
267
|
}
|
|
225
268
|
type EnvLookup = Readonly<Record<string, string | undefined>>;
|
|
226
269
|
interface TargetDefinition {
|
|
@@ -676,6 +719,24 @@ type DockerWorkspaceConfig = {
|
|
|
676
719
|
/** CPU limit (e.g. 2, 0.5) */
|
|
677
720
|
readonly cpus?: number;
|
|
678
721
|
};
|
|
722
|
+
/**
|
|
723
|
+
* Preflight environment requirements for the workspace.
|
|
724
|
+
* Checked once before before_all hooks run. Fails fast if anything is missing.
|
|
725
|
+
*
|
|
726
|
+
* @example
|
|
727
|
+
* ```yaml
|
|
728
|
+
* workspace:
|
|
729
|
+
* env:
|
|
730
|
+
* required_commands: [ffmpeg, pandoc]
|
|
731
|
+
* required_python_modules: [PIL, openai]
|
|
732
|
+
* ```
|
|
733
|
+
*/
|
|
734
|
+
type WorkspaceEnvConfig = {
|
|
735
|
+
/** Shell commands that must be present in PATH (checked via `command -v`) */
|
|
736
|
+
readonly required_commands?: readonly string[];
|
|
737
|
+
/** Python modules that must be importable (checked via `python3 -c "import <module>"`) */
|
|
738
|
+
readonly required_python_modules?: readonly string[];
|
|
739
|
+
};
|
|
679
740
|
type WorkspaceConfig = {
|
|
680
741
|
/** Template directory or .code-workspace file. Directories are copied to temp workspace.
|
|
681
742
|
* .code-workspace files are used by VS Code providers; CLI providers use the parent directory. */
|
|
@@ -696,6 +757,8 @@ type WorkspaceConfig = {
|
|
|
696
757
|
* Used as default cwd for hook commands so that file-referenced templates resolve
|
|
697
758
|
* relative paths from their own directory, not the eval file's directory. */
|
|
698
759
|
readonly workspaceFileDir?: string;
|
|
760
|
+
/** Preflight environment requirements. Checked before before_all hooks run. */
|
|
761
|
+
readonly env?: WorkspaceEnvConfig;
|
|
699
762
|
};
|
|
700
763
|
type CodeGraderConfig = {
|
|
701
764
|
readonly name: string;
|
|
@@ -3006,7 +3069,10 @@ declare class LlmGrader implements Grader {
|
|
|
3006
3069
|
*/
|
|
3007
3070
|
private evaluateWithScoreRanges;
|
|
3008
3071
|
/**
|
|
3009
|
-
* Built-in mode:
|
|
3072
|
+
* Built-in mode: drives the grader through provider.invoke() with the
|
|
3073
|
+
* sandboxed filesystem tools and a step budget. The pi-ai-backed agentv
|
|
3074
|
+
* provider runs the agent loop (tool call → tool execute → next model
|
|
3075
|
+
* turn) until the model stops requesting tools or maxSteps is hit.
|
|
3010
3076
|
*/
|
|
3011
3077
|
private evaluateBuiltIn;
|
|
3012
3078
|
/**
|
|
@@ -3699,6 +3765,10 @@ interface GenerateRubricsOptions {
|
|
|
3699
3765
|
}
|
|
3700
3766
|
/**
|
|
3701
3767
|
* Generate rubrics from expected outcome using an LLM.
|
|
3768
|
+
*
|
|
3769
|
+
* Calls the provider through `Provider.invoke()` — the LLM call itself is
|
|
3770
|
+
* a single non-streaming, non-tool-using completion. JSON output is parsed
|
|
3771
|
+
* with up to 3 retries to absorb model formatting variance.
|
|
3702
3772
|
*/
|
|
3703
3773
|
declare function generateRubrics(options: GenerateRubricsOptions): Promise<readonly RubricItem[]>;
|
|
3704
3774
|
|
|
@@ -4798,4 +4868,4 @@ type AgentKernel = {
|
|
|
4798
4868
|
};
|
|
4799
4869
|
declare function createAgentKernel(): AgentKernel;
|
|
4800
4870
|
|
|
4801
|
-
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type BenchmarkEntry, type BenchmarkRegistry, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildGraderResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeGrader, type CodeGraderConfig, type CodeGraderOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeGrader, type CompositeGraderConfig, type CompositeGraderOptions, type ConfidenceIntervalAggregation, type ContainsAllGraderConfig, type ContainsAnyGraderConfig, type ContainsGraderConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type ConversationAggregation, type ConversationMode, type ConversationTurn, type ConversationTurnInput, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostGrader, type CostGraderConfig, type CostGraderOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_GRADER_TEMPLATE, DEFAULT_THRESHOLD, type DependencyFailurePolicy, type DependencyResult, type DepsScanResult, DeterministicAssertionGrader, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithGraderConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsGraderConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type 
EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTargetRef, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsGrader, type ExecutionMetricsGraderConfig, type ExecutionMetricsGraderOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyGrader, type FieldAccuracyGraderConfig, type FieldAccuracyGraderOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type Grader, type GraderConfig, type GraderDispatchContext, type GraderFactory, type GraderFactoryFn, type GraderKind, GraderRegistry, type GraderResult, type IcontainsAllGraderConfig, type IcontainsAnyGraderConfig, type IcontainsGraderConfig, type InlineAssertEvaluatorConfig, type IsJsonGraderConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyGrader, type LatencyGraderConfig, type LatencyGraderOptions, LlmGrader, type LlmGraderConfig, type LlmGraderOptions, type LlmGraderPromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type PreparedResultsRepoBranch, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexGraderConfig, type 
RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, RunBudgetTracker, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerGrader, type SkillTriggerGraderConfig, type StartsWithGraderConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TargetHooksConfig, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageGrader, type TokenUsageGraderConfig, type TokenUsageGraderOptions, type ToolCall, type ToolTestMessage, type ToolTrajectoryExpectedItem, ToolTrajectoryGrader, type ToolTrajectoryGraderConfig, type ToolTrajectoryGraderOptions, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptReplayEntry, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type TsEvalResult, type TurnFailurePolicy, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addBenchmark, assembleLlmGraderPrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, 
createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveBenchmarkId, deriveCategory, detectFormat, directPushResults, directorySizeBytes, discoverAssertions, discoverBenchmarks, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetRefsFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, formatToolCalls, freeformEvaluationSchema, generateRubrics, getAgentvConfigDir, getAgentvHome, getBenchmark, getBenchmarksRegistryPath, getOutputFilenames, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, groupTranscriptJsonLines, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isGraderKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadBenchmarkRegistry, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, loadTsEvalFile, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseEnvOutput, parseJsonFromText, parseJsonSafe, parseYamlValue, prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeBenchmark, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, 
resolveResultsRepoRunsDir, resolveResultsRepoUrl, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runBeforeSessionHook, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveBenchmarkRegistry, scanRepoDeps, scoreRangeEvaluationSchema, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLines, tokensPerTool, touchBenchmark, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
|
|
4871
|
+
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type BenchmarkEntry, type BenchmarkRegistry, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildGraderResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeGrader, type CodeGraderConfig, type CodeGraderOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeGrader, type CompositeGraderConfig, type CompositeGraderOptions, type ConfidenceIntervalAggregation, type ContainsAllGraderConfig, type ContainsAnyGraderConfig, type ContainsGraderConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type ConversationAggregation, type ConversationMode, type ConversationTurn, type ConversationTurnInput, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostGrader, type CostGraderConfig, type CostGraderOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_GRADER_TEMPLATE, DEFAULT_THRESHOLD, type DependencyFailurePolicy, type DependencyResult, type DepsScanResult, DeterministicAssertionGrader, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithGraderConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsGraderConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type 
EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTargetRef, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsGrader, type ExecutionMetricsGraderConfig, type ExecutionMetricsGraderOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyGrader, type FieldAccuracyGraderConfig, type FieldAccuracyGraderOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type Grader, type GraderConfig, type GraderDispatchContext, type GraderFactory, type GraderFactoryFn, type GraderKind, GraderRegistry, type GraderResult, type IcontainsAllGraderConfig, type IcontainsAnyGraderConfig, type IcontainsGraderConfig, type InlineAssertEvaluatorConfig, type IsJsonGraderConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyGrader, type LatencyGraderConfig, type LatencyGraderOptions, LlmGrader, type LlmGraderConfig, type LlmGraderOptions, type LlmGraderPromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type PreparedResultsRepoBranch, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexGraderConfig, type 
RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, RunBudgetTracker, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerGrader, type SkillTriggerGraderConfig, type StartsWithGraderConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TargetHooksConfig, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageGrader, type TokenUsageGraderConfig, type TokenUsageGraderOptions, type ToolCall, type ToolTestMessage, type ToolTrajectoryExpectedItem, ToolTrajectoryGrader, type ToolTrajectoryGraderConfig, type ToolTrajectoryGraderOptions, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptReplayEntry, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type TsEvalResult, type TurnFailurePolicy, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceEnvConfig, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addBenchmark, assembleLlmGraderPrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, 
consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveBenchmarkId, deriveCategory, detectFormat, directPushResults, directorySizeBytes, discoverAssertions, discoverBenchmarks, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetRefsFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, formatToolCalls, freeformEvaluationSchema, generateRubrics, getAgentvConfigDir, getAgentvHome, getBenchmark, getBenchmarksRegistryPath, getOutputFilenames, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, groupTranscriptJsonLines, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isGraderKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadBenchmarkRegistry, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, loadTsEvalFile, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseEnvOutput, parseJsonFromText, parseJsonSafe, parseYamlValue, prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeBenchmark, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, 
resolveResultsRepoRunsDir, resolveResultsRepoUrl, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runBeforeSessionHook, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveBenchmarkRegistry, scanRepoDeps, scoreRangeEvaluationSchema, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLines, tokensPerTool, touchBenchmark, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
|
package/dist/index.js
CHANGED
|
@@ -129,7 +129,7 @@ import {
|
|
|
129
129
|
toCamelCaseDeep,
|
|
130
130
|
toSnakeCaseDeep,
|
|
131
131
|
tokensPerTool
|
|
132
|
-
} from "./chunk-IXTJEXWN.js";
|
|
132
|
+
} from "./chunk-F234XBWV.js";
|
|
133
133
|
import {
|
|
134
134
|
COMMON_TARGET_SETTINGS,
|
|
135
135
|
TEST_MESSAGE_ROLES,
|
|
@@ -154,9 +154,9 @@ import {
|
|
|
154
154
|
resolveDelegatedTargetDefinition,
|
|
155
155
|
resolveFileReference,
|
|
156
156
|
resolveTargetDefinition
|
|
157
|
-
} from "./chunk-6HLBKYE2.js";
|
|
157
|
+
} from "./chunk-CALQDF2Y.js";
|
|
158
158
|
import "./chunk-3WGHC7LC.js";
|
|
159
|
-
import "./chunk-PRNXHNLF.js";
|
|
159
|
+
import "./chunk-5XV3FAAD.js";
|
|
160
160
|
import {
|
|
161
161
|
OtlpJsonFileExporter
|
|
162
162
|
} from "./chunk-KPSI5CSL.js";
|
|
@@ -505,7 +505,6 @@ async function loadTsConfig(projectRoot) {
|
|
|
505
505
|
}
|
|
506
506
|
|
|
507
507
|
// src/evaluation/generators/rubric-generator.ts
|
|
508
|
-
import { generateText } from "ai";
|
|
509
508
|
import { z as z2 } from "zod";
|
|
510
509
|
var rubricItemSchema = z2.object({
|
|
511
510
|
id: z2.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
|
|
@@ -519,10 +518,6 @@ var rubricGenerationSchema = z2.object({
|
|
|
519
518
|
async function generateRubrics(options) {
|
|
520
519
|
const { criteria, question, referenceAnswer, provider } = options;
|
|
521
520
|
const prompt = buildPrompt(criteria, question, referenceAnswer);
|
|
522
|
-
const model = provider.asLanguageModel?.();
|
|
523
|
-
if (!model) {
|
|
524
|
-
throw new Error("Provider does not support language model interface");
|
|
525
|
-
}
|
|
526
521
|
const system = `You are an expert at creating evaluation rubrics.
|
|
527
522
|
You must return a valid JSON object matching this schema:
|
|
528
523
|
{
|
|
@@ -539,11 +534,11 @@ You must return a valid JSON object matching this schema:
|
|
|
539
534
|
let lastError;
|
|
540
535
|
for (let attempt = 1; attempt <= 3; attempt++) {
|
|
541
536
|
try {
|
|
542
|
-
const { text } = await generateText({
|
|
543
|
-
model,
|
|
544
|
-
system,
|
|
545
|
-
prompt
|
|
537
|
+
const response = await provider.invoke({
|
|
538
|
+
question: prompt,
|
|
539
|
+
systemPrompt: system
|
|
546
540
|
});
|
|
541
|
+
const text = extractLastAssistantContent(response.output);
|
|
547
542
|
const cleaned = text.replace(/```json\n?|```/g, "").trim();
|
|
548
543
|
result = rubricGenerationSchema.parse(JSON.parse(cleaned));
|
|
549
544
|
break;
|