@agentv/core 3.4.0 → 3.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agentv-provider-NFFLXG5M.js +7 -0
- package/dist/{chunk-JO4HIAEF.js → chunk-EFR4JHPL.js} +1 -5
- package/dist/chunk-EFR4JHPL.js.map +1 -0
- package/dist/{chunk-Q52FQPKQ.js → chunk-W5YDZWT4.js} +2 -2
- package/dist/chunk-W5YDZWT4.js.map +1 -0
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +382 -436
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +55 -46
- package/dist/index.d.ts +55 -46
- package/dist/index.js +384 -435
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/agentv-provider-HDSAUUEF.js +0 -7
- package/dist/chunk-JO4HIAEF.js.map +0 -1
- package/dist/chunk-Q52FQPKQ.js.map +0 -1
- /package/dist/{agentv-provider-HDSAUUEF.js.map → agentv-provider-NFFLXG5M.js.map} +0 -0
package/dist/index.d.cts
CHANGED
|
@@ -393,6 +393,12 @@ interface ExecutionMetrics {
|
|
|
393
393
|
*/
|
|
394
394
|
declare function mergeExecutionMetrics(computed: TraceComputeResult, metrics?: ExecutionMetrics): TraceComputeResult;
|
|
395
395
|
|
|
396
|
+
/** A single assertion verdict with optional evidence. */
|
|
397
|
+
interface AssertionEntry {
|
|
398
|
+
readonly text: string;
|
|
399
|
+
readonly passed: boolean;
|
|
400
|
+
readonly evidence?: string;
|
|
401
|
+
}
|
|
396
402
|
/**
|
|
397
403
|
* JSON primitive values appearing in AgentV payloads.
|
|
398
404
|
*/
|
|
@@ -1127,11 +1133,9 @@ interface EvaluationResult {
|
|
|
1127
1133
|
readonly dataset?: string;
|
|
1128
1134
|
readonly conversationId?: string;
|
|
1129
1135
|
readonly score: number;
|
|
1130
|
-
readonly hits: readonly string[];
|
|
1131
|
-
readonly misses: readonly string[];
|
|
1136
|
+
readonly assertions: readonly AssertionEntry[];
|
|
1132
1137
|
readonly answer: string;
|
|
1133
1138
|
readonly target: string;
|
|
1134
|
-
readonly reasoning?: string;
|
|
1135
1139
|
/** Token usage metrics from provider (optional) */
|
|
1136
1140
|
readonly tokenUsage?: TokenUsage;
|
|
1137
1141
|
/** Total cost in USD (optional, from provider) */
|
|
@@ -1196,9 +1200,7 @@ interface EvaluatorResult {
|
|
|
1196
1200
|
readonly score: number;
|
|
1197
1201
|
readonly weight?: number;
|
|
1198
1202
|
readonly verdict?: EvaluationVerdict;
|
|
1199
|
-
readonly hits: readonly string[];
|
|
1200
|
-
readonly misses: readonly string[];
|
|
1201
|
-
readonly reasoning?: string;
|
|
1203
|
+
readonly assertions: readonly AssertionEntry[];
|
|
1202
1204
|
readonly rawRequest?: JsonObject;
|
|
1203
1205
|
readonly evaluatorProviderRequest?: JsonObject;
|
|
1204
1206
|
readonly scores?: readonly EvaluatorResult[];
|
|
@@ -1213,10 +1215,6 @@ interface EvaluatorResult {
|
|
|
1213
1215
|
/** ISO 8601 UTC timestamp when this grader finished executing. */
|
|
1214
1216
|
readonly endedAt?: string;
|
|
1215
1217
|
}
|
|
1216
|
-
/**
|
|
1217
|
-
* Convenience accessor matching the Python hit_count property.
|
|
1218
|
-
*/
|
|
1219
|
-
declare function getHitCount(result: Pick<EvaluationResult, 'hits'>): number;
|
|
1220
1218
|
|
|
1221
1219
|
declare const MetadataSchema: z.ZodObject<{
|
|
1222
1220
|
name: z.ZodString;
|
|
@@ -2059,10 +2057,8 @@ interface EvaluationContext {
|
|
|
2059
2057
|
interface EvaluationScore {
|
|
2060
2058
|
readonly score: number;
|
|
2061
2059
|
readonly verdict: EvaluationVerdict;
|
|
2062
|
-
readonly hits: readonly string[];
|
|
2063
|
-
readonly misses: readonly string[];
|
|
2060
|
+
readonly assertions: readonly AssertionEntry[];
|
|
2064
2061
|
readonly expectedAspectCount: number;
|
|
2065
|
-
readonly reasoning?: string;
|
|
2066
2062
|
readonly evaluatorRawRequest?: JsonObject;
|
|
2067
2063
|
readonly scores?: readonly ChildEvaluatorResult[];
|
|
2068
2064
|
/** Optional structured details from code graders (e.g., TP/TN/FP/FN counts, alignments). */
|
|
@@ -2076,9 +2072,7 @@ interface ChildEvaluatorResult {
|
|
|
2076
2072
|
readonly score: number;
|
|
2077
2073
|
readonly weight?: number;
|
|
2078
2074
|
readonly verdict: EvaluationVerdict;
|
|
2079
|
-
readonly hits: readonly string[];
|
|
2080
|
-
readonly misses: readonly string[];
|
|
2081
|
-
readonly reasoning?: string;
|
|
2075
|
+
readonly assertions: readonly AssertionEntry[];
|
|
2082
2076
|
readonly evaluatorRawRequest?: JsonObject;
|
|
2083
2077
|
readonly scores?: readonly ChildEvaluatorResult[];
|
|
2084
2078
|
/** Optional structured details from code graders (e.g., TP/TN/FP/FN counts, alignments). */
|
|
@@ -2103,7 +2097,7 @@ declare function parseJsonSafe(payload: string): Record<string, unknown> | undef
|
|
|
2103
2097
|
declare function deepEqual(a: unknown, b: unknown): boolean;
|
|
2104
2098
|
/**
|
|
2105
2099
|
* Negate an evaluation score: inverts score (1 - score), swaps pass/fail verdict,
|
|
2106
|
-
*
|
|
2100
|
+
* and flips passed on each assertion.
|
|
2107
2101
|
*/
|
|
2108
2102
|
declare function negateScore(score: EvaluationScore): EvaluationScore;
|
|
2109
2103
|
|
|
@@ -2171,7 +2165,7 @@ interface ExecutionMetricsEvaluatorOptions {
|
|
|
2171
2165
|
* Supports multiple threshold types: tool calls, LLM calls, tokens, cost, duration,
|
|
2172
2166
|
* and exploration ratio. Only specified thresholds are checked.
|
|
2173
2167
|
*
|
|
2174
|
-
* Score is proportional:
|
|
2168
|
+
* Score is proportional: passed / total assertions
|
|
2175
2169
|
*/
|
|
2176
2170
|
declare class ExecutionMetricsEvaluator implements Evaluator {
|
|
2177
2171
|
readonly kind = "execution-metrics";
|
|
@@ -2255,19 +2249,33 @@ interface LlmGraderEvaluatorOptions {
|
|
|
2255
2249
|
}
|
|
2256
2250
|
declare const freeformEvaluationSchema: z.ZodObject<{
|
|
2257
2251
|
score: z.ZodNumber;
|
|
2258
|
-
|
|
2259
|
-
|
|
2260
|
-
|
|
2252
|
+
assertions: z.ZodOptional<z.ZodArray<z.ZodObject<{
|
|
2253
|
+
text: z.ZodString;
|
|
2254
|
+
passed: z.ZodBoolean;
|
|
2255
|
+
evidence: z.ZodOptional<z.ZodString>;
|
|
2256
|
+
}, "strip", z.ZodTypeAny, {
|
|
2257
|
+
text: string;
|
|
2258
|
+
passed: boolean;
|
|
2259
|
+
evidence?: string | undefined;
|
|
2260
|
+
}, {
|
|
2261
|
+
text: string;
|
|
2262
|
+
passed: boolean;
|
|
2263
|
+
evidence?: string | undefined;
|
|
2264
|
+
}>, "many">>;
|
|
2261
2265
|
}, "strip", z.ZodTypeAny, {
|
|
2262
2266
|
score: number;
|
|
2263
|
-
|
|
2264
|
-
|
|
2265
|
-
|
|
2267
|
+
assertions?: {
|
|
2268
|
+
text: string;
|
|
2269
|
+
passed: boolean;
|
|
2270
|
+
evidence?: string | undefined;
|
|
2271
|
+
}[] | undefined;
|
|
2266
2272
|
}, {
|
|
2267
2273
|
score: number;
|
|
2268
|
-
|
|
2269
|
-
|
|
2270
|
-
|
|
2274
|
+
assertions?: {
|
|
2275
|
+
text: string;
|
|
2276
|
+
passed: boolean;
|
|
2277
|
+
evidence?: string | undefined;
|
|
2278
|
+
}[] | undefined;
|
|
2271
2279
|
}>;
|
|
2272
2280
|
declare const rubricEvaluationSchema: z.ZodObject<{
|
|
2273
2281
|
checks: z.ZodArray<z.ZodObject<{
|
|
@@ -2275,26 +2283,26 @@ declare const rubricEvaluationSchema: z.ZodObject<{
|
|
|
2275
2283
|
satisfied: z.ZodBoolean;
|
|
2276
2284
|
reasoning: z.ZodString;
|
|
2277
2285
|
}, "strip", z.ZodTypeAny, {
|
|
2278
|
-
reasoning: string;
|
|
2279
2286
|
id: string;
|
|
2287
|
+
reasoning: string;
|
|
2280
2288
|
satisfied: boolean;
|
|
2281
2289
|
}, {
|
|
2282
|
-
reasoning: string;
|
|
2283
2290
|
id: string;
|
|
2291
|
+
reasoning: string;
|
|
2284
2292
|
satisfied: boolean;
|
|
2285
2293
|
}>, "many">;
|
|
2286
2294
|
overall_reasoning: z.ZodString;
|
|
2287
2295
|
}, "strip", z.ZodTypeAny, {
|
|
2288
2296
|
checks: {
|
|
2289
|
-
reasoning: string;
|
|
2290
2297
|
id: string;
|
|
2298
|
+
reasoning: string;
|
|
2291
2299
|
satisfied: boolean;
|
|
2292
2300
|
}[];
|
|
2293
2301
|
overall_reasoning: string;
|
|
2294
2302
|
}, {
|
|
2295
2303
|
checks: {
|
|
2296
|
-
reasoning: string;
|
|
2297
2304
|
id: string;
|
|
2305
|
+
reasoning: string;
|
|
2298
2306
|
satisfied: boolean;
|
|
2299
2307
|
}[];
|
|
2300
2308
|
overall_reasoning: string;
|
|
@@ -2371,8 +2379,7 @@ declare function substituteVariables(template: string, variables: Record<string,
|
|
|
2371
2379
|
declare function calculateRubricScore(result: z.infer<typeof rubricEvaluationSchema>, rubrics: readonly RubricItem[]): {
|
|
2372
2380
|
score: number;
|
|
2373
2381
|
verdict: EvaluationVerdict;
|
|
2374
|
-
|
|
2375
|
-
misses: string[];
|
|
2382
|
+
assertions: AssertionEntry[];
|
|
2376
2383
|
};
|
|
2377
2384
|
/**
|
|
2378
2385
|
* Build the output schema for score-range rubric evaluation.
|
|
@@ -2474,12 +2481,14 @@ declare class ToolTrajectoryEvaluator implements Evaluator {
|
|
|
2474
2481
|
* Deterministic assertion evaluators.
|
|
2475
2482
|
*
|
|
2476
2483
|
* Pure functions that check agent output against simple conditions
|
|
2477
|
-
* and return a binary score (0 or 1) with descriptive
|
|
2484
|
+
* and return a binary score (0 or 1) with descriptive assertions.
|
|
2478
2485
|
*/
|
|
2479
2486
|
type AssertionResult = {
|
|
2480
2487
|
score: number;
|
|
2481
|
-
|
|
2482
|
-
|
|
2488
|
+
assertions: {
|
|
2489
|
+
text: string;
|
|
2490
|
+
passed: boolean;
|
|
2491
|
+
}[];
|
|
2483
2492
|
};
|
|
2484
2493
|
/** Checks if `output` contains the given `value` substring. */
|
|
2485
2494
|
declare function runContainsAssertion(output: string, value: string): AssertionResult;
|
|
@@ -3067,10 +3076,6 @@ declare const AgentVConfigSchema: z.ZodObject<{
|
|
|
3067
3076
|
maxCostUsd?: number | undefined;
|
|
3068
3077
|
}>>;
|
|
3069
3078
|
}, "strip", z.ZodTypeAny, {
|
|
3070
|
-
output?: {
|
|
3071
|
-
dir?: string | undefined;
|
|
3072
|
-
format?: "yaml" | "jsonl" | "json" | "xml" | undefined;
|
|
3073
|
-
} | undefined;
|
|
3074
3079
|
execution?: {
|
|
3075
3080
|
verbose?: boolean | undefined;
|
|
3076
3081
|
workers?: number | undefined;
|
|
@@ -3084,15 +3089,15 @@ declare const AgentVConfigSchema: z.ZodObject<{
|
|
|
3084
3089
|
enabled?: boolean | undefined;
|
|
3085
3090
|
path?: string | undefined;
|
|
3086
3091
|
} | undefined;
|
|
3092
|
+
output?: {
|
|
3093
|
+
dir?: string | undefined;
|
|
3094
|
+
format?: "yaml" | "jsonl" | "json" | "xml" | undefined;
|
|
3095
|
+
} | undefined;
|
|
3087
3096
|
limits?: {
|
|
3088
3097
|
maxDurationMs?: number | undefined;
|
|
3089
3098
|
maxCostUsd?: number | undefined;
|
|
3090
3099
|
} | undefined;
|
|
3091
3100
|
}, {
|
|
3092
|
-
output?: {
|
|
3093
|
-
dir?: string | undefined;
|
|
3094
|
-
format?: "yaml" | "jsonl" | "json" | "xml" | undefined;
|
|
3095
|
-
} | undefined;
|
|
3096
3101
|
execution?: {
|
|
3097
3102
|
verbose?: boolean | undefined;
|
|
3098
3103
|
workers?: number | undefined;
|
|
@@ -3106,6 +3111,10 @@ declare const AgentVConfigSchema: z.ZodObject<{
|
|
|
3106
3111
|
enabled?: boolean | undefined;
|
|
3107
3112
|
path?: string | undefined;
|
|
3108
3113
|
} | undefined;
|
|
3114
|
+
output?: {
|
|
3115
|
+
dir?: string | undefined;
|
|
3116
|
+
format?: "yaml" | "jsonl" | "json" | "xml" | undefined;
|
|
3117
|
+
} | undefined;
|
|
3109
3118
|
limits?: {
|
|
3110
3119
|
maxDurationMs?: number | undefined;
|
|
3111
3120
|
maxCostUsd?: number | undefined;
|
|
@@ -3429,7 +3438,7 @@ declare function getWorkspacePoolRoot(): string;
|
|
|
3429
3438
|
/**
|
|
3430
3439
|
* Trims an EvaluationResult for baseline storage.
|
|
3431
3440
|
* Strips large debug/audit fields (denylist approach) while preserving
|
|
3432
|
-
* all fields needed for regression comparison (scores,
|
|
3441
|
+
* all fields needed for regression comparison (scores, assertions, etc.).
|
|
3433
3442
|
*
|
|
3434
3443
|
* Returns a new object — the input is not mutated.
|
|
3435
3444
|
*/
|
|
@@ -3605,4 +3614,4 @@ type AgentKernel = {
|
|
|
3605
3614
|
};
|
|
3606
3615
|
declare function createAgentKernel(): AgentKernel;
|
|
3607
3616
|
|
|
3608
|
-
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type 
FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, 
TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome,
|
|
3617
|
+
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type 
FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, 
type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getSubagentsRoot, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, 
loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseAgentSkillsEvals, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
|
package/dist/index.d.ts
CHANGED
|
@@ -393,6 +393,12 @@ interface ExecutionMetrics {
|
|
|
393
393
|
*/
|
|
394
394
|
declare function mergeExecutionMetrics(computed: TraceComputeResult, metrics?: ExecutionMetrics): TraceComputeResult;
|
|
395
395
|
|
|
396
|
+
/** A single assertion verdict with optional evidence. */
|
|
397
|
+
interface AssertionEntry {
|
|
398
|
+
readonly text: string;
|
|
399
|
+
readonly passed: boolean;
|
|
400
|
+
readonly evidence?: string;
|
|
401
|
+
}
|
|
396
402
|
/**
|
|
397
403
|
* JSON primitive values appearing in AgentV payloads.
|
|
398
404
|
*/
|
|
@@ -1127,11 +1133,9 @@ interface EvaluationResult {
|
|
|
1127
1133
|
readonly dataset?: string;
|
|
1128
1134
|
readonly conversationId?: string;
|
|
1129
1135
|
readonly score: number;
|
|
1130
|
-
readonly hits: readonly string[];
|
|
1131
|
-
readonly misses: readonly string[];
|
|
1136
|
+
readonly assertions: readonly AssertionEntry[];
|
|
1132
1137
|
readonly answer: string;
|
|
1133
1138
|
readonly target: string;
|
|
1134
|
-
readonly reasoning?: string;
|
|
1135
1139
|
/** Token usage metrics from provider (optional) */
|
|
1136
1140
|
readonly tokenUsage?: TokenUsage;
|
|
1137
1141
|
/** Total cost in USD (optional, from provider) */
|
|
@@ -1196,9 +1200,7 @@ interface EvaluatorResult {
|
|
|
1196
1200
|
readonly score: number;
|
|
1197
1201
|
readonly weight?: number;
|
|
1198
1202
|
readonly verdict?: EvaluationVerdict;
|
|
1199
|
-
readonly hits: readonly string[];
|
|
1200
|
-
readonly misses: readonly string[];
|
|
1201
|
-
readonly reasoning?: string;
|
|
1203
|
+
readonly assertions: readonly AssertionEntry[];
|
|
1202
1204
|
readonly rawRequest?: JsonObject;
|
|
1203
1205
|
readonly evaluatorProviderRequest?: JsonObject;
|
|
1204
1206
|
readonly scores?: readonly EvaluatorResult[];
|
|
@@ -1213,10 +1215,6 @@ interface EvaluatorResult {
|
|
|
1213
1215
|
/** ISO 8601 UTC timestamp when this grader finished executing. */
|
|
1214
1216
|
readonly endedAt?: string;
|
|
1215
1217
|
}
|
|
1216
|
-
/**
|
|
1217
|
-
* Convenience accessor matching the Python hit_count property.
|
|
1218
|
-
*/
|
|
1219
|
-
declare function getHitCount(result: Pick<EvaluationResult, 'hits'>): number;
|
|
1220
1218
|
|
|
1221
1219
|
declare const MetadataSchema: z.ZodObject<{
|
|
1222
1220
|
name: z.ZodString;
|
|
@@ -2059,10 +2057,8 @@ interface EvaluationContext {
|
|
|
2059
2057
|
interface EvaluationScore {
|
|
2060
2058
|
readonly score: number;
|
|
2061
2059
|
readonly verdict: EvaluationVerdict;
|
|
2062
|
-
readonly hits: readonly string[];
|
|
2063
|
-
readonly misses: readonly string[];
|
|
2060
|
+
readonly assertions: readonly AssertionEntry[];
|
|
2064
2061
|
readonly expectedAspectCount: number;
|
|
2065
|
-
readonly reasoning?: string;
|
|
2066
2062
|
readonly evaluatorRawRequest?: JsonObject;
|
|
2067
2063
|
readonly scores?: readonly ChildEvaluatorResult[];
|
|
2068
2064
|
/** Optional structured details from code graders (e.g., TP/TN/FP/FN counts, alignments). */
|
|
@@ -2076,9 +2072,7 @@ interface ChildEvaluatorResult {
|
|
|
2076
2072
|
readonly score: number;
|
|
2077
2073
|
readonly weight?: number;
|
|
2078
2074
|
readonly verdict: EvaluationVerdict;
|
|
2079
|
-
readonly hits: readonly string[];
|
|
2080
|
-
readonly misses: readonly string[];
|
|
2081
|
-
readonly reasoning?: string;
|
|
2075
|
+
readonly assertions: readonly AssertionEntry[];
|
|
2082
2076
|
readonly evaluatorRawRequest?: JsonObject;
|
|
2083
2077
|
readonly scores?: readonly ChildEvaluatorResult[];
|
|
2084
2078
|
/** Optional structured details from code graders (e.g., TP/TN/FP/FN counts, alignments). */
|
|
@@ -2103,7 +2097,7 @@ declare function parseJsonSafe(payload: string): Record<string, unknown> | undef
|
|
|
2103
2097
|
declare function deepEqual(a: unknown, b: unknown): boolean;
|
|
2104
2098
|
/**
|
|
2105
2099
|
* Negate an evaluation score: inverts score (1 - score), swaps pass/fail verdict,
|
|
2106
|
-
*
|
|
2100
|
+
* and flips passed on each assertion.
|
|
2107
2101
|
*/
|
|
2108
2102
|
declare function negateScore(score: EvaluationScore): EvaluationScore;
|
|
2109
2103
|
|
|
@@ -2171,7 +2165,7 @@ interface ExecutionMetricsEvaluatorOptions {
|
|
|
2171
2165
|
* Supports multiple threshold types: tool calls, LLM calls, tokens, cost, duration,
|
|
2172
2166
|
* and exploration ratio. Only specified thresholds are checked.
|
|
2173
2167
|
*
|
|
2174
|
-
* Score is proportional:
|
|
2168
|
+
* Score is proportional: passed / total assertions
|
|
2175
2169
|
*/
|
|
2176
2170
|
declare class ExecutionMetricsEvaluator implements Evaluator {
|
|
2177
2171
|
readonly kind = "execution-metrics";
|
|
@@ -2255,19 +2249,33 @@ interface LlmGraderEvaluatorOptions {
|
|
|
2255
2249
|
}
|
|
2256
2250
|
declare const freeformEvaluationSchema: z.ZodObject<{
|
|
2257
2251
|
score: z.ZodNumber;
|
|
2258
|
-
|
|
2259
|
-
|
|
2260
|
-
|
|
2252
|
+
assertions: z.ZodOptional<z.ZodArray<z.ZodObject<{
|
|
2253
|
+
text: z.ZodString;
|
|
2254
|
+
passed: z.ZodBoolean;
|
|
2255
|
+
evidence: z.ZodOptional<z.ZodString>;
|
|
2256
|
+
}, "strip", z.ZodTypeAny, {
|
|
2257
|
+
text: string;
|
|
2258
|
+
passed: boolean;
|
|
2259
|
+
evidence?: string | undefined;
|
|
2260
|
+
}, {
|
|
2261
|
+
text: string;
|
|
2262
|
+
passed: boolean;
|
|
2263
|
+
evidence?: string | undefined;
|
|
2264
|
+
}>, "many">>;
|
|
2261
2265
|
}, "strip", z.ZodTypeAny, {
|
|
2262
2266
|
score: number;
|
|
2263
|
-
|
|
2264
|
-
|
|
2265
|
-
|
|
2267
|
+
assertions?: {
|
|
2268
|
+
text: string;
|
|
2269
|
+
passed: boolean;
|
|
2270
|
+
evidence?: string | undefined;
|
|
2271
|
+
}[] | undefined;
|
|
2266
2272
|
}, {
|
|
2267
2273
|
score: number;
|
|
2268
|
-
|
|
2269
|
-
|
|
2270
|
-
|
|
2274
|
+
assertions?: {
|
|
2275
|
+
text: string;
|
|
2276
|
+
passed: boolean;
|
|
2277
|
+
evidence?: string | undefined;
|
|
2278
|
+
}[] | undefined;
|
|
2271
2279
|
}>;
|
|
2272
2280
|
declare const rubricEvaluationSchema: z.ZodObject<{
|
|
2273
2281
|
checks: z.ZodArray<z.ZodObject<{
|
|
@@ -2275,26 +2283,26 @@ declare const rubricEvaluationSchema: z.ZodObject<{
|
|
|
2275
2283
|
satisfied: z.ZodBoolean;
|
|
2276
2284
|
reasoning: z.ZodString;
|
|
2277
2285
|
}, "strip", z.ZodTypeAny, {
|
|
2278
|
-
reasoning: string;
|
|
2279
2286
|
id: string;
|
|
2287
|
+
reasoning: string;
|
|
2280
2288
|
satisfied: boolean;
|
|
2281
2289
|
}, {
|
|
2282
|
-
reasoning: string;
|
|
2283
2290
|
id: string;
|
|
2291
|
+
reasoning: string;
|
|
2284
2292
|
satisfied: boolean;
|
|
2285
2293
|
}>, "many">;
|
|
2286
2294
|
overall_reasoning: z.ZodString;
|
|
2287
2295
|
}, "strip", z.ZodTypeAny, {
|
|
2288
2296
|
checks: {
|
|
2289
|
-
reasoning: string;
|
|
2290
2297
|
id: string;
|
|
2298
|
+
reasoning: string;
|
|
2291
2299
|
satisfied: boolean;
|
|
2292
2300
|
}[];
|
|
2293
2301
|
overall_reasoning: string;
|
|
2294
2302
|
}, {
|
|
2295
2303
|
checks: {
|
|
2296
|
-
reasoning: string;
|
|
2297
2304
|
id: string;
|
|
2305
|
+
reasoning: string;
|
|
2298
2306
|
satisfied: boolean;
|
|
2299
2307
|
}[];
|
|
2300
2308
|
overall_reasoning: string;
|
|
@@ -2371,8 +2379,7 @@ declare function substituteVariables(template: string, variables: Record<string,
|
|
|
2371
2379
|
declare function calculateRubricScore(result: z.infer<typeof rubricEvaluationSchema>, rubrics: readonly RubricItem[]): {
|
|
2372
2380
|
score: number;
|
|
2373
2381
|
verdict: EvaluationVerdict;
|
|
2374
|
-
|
|
2375
|
-
misses: string[];
|
|
2382
|
+
assertions: AssertionEntry[];
|
|
2376
2383
|
};
|
|
2377
2384
|
/**
|
|
2378
2385
|
* Build the output schema for score-range rubric evaluation.
|
|
@@ -2474,12 +2481,14 @@ declare class ToolTrajectoryEvaluator implements Evaluator {
|
|
|
2474
2481
|
* Deterministic assertion evaluators.
|
|
2475
2482
|
*
|
|
2476
2483
|
* Pure functions that check agent output against simple conditions
|
|
2477
|
-
* and return a binary score (0 or 1) with descriptive
|
|
2484
|
+
* and return a binary score (0 or 1) with descriptive assertions.
|
|
2478
2485
|
*/
|
|
2479
2486
|
type AssertionResult = {
|
|
2480
2487
|
score: number;
|
|
2481
|
-
|
|
2482
|
-
|
|
2488
|
+
assertions: {
|
|
2489
|
+
text: string;
|
|
2490
|
+
passed: boolean;
|
|
2491
|
+
}[];
|
|
2483
2492
|
};
|
|
2484
2493
|
/** Checks if `output` contains the given `value` substring. */
|
|
2485
2494
|
declare function runContainsAssertion(output: string, value: string): AssertionResult;
|
|
@@ -3067,10 +3076,6 @@ declare const AgentVConfigSchema: z.ZodObject<{
|
|
|
3067
3076
|
maxCostUsd?: number | undefined;
|
|
3068
3077
|
}>>;
|
|
3069
3078
|
}, "strip", z.ZodTypeAny, {
|
|
3070
|
-
output?: {
|
|
3071
|
-
dir?: string | undefined;
|
|
3072
|
-
format?: "yaml" | "jsonl" | "json" | "xml" | undefined;
|
|
3073
|
-
} | undefined;
|
|
3074
3079
|
execution?: {
|
|
3075
3080
|
verbose?: boolean | undefined;
|
|
3076
3081
|
workers?: number | undefined;
|
|
@@ -3084,15 +3089,15 @@ declare const AgentVConfigSchema: z.ZodObject<{
|
|
|
3084
3089
|
enabled?: boolean | undefined;
|
|
3085
3090
|
path?: string | undefined;
|
|
3086
3091
|
} | undefined;
|
|
3092
|
+
output?: {
|
|
3093
|
+
dir?: string | undefined;
|
|
3094
|
+
format?: "yaml" | "jsonl" | "json" | "xml" | undefined;
|
|
3095
|
+
} | undefined;
|
|
3087
3096
|
limits?: {
|
|
3088
3097
|
maxDurationMs?: number | undefined;
|
|
3089
3098
|
maxCostUsd?: number | undefined;
|
|
3090
3099
|
} | undefined;
|
|
3091
3100
|
}, {
|
|
3092
|
-
output?: {
|
|
3093
|
-
dir?: string | undefined;
|
|
3094
|
-
format?: "yaml" | "jsonl" | "json" | "xml" | undefined;
|
|
3095
|
-
} | undefined;
|
|
3096
3101
|
execution?: {
|
|
3097
3102
|
verbose?: boolean | undefined;
|
|
3098
3103
|
workers?: number | undefined;
|
|
@@ -3106,6 +3111,10 @@ declare const AgentVConfigSchema: z.ZodObject<{
|
|
|
3106
3111
|
enabled?: boolean | undefined;
|
|
3107
3112
|
path?: string | undefined;
|
|
3108
3113
|
} | undefined;
|
|
3114
|
+
output?: {
|
|
3115
|
+
dir?: string | undefined;
|
|
3116
|
+
format?: "yaml" | "jsonl" | "json" | "xml" | undefined;
|
|
3117
|
+
} | undefined;
|
|
3109
3118
|
limits?: {
|
|
3110
3119
|
maxDurationMs?: number | undefined;
|
|
3111
3120
|
maxCostUsd?: number | undefined;
|
|
@@ -3429,7 +3438,7 @@ declare function getWorkspacePoolRoot(): string;
|
|
|
3429
3438
|
/**
|
|
3430
3439
|
* Trims an EvaluationResult for baseline storage.
|
|
3431
3440
|
* Strips large debug/audit fields (denylist approach) while preserving
|
|
3432
|
-
* all fields needed for regression comparison (scores,
|
|
3441
|
+
* all fields needed for regression comparison (scores, assertions, etc.).
|
|
3433
3442
|
*
|
|
3434
3443
|
* Returns a new object — the input is not mutated.
|
|
3435
3444
|
*/
|
|
@@ -3605,4 +3614,4 @@ type AgentKernel = {
|
|
|
3605
3614
|
};
|
|
3606
3615
|
declare function createAgentKernel(): AgentKernel;
|
|
3607
3616
|
|
|
3608
|
-
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type 
FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, 
TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome,
|
|
3617
|
+
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type 
FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, 
type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getSubagentsRoot, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, 
loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseAgentSkillsEvals, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
|