npm - @agentv/core - Versions diffs - 3.4.0 → 3.5.0 - Mend

@agentv/core 3.4.0 → 3.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18)

package/dist/agentv-provider-NFFLXG5M.js +7 -0
package/dist/{chunk-JO4HIAEF.js → chunk-EFR4JHPL.js} +1 -5
package/dist/chunk-EFR4JHPL.js.map +1 -0
package/dist/{chunk-Q52FQPKQ.js → chunk-W5YDZWT4.js} +2 -2
package/dist/chunk-W5YDZWT4.js.map +1 -0
package/dist/evaluation/validation/index.cjs.map +1 -1
package/dist/evaluation/validation/index.js +1 -1
package/dist/index.cjs +382 -436
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +55 -46
package/dist/index.d.ts +55 -46
package/dist/index.js +384 -435
package/dist/index.js.map +1 -1
package/package.json +1 -1
package/dist/agentv-provider-HDSAUUEF.js +0 -7
package/dist/chunk-JO4HIAEF.js.map +0 -1
package/dist/chunk-Q52FQPKQ.js.map +0 -1
package/dist/{agentv-provider-HDSAUUEF.js.map → agentv-provider-NFFLXG5M.js.map} +0 -0

package/dist/index.d.cts CHANGED Viewed

@@ -393,6 +393,12 @@ interface ExecutionMetrics {
  */
 declare function mergeExecutionMetrics(computed: TraceComputeResult, metrics?: ExecutionMetrics): TraceComputeResult;
+/** A single assertion verdict with optional evidence. */
+interface AssertionEntry {
+    readonly text: string;
+    readonly passed: boolean;
+    readonly evidence?: string;
+}
 /**
  * JSON primitive values appearing in AgentV payloads.
  */
@@ -1127,11 +1133,9 @@ interface EvaluationResult {
     readonly dataset?: string;
     readonly conversationId?: string;
     readonly score: number;
-    readonly hits: readonly string[];
-    readonly misses: readonly string[];
+    readonly assertions: readonly AssertionEntry[];
     readonly answer: string;
     readonly target: string;
-    readonly reasoning?: string;
     /** Token usage metrics from provider (optional) */
     readonly tokenUsage?: TokenUsage;
     /** Total cost in USD (optional, from provider) */
@@ -1196,9 +1200,7 @@ interface EvaluatorResult {
     readonly score: number;
     readonly weight?: number;
     readonly verdict?: EvaluationVerdict;
-    readonly hits: readonly string[];
-    readonly misses: readonly string[];
-    readonly reasoning?: string;
+    readonly assertions: readonly AssertionEntry[];
     readonly rawRequest?: JsonObject;
     readonly evaluatorProviderRequest?: JsonObject;
     readonly scores?: readonly EvaluatorResult[];
@@ -1213,10 +1215,6 @@ interface EvaluatorResult {
     /** ISO 8601 UTC timestamp when this grader finished executing. */
     readonly endedAt?: string;
 }
-/**
- * Convenience accessor matching the Python hit_count property.
- */
-declare function getHitCount(result: Pick<EvaluationResult, 'hits'>): number;
 declare const MetadataSchema: z.ZodObject<{
     name: z.ZodString;
@@ -2059,10 +2057,8 @@ interface EvaluationContext {
 interface EvaluationScore {
     readonly score: number;
     readonly verdict: EvaluationVerdict;
-    readonly hits: readonly string[];
-    readonly misses: readonly string[];
+    readonly assertions: readonly AssertionEntry[];
     readonly expectedAspectCount: number;
-    readonly reasoning?: string;
     readonly evaluatorRawRequest?: JsonObject;
     readonly scores?: readonly ChildEvaluatorResult[];
     /** Optional structured details from code graders (e.g., TP/TN/FP/FN counts, alignments). */
@@ -2076,9 +2072,7 @@ interface ChildEvaluatorResult {
     readonly score: number;
     readonly weight?: number;
     readonly verdict: EvaluationVerdict;
-    readonly hits: readonly string[];
-    readonly misses: readonly string[];
-    readonly reasoning?: string;
+    readonly assertions: readonly AssertionEntry[];
     readonly evaluatorRawRequest?: JsonObject;
     readonly scores?: readonly ChildEvaluatorResult[];
     /** Optional structured details from code graders (e.g., TP/TN/FP/FN counts, alignments). */
@@ -2103,7 +2097,7 @@ declare function parseJsonSafe(payload: string): Record<string, unknown> | undef
 declare function deepEqual(a: unknown, b: unknown): boolean;
 /**
  * Negate an evaluation score: inverts score (1 - score), swaps pass/fail verdict,
- * swaps hits/misses, and annotates reasoning.
+ * and flips passed on each assertion.
  */
 declare function negateScore(score: EvaluationScore): EvaluationScore;
@@ -2171,7 +2165,7 @@ interface ExecutionMetricsEvaluatorOptions {
  * Supports multiple threshold types: tool calls, LLM calls, tokens, cost, duration,
  * and exploration ratio. Only specified thresholds are checked.
  *
- * Score is proportional: hits.length / (hits.length + misses.length)
+ * Score is proportional: passed / total assertions
  */
 declare class ExecutionMetricsEvaluator implements Evaluator {
     readonly kind = "execution-metrics";
@@ -2255,19 +2249,33 @@ interface LlmGraderEvaluatorOptions {
 }
 declare const freeformEvaluationSchema: z.ZodObject<{
     score: z.ZodNumber;
-    hits: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
-    misses: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
-    reasoning: z.ZodOptional<z.ZodString>;
+    assertions: z.ZodOptional<z.ZodArray<z.ZodObject<{
+        text: z.ZodString;
+        passed: z.ZodBoolean;
+        evidence: z.ZodOptional<z.ZodString>;
+    }, "strip", z.ZodTypeAny, {
+        text: string;
+        passed: boolean;
+        evidence?: string | undefined;
+    }, {
+        text: string;
+        passed: boolean;
+        evidence?: string | undefined;
+    }>, "many">>;
 }, "strip", z.ZodTypeAny, {
     score: number;
-    hits?: string[] | undefined;
-    misses?: string[] | undefined;
-    reasoning?: string | undefined;
+    assertions?: {
+        text: string;
+        passed: boolean;
+        evidence?: string | undefined;
+    }[] | undefined;
 }, {
     score: number;
-    hits?: string[] | undefined;
-    misses?: string[] | undefined;
-    reasoning?: string | undefined;
+    assertions?: {
+        text: string;
+        passed: boolean;
+        evidence?: string | undefined;
+    }[] | undefined;
 }>;
 declare const rubricEvaluationSchema: z.ZodObject<{
     checks: z.ZodArray<z.ZodObject<{
@@ -2275,26 +2283,26 @@ declare const rubricEvaluationSchema: z.ZodObject<{
         satisfied: z.ZodBoolean;
         reasoning: z.ZodString;
     }, "strip", z.ZodTypeAny, {
-        reasoning: string;
         id: string;
+        reasoning: string;
         satisfied: boolean;
     }, {
-        reasoning: string;
         id: string;
+        reasoning: string;
         satisfied: boolean;
     }>, "many">;
     overall_reasoning: z.ZodString;
 }, "strip", z.ZodTypeAny, {
     checks: {
-        reasoning: string;
         id: string;
+        reasoning: string;
         satisfied: boolean;
     }[];
     overall_reasoning: string;
 }, {
     checks: {
-        reasoning: string;
         id: string;
+        reasoning: string;
         satisfied: boolean;
     }[];
     overall_reasoning: string;
@@ -2371,8 +2379,7 @@ declare function substituteVariables(template: string, variables: Record<string,
 declare function calculateRubricScore(result: z.infer<typeof rubricEvaluationSchema>, rubrics: readonly RubricItem[]): {
     score: number;
     verdict: EvaluationVerdict;
-    hits: string[];
-    misses: string[];
+    assertions: AssertionEntry[];
 };
 /**
  * Build the output schema for score-range rubric evaluation.
@@ -2474,12 +2481,14 @@ declare class ToolTrajectoryEvaluator implements Evaluator {
  * Deterministic assertion evaluators.
  *
  * Pure functions that check agent output against simple conditions
- * and return a binary score (0 or 1) with descriptive hits/misses.
+ * and return a binary score (0 or 1) with descriptive assertions.
  */
 type AssertionResult = {
     score: number;
-    hits: string[];
-    misses: string[];
+    assertions: {
+        text: string;
+        passed: boolean;
+    }[];
 };
 /** Checks if `output` contains the given `value` substring. */
 declare function runContainsAssertion(output: string, value: string): AssertionResult;
@@ -3067,10 +3076,6 @@ declare const AgentVConfigSchema: z.ZodObject<{
         maxCostUsd?: number | undefined;
     }>>;
 }, "strip", z.ZodTypeAny, {
-    output?: {
-        dir?: string | undefined;
-        format?: "yaml" | "jsonl" | "json" | "xml" | undefined;
-    } | undefined;
     execution?: {
         verbose?: boolean | undefined;
         workers?: number | undefined;
@@ -3084,15 +3089,15 @@ declare const AgentVConfigSchema: z.ZodObject<{
         enabled?: boolean | undefined;
         path?: string | undefined;
     } | undefined;
+    output?: {
+        dir?: string | undefined;
+        format?: "yaml" | "jsonl" | "json" | "xml" | undefined;
+    } | undefined;
     limits?: {
         maxDurationMs?: number | undefined;
         maxCostUsd?: number | undefined;
     } | undefined;
 }, {
-    output?: {
-        dir?: string | undefined;
-        format?: "yaml" | "jsonl" | "json" | "xml" | undefined;
-    } | undefined;
     execution?: {
         verbose?: boolean | undefined;
         workers?: number | undefined;
@@ -3106,6 +3111,10 @@ declare const AgentVConfigSchema: z.ZodObject<{
         enabled?: boolean | undefined;
         path?: string | undefined;
     } | undefined;
+    output?: {
+        dir?: string | undefined;
+        format?: "yaml" | "jsonl" | "json" | "xml" | undefined;
+    } | undefined;
     limits?: {
         maxDurationMs?: number | undefined;
         maxCostUsd?: number | undefined;
@@ -3429,7 +3438,7 @@ declare function getWorkspacePoolRoot(): string;
 /**
  * Trims an EvaluationResult for baseline storage.
  * Strips large debug/audit fields (denylist approach) while preserving
- * all fields needed for regression comparison (scores, hits, misses, etc.).
+ * all fields needed for regression comparison (scores, assertions, etc.).
  *
  * Returns a new object — the input is not mutated.
  */
@@ -3605,4 +3614,4 @@ type AgentKernel = {
 };
 declare function createAgentKernel(): AgentKernel;
-export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, 
type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, 
TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getHitCount, getOutputFilenames, getSubagentsRoot, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, 
loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseAgentSkillsEvals, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
+export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type 
FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, 
type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getSubagentsRoot, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, 
loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseAgentSkillsEvals, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };

package/dist/index.d.ts CHANGED Viewed

@@ -393,6 +393,12 @@ interface ExecutionMetrics {
  */
 declare function mergeExecutionMetrics(computed: TraceComputeResult, metrics?: ExecutionMetrics): TraceComputeResult;
+/** A single assertion verdict with optional evidence. */
+interface AssertionEntry {
+    readonly text: string;
+    readonly passed: boolean;
+    readonly evidence?: string;
+}
 /**
  * JSON primitive values appearing in AgentV payloads.
  */
@@ -1127,11 +1133,9 @@ interface EvaluationResult {
     readonly dataset?: string;
     readonly conversationId?: string;
     readonly score: number;
-    readonly hits: readonly string[];
-    readonly misses: readonly string[];
+    readonly assertions: readonly AssertionEntry[];
     readonly answer: string;
     readonly target: string;
-    readonly reasoning?: string;
     /** Token usage metrics from provider (optional) */
     readonly tokenUsage?: TokenUsage;
     /** Total cost in USD (optional, from provider) */
@@ -1196,9 +1200,7 @@ interface EvaluatorResult {
     readonly score: number;
     readonly weight?: number;
     readonly verdict?: EvaluationVerdict;
-    readonly hits: readonly string[];
-    readonly misses: readonly string[];
-    readonly reasoning?: string;
+    readonly assertions: readonly AssertionEntry[];
     readonly rawRequest?: JsonObject;
     readonly evaluatorProviderRequest?: JsonObject;
     readonly scores?: readonly EvaluatorResult[];
@@ -1213,10 +1215,6 @@ interface EvaluatorResult {
     /** ISO 8601 UTC timestamp when this grader finished executing. */
     readonly endedAt?: string;
 }
-/**
- * Convenience accessor matching the Python hit_count property.
- */
-declare function getHitCount(result: Pick<EvaluationResult, 'hits'>): number;
 declare const MetadataSchema: z.ZodObject<{
     name: z.ZodString;
@@ -2059,10 +2057,8 @@ interface EvaluationContext {
 interface EvaluationScore {
     readonly score: number;
     readonly verdict: EvaluationVerdict;
-    readonly hits: readonly string[];
-    readonly misses: readonly string[];
+    readonly assertions: readonly AssertionEntry[];
     readonly expectedAspectCount: number;
-    readonly reasoning?: string;
     readonly evaluatorRawRequest?: JsonObject;
     readonly scores?: readonly ChildEvaluatorResult[];
     /** Optional structured details from code graders (e.g., TP/TN/FP/FN counts, alignments). */
@@ -2076,9 +2072,7 @@ interface ChildEvaluatorResult {
     readonly score: number;
     readonly weight?: number;
     readonly verdict: EvaluationVerdict;
-    readonly hits: readonly string[];
-    readonly misses: readonly string[];
-    readonly reasoning?: string;
+    readonly assertions: readonly AssertionEntry[];
     readonly evaluatorRawRequest?: JsonObject;
     readonly scores?: readonly ChildEvaluatorResult[];
     /** Optional structured details from code graders (e.g., TP/TN/FP/FN counts, alignments). */
@@ -2103,7 +2097,7 @@ declare function parseJsonSafe(payload: string): Record<string, unknown> | undef
 declare function deepEqual(a: unknown, b: unknown): boolean;
 /**
  * Negate an evaluation score: inverts score (1 - score), swaps pass/fail verdict,
- * swaps hits/misses, and annotates reasoning.
+ * and flips passed on each assertion.
  */
 declare function negateScore(score: EvaluationScore): EvaluationScore;
@@ -2171,7 +2165,7 @@ interface ExecutionMetricsEvaluatorOptions {
  * Supports multiple threshold types: tool calls, LLM calls, tokens, cost, duration,
  * and exploration ratio. Only specified thresholds are checked.
  *
- * Score is proportional: hits.length / (hits.length + misses.length)
+ * Score is proportional: passed / total assertions
  */
 declare class ExecutionMetricsEvaluator implements Evaluator {
     readonly kind = "execution-metrics";
@@ -2255,19 +2249,33 @@ interface LlmGraderEvaluatorOptions {
 }
 declare const freeformEvaluationSchema: z.ZodObject<{
     score: z.ZodNumber;
-    hits: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
-    misses: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
-    reasoning: z.ZodOptional<z.ZodString>;
+    assertions: z.ZodOptional<z.ZodArray<z.ZodObject<{
+        text: z.ZodString;
+        passed: z.ZodBoolean;
+        evidence: z.ZodOptional<z.ZodString>;
+    }, "strip", z.ZodTypeAny, {
+        text: string;
+        passed: boolean;
+        evidence?: string | undefined;
+    }, {
+        text: string;
+        passed: boolean;
+        evidence?: string | undefined;
+    }>, "many">>;
 }, "strip", z.ZodTypeAny, {
     score: number;
-    hits?: string[] | undefined;
-    misses?: string[] | undefined;
-    reasoning?: string | undefined;
+    assertions?: {
+        text: string;
+        passed: boolean;
+        evidence?: string | undefined;
+    }[] | undefined;
 }, {
     score: number;
-    hits?: string[] | undefined;
-    misses?: string[] | undefined;
-    reasoning?: string | undefined;
+    assertions?: {
+        text: string;
+        passed: boolean;
+        evidence?: string | undefined;
+    }[] | undefined;
 }>;
 declare const rubricEvaluationSchema: z.ZodObject<{
     checks: z.ZodArray<z.ZodObject<{
@@ -2275,26 +2283,26 @@ declare const rubricEvaluationSchema: z.ZodObject<{
         satisfied: z.ZodBoolean;
         reasoning: z.ZodString;
     }, "strip", z.ZodTypeAny, {
-        reasoning: string;
         id: string;
+        reasoning: string;
         satisfied: boolean;
     }, {
-        reasoning: string;
         id: string;
+        reasoning: string;
         satisfied: boolean;
     }>, "many">;
     overall_reasoning: z.ZodString;
 }, "strip", z.ZodTypeAny, {
     checks: {
-        reasoning: string;
         id: string;
+        reasoning: string;
         satisfied: boolean;
     }[];
     overall_reasoning: string;
 }, {
     checks: {
-        reasoning: string;
         id: string;
+        reasoning: string;
         satisfied: boolean;
     }[];
     overall_reasoning: string;
@@ -2371,8 +2379,7 @@ declare function substituteVariables(template: string, variables: Record<string,
 declare function calculateRubricScore(result: z.infer<typeof rubricEvaluationSchema>, rubrics: readonly RubricItem[]): {
     score: number;
     verdict: EvaluationVerdict;
-    hits: string[];
-    misses: string[];
+    assertions: AssertionEntry[];
 };
 /**
  * Build the output schema for score-range rubric evaluation.
@@ -2474,12 +2481,14 @@ declare class ToolTrajectoryEvaluator implements Evaluator {
  * Deterministic assertion evaluators.
  *
  * Pure functions that check agent output against simple conditions
- * and return a binary score (0 or 1) with descriptive hits/misses.
+ * and return a binary score (0 or 1) with descriptive assertions.
  */
 type AssertionResult = {
     score: number;
-    hits: string[];
-    misses: string[];
+    assertions: {
+        text: string;
+        passed: boolean;
+    }[];
 };
 /** Checks if `output` contains the given `value` substring. */
 declare function runContainsAssertion(output: string, value: string): AssertionResult;
@@ -3067,10 +3076,6 @@ declare const AgentVConfigSchema: z.ZodObject<{
         maxCostUsd?: number | undefined;
     }>>;
 }, "strip", z.ZodTypeAny, {
-    output?: {
-        dir?: string | undefined;
-        format?: "yaml" | "jsonl" | "json" | "xml" | undefined;
-    } | undefined;
     execution?: {
         verbose?: boolean | undefined;
         workers?: number | undefined;
@@ -3084,15 +3089,15 @@ declare const AgentVConfigSchema: z.ZodObject<{
         enabled?: boolean | undefined;
         path?: string | undefined;
     } | undefined;
+    output?: {
+        dir?: string | undefined;
+        format?: "yaml" | "jsonl" | "json" | "xml" | undefined;
+    } | undefined;
     limits?: {
         maxDurationMs?: number | undefined;
         maxCostUsd?: number | undefined;
     } | undefined;
 }, {
-    output?: {
-        dir?: string | undefined;
-        format?: "yaml" | "jsonl" | "json" | "xml" | undefined;
-    } | undefined;
     execution?: {
         verbose?: boolean | undefined;
         workers?: number | undefined;
@@ -3106,6 +3111,10 @@ declare const AgentVConfigSchema: z.ZodObject<{
         enabled?: boolean | undefined;
         path?: string | undefined;
     } | undefined;
+    output?: {
+        dir?: string | undefined;
+        format?: "yaml" | "jsonl" | "json" | "xml" | undefined;
+    } | undefined;
     limits?: {
         maxDurationMs?: number | undefined;
         maxCostUsd?: number | undefined;
@@ -3429,7 +3438,7 @@ declare function getWorkspacePoolRoot(): string;
 /**
  * Trims an EvaluationResult for baseline storage.
  * Strips large debug/audit fields (denylist approach) while preserving
- * all fields needed for regression comparison (scores, hits, misses, etc.).
+ * all fields needed for regression comparison (scores, assertions, etc.).
  *
  * Returns a new object — the input is not mutated.
  */
@@ -3605,4 +3614,4 @@ type AgentKernel = {
 };
 declare function createAgentKernel(): AgentKernel;
-export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, 
type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, 
TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getHitCount, getOutputFilenames, getSubagentsRoot, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, 
loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseAgentSkillsEvals, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
+export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type 
FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, 
type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getSubagentsRoot, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, 
loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseAgentSkillsEvals, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };