npm - @agentv/core - Versions diffs - 2.0.2 → 2.1.0 - Mend

@agentv/core 2.0.2 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

package/dist/index.d.cts CHANGED Viewed

@@ -204,6 +204,15 @@ declare function isTestMessage(value: unknown): value is TestMessage;
 declare const EVALUATOR_KIND_VALUES: readonly ["code_judge", "llm_judge", "rubric", "composite", "tool_trajectory", "field_accuracy", "latency", "cost", "token_usage"];
 type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
 declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
+/**
+ * Configuration for enabling target access in code_judge evaluators.
+ * When present, the runtime will start a local proxy server that allows
+ * the script to invoke configured targets without direct credential access.
+ */
+type TargetAccessConfig = {
+    /** Maximum number of target invocations allowed per execution (default: 50) */
+    readonly max_calls?: number;
+};
 type CodeEvaluatorConfig = {
     readonly name: string;
     readonly type: 'code';
@@ -214,6 +223,8 @@ type CodeEvaluatorConfig = {
     readonly weight?: number;
     /** Pass-through configuration for the code_judge script (any unrecognized YAML properties) */
     readonly config?: JsonObject;
+    /** When present, enables target access for the script via local proxy */
+    readonly target?: TargetAccessConfig;
 };
 type LlmJudgeEvaluatorConfig = {
     readonly name: string;
@@ -343,7 +354,6 @@ interface EvalCase {
     readonly guideline_paths: readonly string[];
     readonly guideline_patterns?: readonly string[];
     readonly file_paths: readonly string[];
-    readonly code_snippets: readonly string[];
     readonly expected_outcome: string;
     readonly evaluator?: EvaluatorKind;
     readonly evaluators?: readonly EvaluatorConfig[];
@@ -383,6 +393,8 @@ interface EvaluatorResult {
     readonly rawRequest?: JsonObject;
     readonly evaluatorProviderRequest?: JsonObject;
     readonly evaluatorResults?: readonly EvaluatorResult[];
+    /** Optional structured details from code judges (e.g., TP/TN/FP/FN counts). */
+    readonly details?: JsonObject;
 }
 /**
  * Convenience accessor matching the Python hit_count property.
@@ -566,10 +578,6 @@ interface TargetDefinition {
  * - 'lm': Embedded file content with XML tags (for language model providers)
  */
 type FormattingMode = 'agent' | 'lm';
-/**
- * Extract fenced code blocks from AgentV user segments.
- */
-declare function extractCodeBlocks(segments: readonly JsonObject[]): readonly string[];
 /**
  * Build prompt inputs by consolidating user request context and guideline content.
@@ -962,6 +970,11 @@ declare function subscribeToClaudeCodeLogEntries(listener: ClaudeCodeLogListener
 declare function createProvider(target: ResolvedTarget): Provider;
 declare function resolveAndCreateProvider(definition: TargetDefinition, env?: EnvLookup): Provider;
+/**
+ * Function to resolve a target name to a provider.
+ * Used by code judges to support target override.
+ */
+type TargetResolver = (targetName: string) => Provider | undefined;
 interface EvaluationContext {
     readonly evalCase: EvalCase;
     readonly candidate: string;
@@ -982,6 +995,10 @@ interface EvaluationContext {
     readonly outputMessages?: readonly OutputMessage[];
     /** Lightweight summary of trace events (if available) */
     readonly traceSummary?: TraceSummary;
+    /** Resolver for target override in code judges */
+    readonly targetResolver?: TargetResolver;
+    /** List of available target names for code judges */
+    readonly availableTargets?: readonly string[];
 }
 interface EvaluationScore {
     readonly score: number;
@@ -992,6 +1009,8 @@ interface EvaluationScore {
     readonly reasoning?: string;
     readonly evaluatorRawRequest?: JsonObject;
     readonly evaluatorResults?: readonly ChildEvaluatorResult[];
+    /** Optional structured details from code judges (e.g., TP/TN/FP/FN counts, alignments). */
+    readonly details?: JsonObject;
 }
 interface ChildEvaluatorResult {
     readonly name: string;
@@ -1004,37 +1023,37 @@ interface ChildEvaluatorResult {
     readonly reasoning?: string;
     readonly evaluatorRawRequest?: JsonObject;
     readonly evaluatorResults?: readonly ChildEvaluatorResult[];
+    /** Optional structured details from code judges (e.g., TP/TN/FP/FN counts, alignments). */
+    readonly details?: JsonObject;
 }
 interface Evaluator {
     readonly kind: string;
     evaluate(context: EvaluationContext): Promise<EvaluationScore> | EvaluationScore;
 }
-type JudgeProviderResolver = (context: EvaluationContext) => Promise<Provider | undefined>;
-interface LlmJudgeEvaluatorOptions {
-    readonly resolveJudgeProvider: JudgeProviderResolver;
-    readonly maxOutputTokens?: number;
-    readonly temperature?: number;
-    readonly evaluatorTemplate?: string;
-}
-declare class LlmJudgeEvaluator implements Evaluator {
-    readonly kind = "llm_judge";
-    private readonly resolveJudgeProvider;
-    private readonly maxOutputTokens?;
-    private readonly temperature?;
-    private readonly evaluatorTemplate?;
-    constructor(options: LlmJudgeEvaluatorOptions);
-    evaluate(context: EvaluationContext): Promise<EvaluationScore>;
-    private evaluateFreeform;
-    private evaluateWithRubrics;
-    private buildRubricPrompt;
-    private runWithRetry;
+interface EvaluatorFactory {
+    create(config: EvaluatorConfig, context: EvaluationContext): Evaluator;
 }
+declare function scoreToVerdict(score: number): EvaluationVerdict;
+declare function clampScore(value: number): number;
+declare function extractJsonBlob(text: string): string | undefined;
+declare function parseJsonFromText(text: string): unknown;
+declare function isNonEmptyString(value: unknown): value is string;
+declare function parseJsonSafe(payload: string): Record<string, unknown> | undefined;
+/**
+ * Deep equality check for two values.
+ * Handles primitives, arrays, and plain objects.
+ */
+declare function deepEqual(a: unknown, b: unknown): boolean;
 interface CodeEvaluatorOptions {
     readonly script: readonly string[];
     readonly cwd?: string;
     readonly agentTimeoutMs?: number;
     /** Pass-through configuration from YAML (any unrecognized properties) */
     readonly config?: Record<string, unknown>;
+    /** Target access config - when present, enables target invocation for the script */
+    readonly target?: TargetAccessConfig;
 }
 declare class CodeEvaluator implements Evaluator {
     readonly kind = "code";
@@ -1042,29 +1061,44 @@ declare class CodeEvaluator implements Evaluator {
     private readonly cwd?;
     private readonly agentTimeoutMs?;
     private readonly config?;
+    private readonly target?;
     constructor(options: CodeEvaluatorOptions);
     evaluate(context: EvaluationContext): Promise<EvaluationScore>;
 }
-interface ToolTrajectoryEvaluatorOptions {
-    readonly config: ToolTrajectoryEvaluatorConfig;
+declare function executeScript(scriptPath: readonly string[] | string, input: string, agentTimeoutMs?: number, cwd?: string, env?: Record<string, string>): Promise<string>;
+interface CompositeEvaluatorOptions {
+    readonly config: CompositeEvaluatorConfig;
+    readonly evaluatorFactory: EvaluatorFactory;
+    readonly cwd?: string;
 }
-declare class ToolTrajectoryEvaluator implements Evaluator {
-    readonly kind = "tool_trajectory";
+declare class CompositeEvaluator implements Evaluator {
+    readonly kind = "composite";
     private readonly config;
-    constructor(options: ToolTrajectoryEvaluatorOptions);
+    private readonly evaluatorFactory;
+    private readonly cwd?;
+    constructor(options: CompositeEvaluatorOptions);
+    evaluate(context: EvaluationContext): Promise<EvaluationScore>;
+    private aggregate;
+    private runWeightedAverage;
+    private runCodeAggregator;
+    private runLlmAggregator;
+}
+interface CostEvaluatorOptions {
+    readonly config: CostEvaluatorConfig;
+}
+/**
+ * Evaluator that checks execution cost against a budget.
+ * Uses traceSummary.costUsd from the evaluation context.
+ */
+declare class CostEvaluator implements Evaluator {
+    readonly kind = "cost";
+    private readonly config;
+    constructor(options: CostEvaluatorOptions);
     evaluate(context: EvaluationContext): EvaluationScore;
-    /**
-     * Extract tool calls from output messages.
-     */
-    private extractToolCallsFromMessages;
-    /**
-     * Build a summary from extracted tool calls.
-     */
-    private buildSummary;
-    private evaluateAnyOrder;
-    private evaluateInOrder;
-    private evaluateExact;
 }
 interface FieldAccuracyEvaluatorOptions {
     readonly config: FieldAccuracyEvaluatorConfig;
 }
@@ -1103,26 +1137,7 @@ declare class FieldAccuracyEvaluator implements Evaluator {
      */
     private aggregateResults;
 }
-interface EvaluatorFactory {
-    create(config: EvaluatorConfig, context: EvaluationContext): Evaluator;
-}
-interface CompositeEvaluatorOptions {
-    readonly config: CompositeEvaluatorConfig;
-    readonly evaluatorFactory: EvaluatorFactory;
-    readonly cwd?: string;
-}
-declare class CompositeEvaluator implements Evaluator {
-    readonly kind = "composite";
-    private readonly config;
-    private readonly evaluatorFactory;
-    private readonly cwd?;
-    constructor(options: CompositeEvaluatorOptions);
-    evaluate(context: EvaluationContext): Promise<EvaluationScore>;
-    private aggregate;
-    private runWeightedAverage;
-    private runCodeAggregator;
-    private runLlmAggregator;
-}
 interface LatencyEvaluatorOptions {
     readonly config: LatencyEvaluatorConfig;
 }
@@ -1136,19 +1151,50 @@ declare class LatencyEvaluator implements Evaluator {
     constructor(options: LatencyEvaluatorOptions);
     evaluate(context: EvaluationContext): EvaluationScore;
 }
-interface CostEvaluatorOptions {
-    readonly config: CostEvaluatorConfig;
+type JudgeProviderResolver = (context: EvaluationContext) => Promise<Provider | undefined>;
+interface LlmJudgeEvaluatorOptions {
+    readonly resolveJudgeProvider: JudgeProviderResolver;
+    readonly maxOutputTokens?: number;
+    readonly temperature?: number;
+    readonly evaluatorTemplate?: string;
+}
+declare const freeformEvaluationSchema: z.ZodObject<{
+    score: z.ZodNumber;
+    hits: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
+    misses: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
+    reasoning: z.ZodOptional<z.ZodString>;
+}, "strip", z.ZodTypeAny, {
+    score: number;
+    hits?: string[] | undefined;
+    misses?: string[] | undefined;
+    reasoning?: string | undefined;
+}, {
+    score: number;
+    hits?: string[] | undefined;
+    misses?: string[] | undefined;
+    reasoning?: string | undefined;
+}>;
+declare class LlmJudgeEvaluator implements Evaluator {
+    readonly kind = "llm_judge";
+    private readonly resolveJudgeProvider;
+    private readonly maxOutputTokens?;
+    private readonly temperature?;
+    private readonly evaluatorTemplate?;
+    constructor(options: LlmJudgeEvaluatorOptions);
+    evaluate(context: EvaluationContext): Promise<EvaluationScore>;
+    private evaluateFreeform;
+    private evaluateWithRubrics;
+    private buildRubricPrompt;
+    private runWithRetry;
 }
 /**
- * Evaluator that checks execution cost against a budget.
- * Uses traceSummary.costUsd from the evaluation context.
+ * Build the mandatory output schema that all evaluators must follow.
+ * This schema is always appended to the evaluator template.
  */
-declare class CostEvaluator implements Evaluator {
-    readonly kind = "cost";
-    private readonly config;
-    constructor(options: CostEvaluatorOptions);
-    evaluate(context: EvaluationContext): EvaluationScore;
-}
+declare function buildOutputSchema(): string;
 interface TokenUsageEvaluatorOptions {
     readonly config: TokenUsageEvaluatorConfig;
 }
@@ -1163,6 +1209,27 @@ declare class TokenUsageEvaluator implements Evaluator {
     evaluate(context: EvaluationContext): EvaluationScore;
 }
+interface ToolTrajectoryEvaluatorOptions {
+    readonly config: ToolTrajectoryEvaluatorConfig;
+}
+declare class ToolTrajectoryEvaluator implements Evaluator {
+    readonly kind = "tool_trajectory";
+    private readonly config;
+    constructor(options: ToolTrajectoryEvaluatorOptions);
+    evaluate(context: EvaluationContext): EvaluationScore;
+    /**
+     * Extract tool calls from output messages.
+     */
+    private extractToolCallsFromMessages;
+    /**
+     * Build a summary from extracted tool calls.
+     */
+    private buildSummary;
+    private evaluateAnyOrder;
+    private evaluateInOrder;
+    private evaluateExact;
+}
 type MaybePromise<T> = T | Promise<T>;
 interface EvaluationCache {
     get(key: string): MaybePromise<ProviderResponse | undefined>;
@@ -1182,6 +1249,10 @@ interface RunEvalCaseOptions {
     readonly useCache?: boolean;
     readonly signal?: AbortSignal;
     readonly judgeProvider?: Provider;
+    /** Resolver for target override in code judges */
+    readonly targetResolver?: (name: string) => Provider | undefined;
+    /** List of available target names for code judges */
+    readonly availableTargets?: readonly string[];
 }
 interface ProgressEvent {
     readonly workerId: number;
@@ -1230,4 +1301,4 @@ type AgentKernel = {
 };
 declare function createAgentKernel(): AgentKernel;
-export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type ClaudeCodeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EXPLORATION_TOOLS, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, type ExecutionMetrics, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ProviderTokenUsage, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceSummary, type UserTestMessage, type VSCodeResolvedConfig, avgToolDurationMs, buildDirectoryChain, buildPromptInputs, buildSearchRoots, computeTraceSummary, consumeClaudeCodeLogEntries, consumeCodexLogEntries, consumePiLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, explorationRatio, extractCodeBlocks, fileExists, findGitRoot, generateRubrics, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, mergeExecutionMetrics, normalizeLineEndings, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToClaudeCodeLogEntries, subscribeToCodexLogEntries, subscribeToPiLogEntries, tokensPerTool };
+export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type ClaudeCodeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EXPLORATION_TOOLS, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, type ExecutionMetrics, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ProviderTokenUsage, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceSummary, type UserTestMessage, type VSCodeResolvedConfig, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildSearchRoots, clampScore, computeTraceSummary, consumeClaudeCodeLogEntries, consumeCodexLogEntries, consumePiLogEntries, createAgentKernel, createProvider, deepEqual, ensureVSCodeSubagents, executeScript, explorationRatio, extractJsonBlob, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, mergeExecutionMetrics, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, scoreToVerdict, subscribeToClaudeCodeLogEntries, subscribeToCodexLogEntries, subscribeToPiLogEntries, tokensPerTool };

package/dist/index.d.ts CHANGED Viewed

@@ -204,6 +204,15 @@ declare function isTestMessage(value: unknown): value is TestMessage;
 declare const EVALUATOR_KIND_VALUES: readonly ["code_judge", "llm_judge", "rubric", "composite", "tool_trajectory", "field_accuracy", "latency", "cost", "token_usage"];
 type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
 declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
+/**
+ * Configuration for enabling target access in code_judge evaluators.
+ * When present, the runtime will start a local proxy server that allows
+ * the script to invoke configured targets without direct credential access.
+ */
+type TargetAccessConfig = {
+    /** Maximum number of target invocations allowed per execution (default: 50) */
+    readonly max_calls?: number;
+};
 type CodeEvaluatorConfig = {
     readonly name: string;
     readonly type: 'code';
@@ -214,6 +223,8 @@ type CodeEvaluatorConfig = {
     readonly weight?: number;
     /** Pass-through configuration for the code_judge script (any unrecognized YAML properties) */
     readonly config?: JsonObject;
+    /** When present, enables target access for the script via local proxy */
+    readonly target?: TargetAccessConfig;
 };
 type LlmJudgeEvaluatorConfig = {
     readonly name: string;
@@ -343,7 +354,6 @@ interface EvalCase {
     readonly guideline_paths: readonly string[];
     readonly guideline_patterns?: readonly string[];
     readonly file_paths: readonly string[];
-    readonly code_snippets: readonly string[];
     readonly expected_outcome: string;
     readonly evaluator?: EvaluatorKind;
     readonly evaluators?: readonly EvaluatorConfig[];
@@ -383,6 +393,8 @@ interface EvaluatorResult {
     readonly rawRequest?: JsonObject;
     readonly evaluatorProviderRequest?: JsonObject;
     readonly evaluatorResults?: readonly EvaluatorResult[];
+    /** Optional structured details from code judges (e.g., TP/TN/FP/FN counts). */
+    readonly details?: JsonObject;
 }
 /**
  * Convenience accessor matching the Python hit_count property.
@@ -566,10 +578,6 @@ interface TargetDefinition {
  * - 'lm': Embedded file content with XML tags (for language model providers)
  */
 type FormattingMode = 'agent' | 'lm';
-/**
- * Extract fenced code blocks from AgentV user segments.
- */
-declare function extractCodeBlocks(segments: readonly JsonObject[]): readonly string[];
 /**
  * Build prompt inputs by consolidating user request context and guideline content.
@@ -962,6 +970,11 @@ declare function subscribeToClaudeCodeLogEntries(listener: ClaudeCodeLogListener
 declare function createProvider(target: ResolvedTarget): Provider;
 declare function resolveAndCreateProvider(definition: TargetDefinition, env?: EnvLookup): Provider;
+/**
+ * Function to resolve a target name to a provider.
+ * Used by code judges to support target override.
+ */
+type TargetResolver = (targetName: string) => Provider | undefined;
 interface EvaluationContext {
     readonly evalCase: EvalCase;
     readonly candidate: string;
@@ -982,6 +995,10 @@ interface EvaluationContext {
     readonly outputMessages?: readonly OutputMessage[];
     /** Lightweight summary of trace events (if available) */
     readonly traceSummary?: TraceSummary;
+    /** Resolver for target override in code judges */
+    readonly targetResolver?: TargetResolver;
+    /** List of available target names for code judges */
+    readonly availableTargets?: readonly string[];
 }
 interface EvaluationScore {
     readonly score: number;
@@ -992,6 +1009,8 @@ interface EvaluationScore {
     readonly reasoning?: string;
     readonly evaluatorRawRequest?: JsonObject;
     readonly evaluatorResults?: readonly ChildEvaluatorResult[];
+    /** Optional structured details from code judges (e.g., TP/TN/FP/FN counts, alignments). */
+    readonly details?: JsonObject;
 }
 interface ChildEvaluatorResult {
     readonly name: string;
@@ -1004,37 +1023,37 @@ interface ChildEvaluatorResult {
     readonly reasoning?: string;
     readonly evaluatorRawRequest?: JsonObject;
     readonly evaluatorResults?: readonly ChildEvaluatorResult[];
+    /** Optional structured details from code judges (e.g., TP/TN/FP/FN counts, alignments). */
+    readonly details?: JsonObject;
 }
 interface Evaluator {
     readonly kind: string;
     evaluate(context: EvaluationContext): Promise<EvaluationScore> | EvaluationScore;
 }
-type JudgeProviderResolver = (context: EvaluationContext) => Promise<Provider | undefined>;
-interface LlmJudgeEvaluatorOptions {
-    readonly resolveJudgeProvider: JudgeProviderResolver;
-    readonly maxOutputTokens?: number;
-    readonly temperature?: number;
-    readonly evaluatorTemplate?: string;
-}
-declare class LlmJudgeEvaluator implements Evaluator {
-    readonly kind = "llm_judge";
-    private readonly resolveJudgeProvider;
-    private readonly maxOutputTokens?;
-    private readonly temperature?;
-    private readonly evaluatorTemplate?;
-    constructor(options: LlmJudgeEvaluatorOptions);
-    evaluate(context: EvaluationContext): Promise<EvaluationScore>;
-    private evaluateFreeform;
-    private evaluateWithRubrics;
-    private buildRubricPrompt;
-    private runWithRetry;
+interface EvaluatorFactory {
+    create(config: EvaluatorConfig, context: EvaluationContext): Evaluator;
 }
+declare function scoreToVerdict(score: number): EvaluationVerdict;
+declare function clampScore(value: number): number;
+declare function extractJsonBlob(text: string): string | undefined;
+declare function parseJsonFromText(text: string): unknown;
+declare function isNonEmptyString(value: unknown): value is string;
+declare function parseJsonSafe(payload: string): Record<string, unknown> | undefined;
+/**
+ * Deep equality check for two values.
+ * Handles primitives, arrays, and plain objects.
+ */
+declare function deepEqual(a: unknown, b: unknown): boolean;
 interface CodeEvaluatorOptions {
     readonly script: readonly string[];
     readonly cwd?: string;
     readonly agentTimeoutMs?: number;
     /** Pass-through configuration from YAML (any unrecognized properties) */
     readonly config?: Record<string, unknown>;
+    /** Target access config - when present, enables target invocation for the script */
+    readonly target?: TargetAccessConfig;
 }
 declare class CodeEvaluator implements Evaluator {
     readonly kind = "code";
@@ -1042,29 +1061,44 @@ declare class CodeEvaluator implements Evaluator {
     private readonly cwd?;
     private readonly agentTimeoutMs?;
     private readonly config?;
+    private readonly target?;
     constructor(options: CodeEvaluatorOptions);
     evaluate(context: EvaluationContext): Promise<EvaluationScore>;
 }
-interface ToolTrajectoryEvaluatorOptions {
-    readonly config: ToolTrajectoryEvaluatorConfig;
+declare function executeScript(scriptPath: readonly string[] | string, input: string, agentTimeoutMs?: number, cwd?: string, env?: Record<string, string>): Promise<string>;
+interface CompositeEvaluatorOptions {
+    readonly config: CompositeEvaluatorConfig;
+    readonly evaluatorFactory: EvaluatorFactory;
+    readonly cwd?: string;
 }
-declare class ToolTrajectoryEvaluator implements Evaluator {
-    readonly kind = "tool_trajectory";
+declare class CompositeEvaluator implements Evaluator {
+    readonly kind = "composite";
     private readonly config;
-    constructor(options: ToolTrajectoryEvaluatorOptions);
+    private readonly evaluatorFactory;
+    private readonly cwd?;
+    constructor(options: CompositeEvaluatorOptions);
+    evaluate(context: EvaluationContext): Promise<EvaluationScore>;
+    private aggregate;
+    private runWeightedAverage;
+    private runCodeAggregator;
+    private runLlmAggregator;
+}
+interface CostEvaluatorOptions {
+    readonly config: CostEvaluatorConfig;
+}
+/**
+ * Evaluator that checks execution cost against a budget.
+ * Uses traceSummary.costUsd from the evaluation context.
+ */
+declare class CostEvaluator implements Evaluator {
+    readonly kind = "cost";
+    private readonly config;
+    constructor(options: CostEvaluatorOptions);
     evaluate(context: EvaluationContext): EvaluationScore;
-    /**
-     * Extract tool calls from output messages.
-     */
-    private extractToolCallsFromMessages;
-    /**
-     * Build a summary from extracted tool calls.
-     */
-    private buildSummary;
-    private evaluateAnyOrder;
-    private evaluateInOrder;
-    private evaluateExact;
 }
 interface FieldAccuracyEvaluatorOptions {
     readonly config: FieldAccuracyEvaluatorConfig;
 }
@@ -1103,26 +1137,7 @@ declare class FieldAccuracyEvaluator implements Evaluator {
      */
     private aggregateResults;
 }
-interface EvaluatorFactory {
-    create(config: EvaluatorConfig, context: EvaluationContext): Evaluator;
-}
-interface CompositeEvaluatorOptions {
-    readonly config: CompositeEvaluatorConfig;
-    readonly evaluatorFactory: EvaluatorFactory;
-    readonly cwd?: string;
-}
-declare class CompositeEvaluator implements Evaluator {
-    readonly kind = "composite";
-    private readonly config;
-    private readonly evaluatorFactory;
-    private readonly cwd?;
-    constructor(options: CompositeEvaluatorOptions);
-    evaluate(context: EvaluationContext): Promise<EvaluationScore>;
-    private aggregate;
-    private runWeightedAverage;
-    private runCodeAggregator;
-    private runLlmAggregator;
-}
 interface LatencyEvaluatorOptions {
     readonly config: LatencyEvaluatorConfig;
 }
@@ -1136,19 +1151,50 @@ declare class LatencyEvaluator implements Evaluator {
     constructor(options: LatencyEvaluatorOptions);
     evaluate(context: EvaluationContext): EvaluationScore;
 }
-interface CostEvaluatorOptions {
-    readonly config: CostEvaluatorConfig;
+type JudgeProviderResolver = (context: EvaluationContext) => Promise<Provider | undefined>;
+interface LlmJudgeEvaluatorOptions {
+    readonly resolveJudgeProvider: JudgeProviderResolver;
+    readonly maxOutputTokens?: number;
+    readonly temperature?: number;
+    readonly evaluatorTemplate?: string;
+}
+declare const freeformEvaluationSchema: z.ZodObject<{
+    score: z.ZodNumber;
+    hits: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
+    misses: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
+    reasoning: z.ZodOptional<z.ZodString>;
+}, "strip", z.ZodTypeAny, {
+    score: number;
+    hits?: string[] | undefined;
+    misses?: string[] | undefined;
+    reasoning?: string | undefined;
+}, {
+    score: number;
+    hits?: string[] | undefined;
+    misses?: string[] | undefined;
+    reasoning?: string | undefined;
+}>;
+declare class LlmJudgeEvaluator implements Evaluator {
+    readonly kind = "llm_judge";
+    private readonly resolveJudgeProvider;
+    private readonly maxOutputTokens?;
+    private readonly temperature?;
+    private readonly evaluatorTemplate?;
+    constructor(options: LlmJudgeEvaluatorOptions);
+    evaluate(context: EvaluationContext): Promise<EvaluationScore>;
+    private evaluateFreeform;
+    private evaluateWithRubrics;
+    private buildRubricPrompt;
+    private runWithRetry;
 }
 /**
- * Evaluator that checks execution cost against a budget.
- * Uses traceSummary.costUsd from the evaluation context.
+ * Build the mandatory output schema that all evaluators must follow.
+ * This schema is always appended to the evaluator template.
  */
-declare class CostEvaluator implements Evaluator {
-    readonly kind = "cost";
-    private readonly config;
-    constructor(options: CostEvaluatorOptions);
-    evaluate(context: EvaluationContext): EvaluationScore;
-}
+declare function buildOutputSchema(): string;
 interface TokenUsageEvaluatorOptions {
     readonly config: TokenUsageEvaluatorConfig;
 }
@@ -1163,6 +1209,27 @@ declare class TokenUsageEvaluator implements Evaluator {
     evaluate(context: EvaluationContext): EvaluationScore;
 }
+interface ToolTrajectoryEvaluatorOptions {
+    readonly config: ToolTrajectoryEvaluatorConfig;
+}
+declare class ToolTrajectoryEvaluator implements Evaluator {
+    readonly kind = "tool_trajectory";
+    private readonly config;
+    constructor(options: ToolTrajectoryEvaluatorOptions);
+    evaluate(context: EvaluationContext): EvaluationScore;
+    /**
+     * Extract tool calls from output messages.
+     */
+    private extractToolCallsFromMessages;
+    /**
+     * Build a summary from extracted tool calls.
+     */
+    private buildSummary;
+    private evaluateAnyOrder;
+    private evaluateInOrder;
+    private evaluateExact;
+}
 type MaybePromise<T> = T | Promise<T>;
 interface EvaluationCache {
     get(key: string): MaybePromise<ProviderResponse | undefined>;
@@ -1182,6 +1249,10 @@ interface RunEvalCaseOptions {
     readonly useCache?: boolean;
     readonly signal?: AbortSignal;
     readonly judgeProvider?: Provider;
+    /** Resolver for target override in code judges */
+    readonly targetResolver?: (name: string) => Provider | undefined;
+    /** List of available target names for code judges */
+    readonly availableTargets?: readonly string[];
 }
 interface ProgressEvent {
     readonly workerId: number;
@@ -1230,4 +1301,4 @@ type AgentKernel = {
 };
 declare function createAgentKernel(): AgentKernel;
-export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type ClaudeCodeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EXPLORATION_TOOLS, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, type ExecutionMetrics, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ProviderTokenUsage, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceSummary, type UserTestMessage, type VSCodeResolvedConfig, avgToolDurationMs, buildDirectoryChain, buildPromptInputs, buildSearchRoots, computeTraceSummary, consumeClaudeCodeLogEntries, consumeCodexLogEntries, consumePiLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, explorationRatio, extractCodeBlocks, fileExists, findGitRoot, generateRubrics, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, mergeExecutionMetrics, normalizeLineEndings, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToClaudeCodeLogEntries, subscribeToCodexLogEntries, subscribeToPiLogEntries, tokensPerTool };
+export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type ClaudeCodeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EXPLORATION_TOOLS, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, type ExecutionMetrics, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ProviderTokenUsage, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceSummary, type UserTestMessage, type VSCodeResolvedConfig, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildSearchRoots, clampScore, computeTraceSummary, consumeClaudeCodeLogEntries, consumeCodexLogEntries, consumePiLogEntries, createAgentKernel, createProvider, deepEqual, ensureVSCodeSubagents, executeScript, explorationRatio, extractJsonBlob, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, mergeExecutionMetrics, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, scoreToVerdict, subscribeToClaudeCodeLogEntries, subscribeToCodexLogEntries, subscribeToPiLogEntries, tokensPerTool };