npm - @agentv/core - Versions diffs - 1.5.0 → 2.0.2 - Mend

@agentv/core 1.5.0 → 2.0.2

This diff shows the changes between two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (14)

package/README.md +77 -77
package/dist/{chunk-E2VSU4WZ.js → chunk-KDEP4I7G.js} +116 -1
package/dist/chunk-KDEP4I7G.js.map +1 -0
package/dist/evaluation/validation/index.cjs +2 -0
package/dist/evaluation/validation/index.cjs.map +1 -1
package/dist/evaluation/validation/index.js +1 -1
package/dist/index.cjs +2715 -675
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +207 -10
package/dist/index.d.ts +207 -10
package/dist/index.js +2491 -570
package/dist/index.js.map +1 -1
package/package.json +8 -2
package/dist/chunk-E2VSU4WZ.js.map +0 -1

package/dist/index.d.cts CHANGED Viewed

@@ -201,17 +201,19 @@ declare function isJsonValue(value: unknown): value is JsonValue;
  * - Either content (string or array of objects) OR tool_calls (for assistant messages)
  */
 declare function isTestMessage(value: unknown): value is TestMessage;
-declare const EVALUATOR_KIND_VALUES: readonly ["code_judge", "llm_judge", "rubric", "composite", "tool_trajectory"];
+declare const EVALUATOR_KIND_VALUES: readonly ["code_judge", "llm_judge", "rubric", "composite", "tool_trajectory", "field_accuracy", "latency", "cost", "token_usage"];
 type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
 declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
 type CodeEvaluatorConfig = {
     readonly name: string;
     readonly type: 'code';
-    readonly script: string;
+    readonly script: readonly string[];
     readonly resolvedScriptPath?: string;
     readonly cwd?: string;
     readonly resolvedCwd?: string;
     readonly weight?: number;
+    /** Pass-through configuration for the code_judge script (any unrecognized YAML properties) */
+    readonly config?: JsonObject;
 };
 type LlmJudgeEvaluatorConfig = {
     readonly name: string;
@@ -247,7 +249,85 @@ type CompositeEvaluatorConfig = {
     readonly aggregator: CompositeAggregatorConfig;
     readonly weight?: number;
 };
-type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig;
+/**
+ * Match type for field accuracy evaluation.
+ * Note: For fuzzy string matching (Levenshtein, Jaro-Winkler, etc.), use a code_judge evaluator.
+ * See examples/features/document-extraction/fuzzy_match.ts for an example.
+ */
+type FieldMatchType = 'exact' | 'numeric_tolerance' | 'date';
+/**
+ * Aggregation strategy for combining field scores.
+ */
+type FieldAggregationType = 'weighted_average' | 'all_or_nothing';
+/**
+ * Configuration for a single field to evaluate.
+ */
+type FieldConfig = {
+    /** Dot-notation path to the field (e.g., "invoice.vendor.name" or "items[0].amount") */
+    readonly path: string;
+    /** Match strategy for this field */
+    readonly match: FieldMatchType;
+    /** Whether this field is required (missing required fields count as failures) */
+    readonly required?: boolean;
+    /** Weight for aggregation (default: 1.0) */
+    readonly weight?: number;
+    /** Tolerance for numeric matching (absolute value unless relative is true) */
+    readonly tolerance?: number;
+    /** Whether tolerance is relative (percentage) vs absolute */
+    readonly relative?: boolean;
+    /** Date formats to try when parsing (default: common formats) */
+    readonly formats?: readonly string[];
+};
+/**
+ * Configuration for the field_accuracy evaluator.
+ */
+type FieldAccuracyEvaluatorConfig = {
+    readonly name: string;
+    readonly type: 'field_accuracy';
+    /** Fields to compare between candidate and expected */
+    readonly fields: readonly FieldConfig[];
+    /** Strategy for combining field scores (default: weighted_average) */
+    readonly aggregation?: FieldAggregationType;
+    readonly weight?: number;
+};
+/**
+ * Configuration for the latency evaluator.
+ * Checks execution duration against a threshold.
+ */
+type LatencyEvaluatorConfig = {
+    readonly name: string;
+    readonly type: 'latency';
+    /** Maximum allowed duration in milliseconds */
+    readonly threshold: number;
+    readonly weight?: number;
+};
+/**
+ * Configuration for the cost evaluator.
+ * Checks execution cost against a budget.
+ */
+type CostEvaluatorConfig = {
+    readonly name: string;
+    readonly type: 'cost';
+    /** Maximum allowed cost in USD */
+    readonly budget: number;
+    readonly weight?: number;
+};
+/**
+ * Configuration for the token_usage evaluator.
+ * Checks provider-reported token usage against configured limits.
+ */
+type TokenUsageEvaluatorConfig = {
+    readonly name: string;
+    readonly type: 'token_usage';
+    /** Maximum allowed total tokens (input + output + cached, when present) */
+    readonly max_total?: number;
+    /** Maximum allowed input tokens (prompt) */
+    readonly max_input?: number;
+    /** Maximum allowed output tokens (completion) */
+    readonly max_output?: number;
+    readonly weight?: number;
+};
+type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig | FieldAccuracyEvaluatorConfig | LatencyEvaluatorConfig | CostEvaluatorConfig | TokenUsageEvaluatorConfig;
 /**
  * Eval case definition sourced from AgentV specs.
  */
@@ -282,7 +362,6 @@ interface EvaluationResult {
     readonly candidateAnswer: string;
     readonly target: string;
     readonly reasoning?: string;
-    readonly rawAspects?: readonly string[];
     readonly agentProviderRequest?: JsonObject;
     readonly lmProviderRequest?: JsonObject;
     readonly evaluatorProviderRequest?: JsonObject;
@@ -317,7 +396,7 @@ interface ChatMessage {
     readonly name?: string;
 }
 type ChatPrompt = readonly ChatMessage[];
-type ProviderKind = 'azure' | 'anthropic' | 'gemini' | 'codex' | 'pi-coding-agent' | 'cli' | 'mock' | 'vscode' | 'vscode-insiders';
+type ProviderKind = 'azure' | 'anthropic' | 'gemini' | 'codex' | 'pi-coding-agent' | 'pi-agent-sdk' | 'claude-code' | 'cli' | 'mock' | 'vscode' | 'vscode-insiders';
 interface ProviderRequest {
     readonly question: string;
     readonly systemPrompt?: string;
@@ -726,6 +805,23 @@ interface PiCodingAgentResolvedConfig {
     readonly logFormat?: 'summary' | 'json';
     readonly systemPrompt?: string;
 }
+interface PiAgentSdkResolvedConfig {
+    readonly provider?: string;
+    readonly model?: string;
+    readonly apiKey?: string;
+    readonly timeoutMs?: number;
+    readonly systemPrompt?: string;
+}
+interface ClaudeCodeResolvedConfig {
+    readonly executable: string;
+    readonly model?: string;
+    readonly systemPrompt?: string;
+    readonly args?: readonly string[];
+    readonly cwd?: string;
+    readonly timeoutMs?: number;
+    readonly logDir?: string;
+    readonly logFormat?: 'summary' | 'json';
+}
 interface MockResolvedConfig {
     readonly response?: string;
     readonly delayMs?: number;
@@ -774,6 +870,20 @@ type ResolvedTarget = {
     readonly workers?: number;
     readonly providerBatching?: boolean;
     readonly config: PiCodingAgentResolvedConfig;
+} | {
+    readonly kind: 'pi-agent-sdk';
+    readonly name: string;
+    readonly judgeTarget?: string;
+    readonly workers?: number;
+    readonly providerBatching?: boolean;
+    readonly config: PiAgentSdkResolvedConfig;
+} | {
+    readonly kind: 'claude-code';
+    readonly name: string;
+    readonly judgeTarget?: string;
+    readonly workers?: number;
+    readonly providerBatching?: boolean;
+    readonly config: ClaudeCodeResolvedConfig;
 } | {
     readonly kind: 'mock';
     readonly name: string;
@@ -839,6 +949,16 @@ type PiLogListener = (entry: PiLogEntry) => void;
 declare function consumePiLogEntries(): PiLogEntry[];
 declare function subscribeToPiLogEntries(listener: PiLogListener): () => void;
+type ClaudeCodeLogEntry = {
+    readonly filePath: string;
+    readonly evalCaseId?: string;
+    readonly targetName: string;
+    readonly attempt?: number;
+};
+type ClaudeCodeLogListener = (entry: ClaudeCodeLogEntry) => void;
+declare function consumeClaudeCodeLogEntries(): ClaudeCodeLogEntry[];
+declare function subscribeToClaudeCodeLogEntries(listener: ClaudeCodeLogListener): () => void;
 declare function createProvider(target: ResolvedTarget): Provider;
 declare function resolveAndCreateProvider(definition: TargetDefinition, env?: EnvLookup): Provider;
@@ -870,7 +990,6 @@ interface EvaluationScore {
     readonly misses: readonly string[];
     readonly expectedAspectCount: number;
     readonly reasoning?: string;
-    readonly rawAspects?: readonly string[];
     readonly evaluatorRawRequest?: JsonObject;
     readonly evaluatorResults?: readonly ChildEvaluatorResult[];
 }
@@ -911,15 +1030,18 @@ declare class LlmJudgeEvaluator implements Evaluator {
     private runWithRetry;
 }
 interface CodeEvaluatorOptions {
-    readonly script: string;
+    readonly script: readonly string[];
     readonly cwd?: string;
     readonly agentTimeoutMs?: number;
+    /** Pass-through configuration from YAML (any unrecognized properties) */
+    readonly config?: Record<string, unknown>;
 }
 declare class CodeEvaluator implements Evaluator {
     readonly kind = "code";
     private readonly script;
     private readonly cwd?;
     private readonly agentTimeoutMs?;
+    private readonly config?;
     constructor(options: CodeEvaluatorOptions);
     evaluate(context: EvaluationContext): Promise<EvaluationScore>;
 }
@@ -943,6 +1065,44 @@ declare class ToolTrajectoryEvaluator implements Evaluator {
     private evaluateInOrder;
     private evaluateExact;
 }
+interface FieldAccuracyEvaluatorOptions {
+    readonly config: FieldAccuracyEvaluatorConfig;
+}
+/**
+ * FieldAccuracyEvaluator compares extracted structured data against expected values
+ * with configurable matching strategies (exact, fuzzy, numeric_tolerance, date).
+ */
+declare class FieldAccuracyEvaluator implements Evaluator {
+    readonly kind = "field_accuracy";
+    private readonly config;
+    constructor(options: FieldAccuracyEvaluatorOptions);
+    evaluate(context: EvaluationContext): EvaluationScore;
+    /**
+     * Extract expected data from expected_messages array.
+     * Looks for the last assistant message with content.
+     */
+    private extractExpectedData;
+    /**
+     * Evaluate a single field against the expected value.
+     */
+    private evaluateField;
+    /**
+     * Exact equality comparison.
+     */
+    private compareExact;
+    /**
+     * Numeric comparison with absolute or relative tolerance.
+     */
+    private compareNumericTolerance;
+    /**
+     * Date comparison with format normalization.
+     */
+    private compareDate;
+    /**
+     * Aggregate field results using configured strategy.
+     */
+    private aggregateResults;
+}
 interface EvaluatorFactory {
     create(config: EvaluatorConfig, context: EvaluationContext): Evaluator;
 }
@@ -963,6 +1123,45 @@ declare class CompositeEvaluator implements Evaluator {
     private runCodeAggregator;
     private runLlmAggregator;
 }
+interface LatencyEvaluatorOptions {
+    readonly config: LatencyEvaluatorConfig;
+}
+/**
+ * Evaluator that checks execution duration against a threshold.
+ * Uses traceSummary.durationMs from the evaluation context.
+ */
+declare class LatencyEvaluator implements Evaluator {
+    readonly kind = "latency";
+    private readonly config;
+    constructor(options: LatencyEvaluatorOptions);
+    evaluate(context: EvaluationContext): EvaluationScore;
+}
+interface CostEvaluatorOptions {
+    readonly config: CostEvaluatorConfig;
+}
+/**
+ * Evaluator that checks execution cost against a budget.
+ * Uses traceSummary.costUsd from the evaluation context.
+ */
+declare class CostEvaluator implements Evaluator {
+    readonly kind = "cost";
+    private readonly config;
+    constructor(options: CostEvaluatorOptions);
+    evaluate(context: EvaluationContext): EvaluationScore;
+}
+interface TokenUsageEvaluatorOptions {
+    readonly config: TokenUsageEvaluatorConfig;
+}
+/**
+ * Evaluator that checks provider-reported token usage against configured limits.
+ * Uses traceSummary.tokenUsage from the evaluation context.
+ */
+declare class TokenUsageEvaluator implements Evaluator {
+    readonly kind = "token_usage";
+    private readonly config;
+    constructor(options: TokenUsageEvaluatorOptions);
+    evaluate(context: EvaluationContext): EvaluationScore;
+}
 type MaybePromise<T> = T | Promise<T>;
 interface EvaluationCache {
@@ -979,7 +1178,6 @@ interface RunEvalCaseOptions {
     readonly now?: () => Date;
     readonly maxRetries?: number;
     readonly agentTimeoutMs?: number;
-    readonly promptDumpDir?: string;
     readonly cache?: EvaluationCache;
     readonly useCache?: boolean;
     readonly signal?: AbortSignal;
@@ -1003,7 +1201,6 @@ interface RunEvaluationOptions {
     readonly evaluators?: Partial<Record<string, Evaluator>>;
     readonly maxRetries?: number;
     readonly agentTimeoutMs?: number;
-    readonly promptDumpDir?: string;
     readonly cache?: EvaluationCache;
     readonly useCache?: boolean;
     readonly now?: () => Date;
@@ -1033,4 +1230,4 @@ type AgentKernel = {
 };
 declare function createAgentKernel(): AgentKernel;
-export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, DEFAULT_EXPLORATION_TOOLS, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, type ExecutionMetrics, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ProviderTokenUsage, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceSummary, type UserTestMessage, type VSCodeResolvedConfig, avgToolDurationMs, buildDirectoryChain, buildPromptInputs, buildSearchRoots, computeTraceSummary, consumeCodexLogEntries, consumePiLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, explorationRatio, extractCodeBlocks, fileExists, findGitRoot, generateRubrics, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, mergeExecutionMetrics, 
normalizeLineEndings, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToCodexLogEntries, subscribeToPiLogEntries, tokensPerTool };
+export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type ClaudeCodeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EXPLORATION_TOOLS, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, type ExecutionMetrics, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ProviderTokenUsage, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceSummary, type UserTestMessage, type 
VSCodeResolvedConfig, avgToolDurationMs, buildDirectoryChain, buildPromptInputs, buildSearchRoots, computeTraceSummary, consumeClaudeCodeLogEntries, consumeCodexLogEntries, consumePiLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, explorationRatio, extractCodeBlocks, fileExists, findGitRoot, generateRubrics, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, mergeExecutionMetrics, normalizeLineEndings, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToClaudeCodeLogEntries, subscribeToCodexLogEntries, subscribeToPiLogEntries, tokensPerTool };

package/dist/index.d.ts CHANGED Viewed

@@ -201,17 +201,19 @@ declare function isJsonValue(value: unknown): value is JsonValue;
  * - Either content (string or array of objects) OR tool_calls (for assistant messages)
  */
 declare function isTestMessage(value: unknown): value is TestMessage;
-declare const EVALUATOR_KIND_VALUES: readonly ["code_judge", "llm_judge", "rubric", "composite", "tool_trajectory"];
+declare const EVALUATOR_KIND_VALUES: readonly ["code_judge", "llm_judge", "rubric", "composite", "tool_trajectory", "field_accuracy", "latency", "cost", "token_usage"];
 type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
 declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
 type CodeEvaluatorConfig = {
     readonly name: string;
     readonly type: 'code';
-    readonly script: string;
+    readonly script: readonly string[];
     readonly resolvedScriptPath?: string;
     readonly cwd?: string;
     readonly resolvedCwd?: string;
     readonly weight?: number;
+    /** Pass-through configuration for the code_judge script (any unrecognized YAML properties) */
+    readonly config?: JsonObject;
 };
 type LlmJudgeEvaluatorConfig = {
     readonly name: string;
@@ -247,7 +249,85 @@ type CompositeEvaluatorConfig = {
     readonly aggregator: CompositeAggregatorConfig;
     readonly weight?: number;
 };
-type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig;
+/**
+ * Match type for field accuracy evaluation.
+ * Note: For fuzzy string matching (Levenshtein, Jaro-Winkler, etc.), use a code_judge evaluator.
+ * See examples/features/document-extraction/fuzzy_match.ts for an example.
+ */
+type FieldMatchType = 'exact' | 'numeric_tolerance' | 'date';
+/**
+ * Aggregation strategy for combining field scores.
+ */
+type FieldAggregationType = 'weighted_average' | 'all_or_nothing';
+/**
+ * Configuration for a single field to evaluate.
+ */
+type FieldConfig = {
+    /** Dot-notation path to the field (e.g., "invoice.vendor.name" or "items[0].amount") */
+    readonly path: string;
+    /** Match strategy for this field */
+    readonly match: FieldMatchType;
+    /** Whether this field is required (missing required fields count as failures) */
+    readonly required?: boolean;
+    /** Weight for aggregation (default: 1.0) */
+    readonly weight?: number;
+    /** Tolerance for numeric matching (absolute value unless relative is true) */
+    readonly tolerance?: number;
+    /** Whether tolerance is relative (percentage) vs absolute */
+    readonly relative?: boolean;
+    /** Date formats to try when parsing (default: common formats) */
+    readonly formats?: readonly string[];
+};
+/**
+ * Configuration for the field_accuracy evaluator.
+ */
+type FieldAccuracyEvaluatorConfig = {
+    readonly name: string;
+    readonly type: 'field_accuracy';
+    /** Fields to compare between candidate and expected */
+    readonly fields: readonly FieldConfig[];
+    /** Strategy for combining field scores (default: weighted_average) */
+    readonly aggregation?: FieldAggregationType;
+    readonly weight?: number;
+};
+/**
+ * Configuration for the latency evaluator.
+ * Checks execution duration against a threshold.
+ */
+type LatencyEvaluatorConfig = {
+    readonly name: string;
+    readonly type: 'latency';
+    /** Maximum allowed duration in milliseconds */
+    readonly threshold: number;
+    readonly weight?: number;
+};
+/**
+ * Configuration for the cost evaluator.
+ * Checks execution cost against a budget.
+ */
+type CostEvaluatorConfig = {
+    readonly name: string;
+    readonly type: 'cost';
+    /** Maximum allowed cost in USD */
+    readonly budget: number;
+    readonly weight?: number;
+};
+/**
+ * Configuration for the token_usage evaluator.
+ * Checks provider-reported token usage against configured limits.
+ */
+type TokenUsageEvaluatorConfig = {
+    readonly name: string;
+    readonly type: 'token_usage';
+    /** Maximum allowed total tokens (input + output + cached, when present) */
+    readonly max_total?: number;
+    /** Maximum allowed input tokens (prompt) */
+    readonly max_input?: number;
+    /** Maximum allowed output tokens (completion) */
+    readonly max_output?: number;
+    readonly weight?: number;
+};
+type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig | FieldAccuracyEvaluatorConfig | LatencyEvaluatorConfig | CostEvaluatorConfig | TokenUsageEvaluatorConfig;
 /**
  * Eval case definition sourced from AgentV specs.
  */
@@ -282,7 +362,6 @@ interface EvaluationResult {
     readonly candidateAnswer: string;
     readonly target: string;
     readonly reasoning?: string;
-    readonly rawAspects?: readonly string[];
     readonly agentProviderRequest?: JsonObject;
     readonly lmProviderRequest?: JsonObject;
     readonly evaluatorProviderRequest?: JsonObject;
@@ -317,7 +396,7 @@ interface ChatMessage {
     readonly name?: string;
 }
 type ChatPrompt = readonly ChatMessage[];
-type ProviderKind = 'azure' | 'anthropic' | 'gemini' | 'codex' | 'pi-coding-agent' | 'cli' | 'mock' | 'vscode' | 'vscode-insiders';
+type ProviderKind = 'azure' | 'anthropic' | 'gemini' | 'codex' | 'pi-coding-agent' | 'pi-agent-sdk' | 'claude-code' | 'cli' | 'mock' | 'vscode' | 'vscode-insiders';
 interface ProviderRequest {
     readonly question: string;
     readonly systemPrompt?: string;
@@ -726,6 +805,23 @@ interface PiCodingAgentResolvedConfig {
     readonly logFormat?: 'summary' | 'json';
     readonly systemPrompt?: string;
 }
+interface PiAgentSdkResolvedConfig {
+    readonly provider?: string;
+    readonly model?: string;
+    readonly apiKey?: string;
+    readonly timeoutMs?: number;
+    readonly systemPrompt?: string;
+}
+interface ClaudeCodeResolvedConfig {
+    readonly executable: string;
+    readonly model?: string;
+    readonly systemPrompt?: string;
+    readonly args?: readonly string[];
+    readonly cwd?: string;
+    readonly timeoutMs?: number;
+    readonly logDir?: string;
+    readonly logFormat?: 'summary' | 'json';
+}
 interface MockResolvedConfig {
     readonly response?: string;
     readonly delayMs?: number;
@@ -774,6 +870,20 @@ type ResolvedTarget = {
     readonly workers?: number;
     readonly providerBatching?: boolean;
     readonly config: PiCodingAgentResolvedConfig;
+} | {
+    readonly kind: 'pi-agent-sdk';
+    readonly name: string;
+    readonly judgeTarget?: string;
+    readonly workers?: number;
+    readonly providerBatching?: boolean;
+    readonly config: PiAgentSdkResolvedConfig;
+} | {
+    readonly kind: 'claude-code';
+    readonly name: string;
+    readonly judgeTarget?: string;
+    readonly workers?: number;
+    readonly providerBatching?: boolean;
+    readonly config: ClaudeCodeResolvedConfig;
 } | {
     readonly kind: 'mock';
     readonly name: string;
@@ -839,6 +949,16 @@ type PiLogListener = (entry: PiLogEntry) => void;
 declare function consumePiLogEntries(): PiLogEntry[];
 declare function subscribeToPiLogEntries(listener: PiLogListener): () => void;
+type ClaudeCodeLogEntry = {
+    readonly filePath: string;
+    readonly evalCaseId?: string;
+    readonly targetName: string;
+    readonly attempt?: number;
+};
+type ClaudeCodeLogListener = (entry: ClaudeCodeLogEntry) => void;
+declare function consumeClaudeCodeLogEntries(): ClaudeCodeLogEntry[];
+declare function subscribeToClaudeCodeLogEntries(listener: ClaudeCodeLogListener): () => void;
 declare function createProvider(target: ResolvedTarget): Provider;
 declare function resolveAndCreateProvider(definition: TargetDefinition, env?: EnvLookup): Provider;
@@ -870,7 +990,6 @@ interface EvaluationScore {
     readonly misses: readonly string[];
     readonly expectedAspectCount: number;
     readonly reasoning?: string;
-    readonly rawAspects?: readonly string[];
     readonly evaluatorRawRequest?: JsonObject;
     readonly evaluatorResults?: readonly ChildEvaluatorResult[];
 }
@@ -911,15 +1030,18 @@ declare class LlmJudgeEvaluator implements Evaluator {
     private runWithRetry;
 }
 interface CodeEvaluatorOptions {
-    readonly script: string;
+    readonly script: readonly string[];
     readonly cwd?: string;
     readonly agentTimeoutMs?: number;
+    /** Pass-through configuration from YAML (any unrecognized properties) */
+    readonly config?: Record<string, unknown>;
 }
 declare class CodeEvaluator implements Evaluator {
     readonly kind = "code";
     private readonly script;
     private readonly cwd?;
     private readonly agentTimeoutMs?;
+    private readonly config?;
     constructor(options: CodeEvaluatorOptions);
     evaluate(context: EvaluationContext): Promise<EvaluationScore>;
 }
@@ -943,6 +1065,44 @@ declare class ToolTrajectoryEvaluator implements Evaluator {
     private evaluateInOrder;
     private evaluateExact;
 }
+interface FieldAccuracyEvaluatorOptions {
+    readonly config: FieldAccuracyEvaluatorConfig;
+}
+/**
+ * FieldAccuracyEvaluator compares extracted structured data against expected values
+ * with configurable matching strategies (exact, fuzzy, numeric_tolerance, date).
+ */
+declare class FieldAccuracyEvaluator implements Evaluator {
+    readonly kind = "field_accuracy";
+    private readonly config;
+    constructor(options: FieldAccuracyEvaluatorOptions);
+    evaluate(context: EvaluationContext): EvaluationScore;
+    /**
+     * Extract expected data from expected_messages array.
+     * Looks for the last assistant message with content.
+     */
+    private extractExpectedData;
+    /**
+     * Evaluate a single field against the expected value.
+     */
+    private evaluateField;
+    /**
+     * Exact equality comparison.
+     */
+    private compareExact;
+    /**
+     * Numeric comparison with absolute or relative tolerance.
+     */
+    private compareNumericTolerance;
+    /**
+     * Date comparison with format normalization.
+     */
+    private compareDate;
+    /**
+     * Aggregate field results using configured strategy.
+     */
+    private aggregateResults;
+}
 interface EvaluatorFactory {
     create(config: EvaluatorConfig, context: EvaluationContext): Evaluator;
 }
@@ -963,6 +1123,45 @@ declare class CompositeEvaluator implements Evaluator {
     private runCodeAggregator;
     private runLlmAggregator;
 }
+interface LatencyEvaluatorOptions {
+    readonly config: LatencyEvaluatorConfig;
+}
+/**
+ * Evaluator that checks execution duration against a threshold.
+ * Uses traceSummary.durationMs from the evaluation context.
+ */
+declare class LatencyEvaluator implements Evaluator {
+    readonly kind = "latency";
+    private readonly config;
+    constructor(options: LatencyEvaluatorOptions);
+    evaluate(context: EvaluationContext): EvaluationScore;
+}
+interface CostEvaluatorOptions {
+    readonly config: CostEvaluatorConfig;
+}
+/**
+ * Evaluator that checks execution cost against a budget.
+ * Uses traceSummary.costUsd from the evaluation context.
+ */
+declare class CostEvaluator implements Evaluator {
+    readonly kind = "cost";
+    private readonly config;
+    constructor(options: CostEvaluatorOptions);
+    evaluate(context: EvaluationContext): EvaluationScore;
+}
+interface TokenUsageEvaluatorOptions {
+    readonly config: TokenUsageEvaluatorConfig;
+}
+/**
+ * Evaluator that checks provider-reported token usage against configured limits.
+ * Uses traceSummary.tokenUsage from the evaluation context.
+ */
+declare class TokenUsageEvaluator implements Evaluator {
+    readonly kind = "token_usage";
+    private readonly config;
+    constructor(options: TokenUsageEvaluatorOptions);
+    evaluate(context: EvaluationContext): EvaluationScore;
+}
 type MaybePromise<T> = T | Promise<T>;
 interface EvaluationCache {
@@ -979,7 +1178,6 @@ interface RunEvalCaseOptions {
     readonly now?: () => Date;
     readonly maxRetries?: number;
     readonly agentTimeoutMs?: number;
-    readonly promptDumpDir?: string;
     readonly cache?: EvaluationCache;
     readonly useCache?: boolean;
     readonly signal?: AbortSignal;
@@ -1003,7 +1201,6 @@ interface RunEvaluationOptions {
     readonly evaluators?: Partial<Record<string, Evaluator>>;
     readonly maxRetries?: number;
     readonly agentTimeoutMs?: number;
-    readonly promptDumpDir?: string;
     readonly cache?: EvaluationCache;
     readonly useCache?: boolean;
     readonly now?: () => Date;
@@ -1033,4 +1230,4 @@ type AgentKernel = {
 };
 declare function createAgentKernel(): AgentKernel;
-export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, DEFAULT_EXPLORATION_TOOLS, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, type ExecutionMetrics, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ProviderTokenUsage, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceSummary, type UserTestMessage, type VSCodeResolvedConfig, avgToolDurationMs, buildDirectoryChain, buildPromptInputs, buildSearchRoots, computeTraceSummary, consumeCodexLogEntries, consumePiLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, explorationRatio, extractCodeBlocks, fileExists, findGitRoot, generateRubrics, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, mergeExecutionMetrics, 
normalizeLineEndings, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToCodexLogEntries, subscribeToPiLogEntries, tokensPerTool };
+export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type ClaudeCodeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EXPLORATION_TOOLS, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, type ExecutionMetrics, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ProviderTokenUsage, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceSummary, type UserTestMessage, type 
VSCodeResolvedConfig, avgToolDurationMs, buildDirectoryChain, buildPromptInputs, buildSearchRoots, computeTraceSummary, consumeClaudeCodeLogEntries, consumeCodexLogEntries, consumePiLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, explorationRatio, extractCodeBlocks, fileExists, findGitRoot, generateRubrics, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, mergeExecutionMetrics, normalizeLineEndings, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToClaudeCodeLogEntries, subscribeToCodexLogEntries, subscribeToPiLogEntries, tokensPerTool };