@agentv/core 2.2.0 → 2.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -60,6 +60,8 @@ interface ToolTrajectoryExpectedItem {
     readonly tool: string;
     /** Optional argument matching: 'any' skips validation, object performs partial deep equality */
     readonly args?: 'any' | Record<string, unknown>;
+    /** Optional maximum duration in milliseconds for latency assertions */
+    readonly maxDurationMs?: number;
 }
 /**
  * Simplified input type for computeTraceSummary.
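The new maxDurationMs field lets a trajectory expectation assert on latency as well as tool name and arguments. A minimal sketch in TypeScript, assuming the tool-trajectory evaluator compares this bound against the matched call's recorded duration (tool names and values below are illustrative, not from the package):

import type { ToolTrajectoryExpectedItem } from '@agentv/core';

// Illustrative expectations; 'read_file' and 'search_web' are hypothetical tool names.
const expectedTrajectory: readonly ToolTrajectoryExpectedItem[] = [
    // 'any' skips argument validation entirely.
    { tool: 'read_file', args: 'any' },
    // Partial deep equality on args, plus the new latency bound:
    // the matched call is expected to finish within 2000 ms.
    { tool: 'search_web', args: { query: 'release notes' }, maxDurationMs: 2000 },
];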
@@ -226,19 +228,71 @@ type CodeEvaluatorConfig = {
     /** When present, enables target access for the script via local proxy */
     readonly target?: TargetAccessConfig;
 };
+/**
+ * Executable prompt template configuration.
+ * Matches code_judge pattern for consistency.
+ */
+type PromptScriptConfig = {
+    /** Command array to execute (e.g., ["bun", "run", "template.ts"]) */
+    readonly script: readonly string[];
+    /** Pass-through configuration for the prompt template */
+    readonly config?: Record<string, unknown>;
+};
 type LlmJudgeEvaluatorConfig = {
     readonly name: string;
     readonly type: 'llm_judge';
-    readonly prompt?: string;
+    /** Text prompt (inline or file path) or executable script config */
+    readonly prompt?: string | PromptScriptConfig;
     readonly promptPath?: string;
+    /** Resolved absolute path for prompt file (used for text template prompts) */
+    readonly resolvedPromptPath?: string;
+    /** Resolved script array for executable prompts (matches code_judge pattern) */
+    readonly resolvedPromptScript?: readonly string[];
     readonly rubrics?: readonly RubricItem[];
     readonly weight?: number;
+    /** Pass-through configuration for custom evaluator prompts (legacy, prefer prompt.config) */
+    readonly config?: Record<string, unknown>;
 };
+/**
+ * Score range definition for analytic rubric scoring.
+ * Each range maps an integer score band (0-10) to an expected outcome description.
+ */
+type ScoreRange = {
+    /** Inclusive integer range [min, max] within 0-10 */
+    readonly score_range: readonly [number, number];
+    /** Description of what this score range represents */
+    readonly expected_outcome: string;
+};
+/**
+ * Rubric item for LLM judge evaluation.
+ * Supports two modes:
+ * - Checklist mode: boolean satisfied/not-satisfied with `expected_outcome`
+ * - Score-range mode: 0-10 integer scoring with `score_ranges`
+ */
 type RubricItem = {
     readonly id: string;
-    readonly description: string;
+    /**
+     * For checklist rubrics: the expected outcome text (required).
+     * For score-range rubrics: optional overall criterion description.
+     */
+    readonly expected_outcome?: string;
     readonly weight: number;
-    readonly required: boolean;
+    /**
+     * Legacy boolean gating (deprecated, treated as required_min_score: 10).
+     * Use required_min_score instead for finer control.
+     */
+    readonly required?: boolean;
+    /**
+     * Minimum score (0-10) required to pass this criterion.
+     * If the criterion score is below this threshold, the overall verdict is 'fail'.
+     */
+    readonly required_min_score?: number;
+    /**
+     * Score range definitions for analytic rubric scoring.
+     * When present, the judge outputs an integer 0-10 score per criterion.
+     * Ranges must be non-overlapping and cover 0-10 inclusive.
+     */
+    readonly score_ranges?: readonly ScoreRange[];
 };
 type CompositeAggregatorConfig = {
     readonly type: 'weighted_average';
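Together with the PromptScriptConfig type above, LLM-judge configs can now point at an executable prompt template, and rubric items can use analytic score ranges instead of a boolean checklist. A hedged configuration sketch (all names, weights, and range descriptions are illustrative, not taken from the package):

import type { LlmJudgeEvaluatorConfig, RubricItem } from '@agentv/core';

const rubrics: readonly RubricItem[] = [
    // Checklist mode: pass/fail against an expected outcome; required_min_score: 10
    // plays the role of the deprecated required: true flag.
    { id: 'cites-sources', expected_outcome: 'The answer cites at least one source.', weight: 1, required_min_score: 10 },
    // Score-range mode: the judge assigns an integer 0-10 score per criterion;
    // ranges must be non-overlapping and cover 0-10 inclusive.
    {
        id: 'answer-quality',
        weight: 2,
        required_min_score: 6,
        score_ranges: [
            { score_range: [0, 3], expected_outcome: 'Incorrect or largely irrelevant answer.' },
            { score_range: [4, 6], expected_outcome: 'Partially correct with notable gaps.' },
            { score_range: [7, 10], expected_outcome: 'Accurate, complete, well-supported answer.' },
        ],
    },
];

const judge: LlmJudgeEvaluatorConfig = {
    name: 'quality-judge',
    type: 'llm_judge',
    // prompt now also accepts an executable script config (PromptScriptConfig);
    // the command and config payload here are hypothetical.
    prompt: { script: ['bun', 'run', 'template.ts'], config: { tone: 'strict' } },
    rubrics,
};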
@@ -438,6 +492,8 @@ interface ToolCall {
     readonly id?: string;
     /** ISO 8601 timestamp */
     readonly timestamp?: string;
+    /** Duration of the tool call in milliseconds */
+    readonly durationMs?: number;
 }
 /**
  * An output message from agent execution.
@@ -454,6 +510,8 @@ interface OutputMessage {
     readonly toolCalls?: readonly ToolCall[];
     /** ISO 8601 timestamp */
     readonly timestamp?: string;
+    /** Duration of the message in milliseconds */
+    readonly durationMs?: number;
     /** Provider-specific metadata */
     readonly metadata?: Record<string, unknown>;
 }
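Both ToolCall and OutputMessage now expose an optional durationMs. A small sketch of how such timing data could be averaged; this mirrors the intent of the exported avgToolDurationMs helper but is not the library's implementation:

// Averages durationMs across calls that actually recorded timing data.
function averageDurationMs(calls: ReadonlyArray<{ durationMs?: number }>): number | undefined {
    const durations = calls
        .map((call) => call.durationMs)
        .filter((d): d is number => typeof d === 'number');
    if (durations.length === 0) return undefined; // no timing data available
    return durations.reduce((sum, d) => sum + d, 0) / durations.length;
}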
@@ -608,7 +666,8 @@ declare function detectFormat(filePath: string): 'yaml' | 'jsonl';
 
 type LoadOptions = {
     readonly verbose?: boolean;
-    readonly evalId?: string;
+    /** Filter eval cases by ID pattern (glob supported, e.g., "summary-*") */
+    readonly filter?: string;
 };
 /**
  * Read metadata from a test suite file (like target name).
@@ -1192,6 +1251,15 @@ declare class LlmJudgeEvaluator implements Evaluator {
     evaluate(context: EvaluationContext): Promise<EvaluationScore>;
     private evaluateFreeform;
     private evaluateWithRubrics;
+    /**
+     * Evaluate using score-range rubrics (analytic rubric scoring).
+     * Each criterion is scored 0-10 and normalized to 0-1.
+     */
+    private evaluateWithScoreRanges;
+    /**
+     * Build prompt for score-range rubric evaluation.
+     */
+    private buildScoreRangePrompt;
     private buildRubricPrompt;
     private runWithRetry;
 }
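The declarations above describe score-range evaluation only at the type level: each criterion gets an integer 0-10 score normalized to 0-1, and required_min_score gates the verdict. The sketch below spells out that arithmetic; the weight-based averaging is an assumption for illustration, not confirmed by this diff:

// Hypothetical aggregation: normalize 0-10 scores to 0-1, weight them, and
// fail the verdict if any criterion falls below its required_min_score.
function aggregateScoreRanges(
    criteria: ReadonlyArray<{ score: number; weight: number; requiredMinScore?: number }>,
): { score: number; verdict: 'pass' | 'fail' } {
    const totalWeight = criteria.reduce((sum, c) => sum + c.weight, 0);
    const weighted = criteria.reduce((sum, c) => sum + (c.score / 10) * c.weight, 0);
    const gated = criteria.some((c) => c.requiredMinScore !== undefined && c.score < c.requiredMinScore);
    return { score: totalWeight > 0 ? weighted / totalWeight : 0, verdict: gated ? 'fail' : 'pass' };
}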
@@ -1281,7 +1349,8 @@ interface RunEvaluationOptions {
     readonly cache?: EvaluationCache;
     readonly useCache?: boolean;
     readonly now?: () => Date;
-    readonly evalId?: string;
+    /** Filter eval cases by ID pattern (glob supported, e.g., "summary-*") */
+    readonly filter?: string;
     readonly verbose?: boolean;
     readonly maxConcurrency?: number;
     readonly evalCases?: readonly EvalCase[];
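The exact-match evalId option is replaced by a glob-capable filter in both LoadOptions and RunEvaluationOptions. The full signatures of loadEvalCases and runEvaluation are not part of this diff, so only the options shape is sketched, using Partial to sidestep fields that are not shown here:

import type { RunEvaluationOptions } from '@agentv/core';

// Previously: evalId: 'summary-basic' selected a single case by exact ID.
// Now a glob pattern can select one or many cases.
const options: Partial<RunEvaluationOptions> = {
    filter: 'summary-*',
    verbose: true,
    maxConcurrency: 4,
};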
@@ -1307,4 +1376,4 @@ type AgentKernel = {
 };
 declare function createAgentKernel(): AgentKernel;
 
- export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type ClaudeCodeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EXPLORATION_TOOLS, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, type ExecutionMetrics, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ProviderTokenUsage, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceSummary, type UserTestMessage, type VSCodeResolvedConfig, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildSearchRoots, clampScore, computeTraceSummary, consumeClaudeCodeLogEntries, consumeCodexLogEntries, consumePiLogEntries, createAgentKernel, createProvider, deepEqual, detectFormat, ensureVSCodeSubagents, executeScript, explorationRatio, extractJsonBlob, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, mergeExecutionMetrics, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, scoreToVerdict, subscribeToClaudeCodeLogEntries, subscribeToCodexLogEntries, subscribeToPiLogEntries, tokensPerTool };
+ export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type ClaudeCodeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EXPLORATION_TOOLS, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, type ExecutionMetrics, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ProviderTokenUsage, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceSummary, type UserTestMessage, type VSCodeResolvedConfig, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildSearchRoots, clampScore, computeTraceSummary, consumeClaudeCodeLogEntries, consumeCodexLogEntries, consumePiLogEntries, createAgentKernel, createProvider, deepEqual, detectFormat, ensureVSCodeSubagents, executeScript, explorationRatio, extractJsonBlob, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, mergeExecutionMetrics, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, scoreToVerdict, subscribeToClaudeCodeLogEntries, subscribeToCodexLogEntries, subscribeToPiLogEntries, tokensPerTool };
package/dist/index.d.ts CHANGED
@@ -60,6 +60,8 @@ interface ToolTrajectoryExpectedItem {
     readonly tool: string;
     /** Optional argument matching: 'any' skips validation, object performs partial deep equality */
     readonly args?: 'any' | Record<string, unknown>;
+    /** Optional maximum duration in milliseconds for latency assertions */
+    readonly maxDurationMs?: number;
 }
 /**
  * Simplified input type for computeTraceSummary.
@@ -226,19 +228,71 @@ type CodeEvaluatorConfig = {
     /** When present, enables target access for the script via local proxy */
     readonly target?: TargetAccessConfig;
 };
+/**
+ * Executable prompt template configuration.
+ * Matches code_judge pattern for consistency.
+ */
+type PromptScriptConfig = {
+    /** Command array to execute (e.g., ["bun", "run", "template.ts"]) */
+    readonly script: readonly string[];
+    /** Pass-through configuration for the prompt template */
+    readonly config?: Record<string, unknown>;
+};
 type LlmJudgeEvaluatorConfig = {
     readonly name: string;
     readonly type: 'llm_judge';
-    readonly prompt?: string;
+    /** Text prompt (inline or file path) or executable script config */
+    readonly prompt?: string | PromptScriptConfig;
     readonly promptPath?: string;
+    /** Resolved absolute path for prompt file (used for text template prompts) */
+    readonly resolvedPromptPath?: string;
+    /** Resolved script array for executable prompts (matches code_judge pattern) */
+    readonly resolvedPromptScript?: readonly string[];
     readonly rubrics?: readonly RubricItem[];
     readonly weight?: number;
+    /** Pass-through configuration for custom evaluator prompts (legacy, prefer prompt.config) */
+    readonly config?: Record<string, unknown>;
 };
+/**
+ * Score range definition for analytic rubric scoring.
+ * Each range maps an integer score band (0-10) to an expected outcome description.
+ */
+type ScoreRange = {
+    /** Inclusive integer range [min, max] within 0-10 */
+    readonly score_range: readonly [number, number];
+    /** Description of what this score range represents */
+    readonly expected_outcome: string;
+};
+/**
+ * Rubric item for LLM judge evaluation.
+ * Supports two modes:
+ * - Checklist mode: boolean satisfied/not-satisfied with `expected_outcome`
+ * - Score-range mode: 0-10 integer scoring with `score_ranges`
+ */
 type RubricItem = {
     readonly id: string;
-    readonly description: string;
+    /**
+     * For checklist rubrics: the expected outcome text (required).
+     * For score-range rubrics: optional overall criterion description.
+     */
+    readonly expected_outcome?: string;
     readonly weight: number;
-    readonly required: boolean;
+    /**
+     * Legacy boolean gating (deprecated, treated as required_min_score: 10).
+     * Use required_min_score instead for finer control.
+     */
+    readonly required?: boolean;
+    /**
+     * Minimum score (0-10) required to pass this criterion.
+     * If the criterion score is below this threshold, the overall verdict is 'fail'.
+     */
+    readonly required_min_score?: number;
+    /**
+     * Score range definitions for analytic rubric scoring.
+     * When present, the judge outputs an integer 0-10 score per criterion.
+     * Ranges must be non-overlapping and cover 0-10 inclusive.
+     */
+    readonly score_ranges?: readonly ScoreRange[];
 };
 type CompositeAggregatorConfig = {
     readonly type: 'weighted_average';
@@ -438,6 +492,8 @@ interface ToolCall {
     readonly id?: string;
     /** ISO 8601 timestamp */
     readonly timestamp?: string;
+    /** Duration of the tool call in milliseconds */
+    readonly durationMs?: number;
 }
 /**
  * An output message from agent execution.
@@ -454,6 +510,8 @@ interface OutputMessage {
     readonly toolCalls?: readonly ToolCall[];
     /** ISO 8601 timestamp */
     readonly timestamp?: string;
+    /** Duration of the message in milliseconds */
+    readonly durationMs?: number;
     /** Provider-specific metadata */
     readonly metadata?: Record<string, unknown>;
 }
@@ -608,7 +666,8 @@ declare function detectFormat(filePath: string): 'yaml' | 'jsonl';
 
 type LoadOptions = {
     readonly verbose?: boolean;
-    readonly evalId?: string;
+    /** Filter eval cases by ID pattern (glob supported, e.g., "summary-*") */
+    readonly filter?: string;
 };
 /**
  * Read metadata from a test suite file (like target name).
@@ -1192,6 +1251,15 @@ declare class LlmJudgeEvaluator implements Evaluator {
     evaluate(context: EvaluationContext): Promise<EvaluationScore>;
     private evaluateFreeform;
     private evaluateWithRubrics;
+    /**
+     * Evaluate using score-range rubrics (analytic rubric scoring).
+     * Each criterion is scored 0-10 and normalized to 0-1.
+     */
+    private evaluateWithScoreRanges;
+    /**
+     * Build prompt for score-range rubric evaluation.
+     */
+    private buildScoreRangePrompt;
     private buildRubricPrompt;
     private runWithRetry;
 }
@@ -1281,7 +1349,8 @@ interface RunEvaluationOptions {
     readonly cache?: EvaluationCache;
     readonly useCache?: boolean;
     readonly now?: () => Date;
-    readonly evalId?: string;
+    /** Filter eval cases by ID pattern (glob supported, e.g., "summary-*") */
+    readonly filter?: string;
     readonly verbose?: boolean;
     readonly maxConcurrency?: number;
     readonly evalCases?: readonly EvalCase[];
@@ -1307,4 +1376,4 @@ type AgentKernel = {
 };
 declare function createAgentKernel(): AgentKernel;
 
- export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type ClaudeCodeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EXPLORATION_TOOLS, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, type ExecutionMetrics, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ProviderTokenUsage, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceSummary, type UserTestMessage, type VSCodeResolvedConfig, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildSearchRoots, clampScore, computeTraceSummary, consumeClaudeCodeLogEntries, consumeCodexLogEntries, consumePiLogEntries, createAgentKernel, createProvider, deepEqual, detectFormat, ensureVSCodeSubagents, executeScript, explorationRatio, extractJsonBlob, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, mergeExecutionMetrics, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, scoreToVerdict, subscribeToClaudeCodeLogEntries, subscribeToCodexLogEntries, subscribeToPiLogEntries, tokensPerTool };
+ export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type ClaudeCodeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EXPLORATION_TOOLS, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, type ExecutionMetrics, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ProviderTokenUsage, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceSummary, type UserTestMessage, type VSCodeResolvedConfig, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildSearchRoots, clampScore, computeTraceSummary, consumeClaudeCodeLogEntries, consumeCodexLogEntries, consumePiLogEntries, createAgentKernel, createProvider, deepEqual, detectFormat, ensureVSCodeSubagents, executeScript, explorationRatio, extractJsonBlob, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, mergeExecutionMetrics, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, scoreToVerdict, subscribeToClaudeCodeLogEntries, subscribeToCodexLogEntries, subscribeToPiLogEntries, tokensPerTool };