@agentv/core 2.2.0 → 2.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -60,6 +60,8 @@ interface ToolTrajectoryExpectedItem {
     readonly tool: string;
     /** Optional argument matching: 'any' skips validation, object performs partial deep equality */
     readonly args?: 'any' | Record<string, unknown>;
+    /** Optional maximum duration in milliseconds for latency assertions */
+    readonly maxDurationMs?: number;
 }
 /**
  * Simplified input type for computeTraceSummary.
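The new maxDurationMs field lets a trajectory expectation assert on latency as well as tool name and arguments. A minimal sketch in TypeScript, assuming the tool-trajectory evaluator compares this bound against the matched call's recorded duration (tool names and values below are illustrative, not from the package):

import type { ToolTrajectoryExpectedItem } from '@agentv/core';

// Illustrative expectations; 'read_file' and 'search_web' are hypothetical tool names.
const expectedTrajectory: readonly ToolTrajectoryExpectedItem[] = [
    // 'any' skips argument validation entirely.
    { tool: 'read_file', args: 'any' },
    // Partial deep equality on args, plus the new latency bound:
    // the matched call is expected to finish within 2000 ms.
    { tool: 'search_web', args: { query: 'release notes' }, maxDurationMs: 2000 },
];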
@@ -226,19 +228,71 @@ type CodeEvaluatorConfig = {
     /** When present, enables target access for the script via local proxy */
     readonly target?: TargetAccessConfig;
 };
+/**
+ * Executable prompt template configuration.
+ * Matches code_judge pattern for consistency.
+ */
+type PromptScriptConfig = {
+    /** Command array to execute (e.g., ["bun", "run", "template.ts"]) */
+    readonly script: readonly string[];
+    /** Pass-through configuration for the prompt template */
+    readonly config?: Record<string, unknown>;
+};
 type LlmJudgeEvaluatorConfig = {
     readonly name: string;
     readonly type: 'llm_judge';
-    readonly prompt?: string;
+    /** Text prompt (inline or file path) or executable script config */
+    readonly prompt?: string | PromptScriptConfig;
     readonly promptPath?: string;
+    /** Resolved absolute path for prompt file (used for text template prompts) */
+    readonly resolvedPromptPath?: string;
+    /** Resolved script array for executable prompts (matches code_judge pattern) */
+    readonly resolvedPromptScript?: readonly string[];
     readonly rubrics?: readonly RubricItem[];
     readonly weight?: number;
+    /** Pass-through configuration for custom evaluator prompts (legacy, prefer prompt.config) */
+    readonly config?: Record<string, unknown>;
 };
+/**
+ * Score range definition for analytic rubric scoring.
+ * Each range maps an integer score band (0-10) to an expected outcome description.
+ */
+type ScoreRange = {
+    /** Inclusive integer range [min, max] within 0-10 */
+    readonly score_range: readonly [number, number];
+    /** Description of what this score range represents */
+    readonly expected_outcome: string;
+};
+/**
+ * Rubric item for LLM judge evaluation.
+ * Supports two modes:
+ * - Checklist mode: boolean satisfied/not-satisfied with `expected_outcome`
+ * - Score-range mode: 0-10 integer scoring with `score_ranges`
+ */
 type RubricItem = {
     readonly id: string;
-    readonly description: string;
+    /**
+     * For checklist rubrics: the expected outcome text (required).
+     * For score-range rubrics: optional overall criterion description.
+     */
+    readonly expected_outcome?: string;
     readonly weight: number;
-    readonly required: boolean;
+    /**
+     * Legacy boolean gating (deprecated, treated as required_min_score: 10).
+     * Use required_min_score instead for finer control.
+     */
+    readonly required?: boolean;
+    /**
+     * Minimum score (0-10) required to pass this criterion.
+     * If the criterion score is below this threshold, the overall verdict is 'fail'.
+     */
+    readonly required_min_score?: number;
+    /**
+     * Score range definitions for analytic rubric scoring.
+     * When present, the judge outputs an integer 0-10 score per criterion.
+     * Ranges must be non-overlapping and cover 0-10 inclusive.
+     */
+    readonly score_ranges?: readonly ScoreRange[];
 };
 type CompositeAggregatorConfig = {
     readonly type: 'weighted_average';
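Together with the PromptScriptConfig type above, LLM-judge configs can now point at an executable prompt template, and rubric items can use analytic score ranges instead of a boolean checklist. A hedged configuration sketch (all names, weights, and range descriptions are illustrative, not taken from the package):

import type { LlmJudgeEvaluatorConfig, RubricItem } from '@agentv/core';

const rubrics: readonly RubricItem[] = [
    // Checklist mode: pass/fail against an expected outcome; required_min_score: 10
    // plays the role of the deprecated required: true flag.
    { id: 'cites-sources', expected_outcome: 'The answer cites at least one source.', weight: 1, required_min_score: 10 },
    // Score-range mode: the judge assigns an integer 0-10 score per criterion;
    // ranges must be non-overlapping and cover 0-10 inclusive.
    {
        id: 'answer-quality',
        weight: 2,
        required_min_score: 6,
        score_ranges: [
            { score_range: [0, 3], expected_outcome: 'Incorrect or largely irrelevant answer.' },
            { score_range: [4, 6], expected_outcome: 'Partially correct with notable gaps.' },
            { score_range: [7, 10], expected_outcome: 'Accurate, complete, well-supported answer.' },
        ],
    },
];

const judge: LlmJudgeEvaluatorConfig = {
    name: 'quality-judge',
    type: 'llm_judge',
    // prompt now also accepts an executable script config (PromptScriptConfig);
    // the command and config payload here are hypothetical.
    prompt: { script: ['bun', 'run', 'template.ts'], config: { tone: 'strict' } },
    rubrics,
};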
@@ -438,6 +492,8 @@ interface ToolCall {
     readonly id?: string;
     /** ISO 8601 timestamp */
     readonly timestamp?: string;
+    /** Duration of the tool call in milliseconds */
+    readonly durationMs?: number;
 }
 /**
  * An output message from agent execution.
@@ -454,6 +510,8 @@ interface OutputMessage {
     readonly toolCalls?: readonly ToolCall[];
     /** ISO 8601 timestamp */
     readonly timestamp?: string;
+    /** Duration of the message in milliseconds */
+    readonly durationMs?: number;
     /** Provider-specific metadata */
     readonly metadata?: Record<string, unknown>;
 }
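Both ToolCall and OutputMessage now expose an optional durationMs. A small sketch of how such timing data could be averaged; this mirrors the intent of the exported avgToolDurationMs helper but is not the library's implementation:

// Averages durationMs across calls that actually recorded timing data.
function averageDurationMs(calls: ReadonlyArray<{ durationMs?: number }>): number | undefined {
    const durations = calls
        .map((call) => call.durationMs)
        .filter((d): d is number => typeof d === 'number');
    if (durations.length === 0) return undefined; // no timing data available
    return durations.reduce((sum, d) => sum + d, 0) / durations.length;
}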
@@ -608,7 +666,8 @@ declare function detectFormat(filePath: string): 'yaml' | 'jsonl';
 
 type LoadOptions = {
     readonly verbose?: boolean;
-    readonly evalId?: string;
+    /** Filter eval cases by ID pattern (glob supported, e.g., "summary-*") */
+    readonly filter?: string;
 };
 /**
  * Read metadata from a test suite file (like target name).
@@ -1192,6 +1251,15 @@ declare class LlmJudgeEvaluator implements Evaluator {
     evaluate(context: EvaluationContext): Promise<EvaluationScore>;
     private evaluateFreeform;
     private evaluateWithRubrics;
+    /**
+     * Evaluate using score-range rubrics (analytic rubric scoring).
+     * Each criterion is scored 0-10 and normalized to 0-1.
+     */
+    private evaluateWithScoreRanges;
+    /**
+     * Build prompt for score-range rubric evaluation.
+     */
+    private buildScoreRangePrompt;
     private buildRubricPrompt;
     private runWithRetry;
 }
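The declarations above describe score-range evaluation only at the type level: each criterion gets an integer 0-10 score normalized to 0-1, and required_min_score gates the verdict. The sketch below spells out that arithmetic; the weight-based averaging is an assumption for illustration, not confirmed by this diff:

// Hypothetical aggregation: normalize 0-10 scores to 0-1, weight them, and
// fail the verdict if any criterion falls below its required_min_score.
function aggregateScoreRanges(
    criteria: ReadonlyArray<{ score: number; weight: number; requiredMinScore?: number }>,
): { score: number; verdict: 'pass' | 'fail' } {
    const totalWeight = criteria.reduce((sum, c) => sum + c.weight, 0);
    const weighted = criteria.reduce((sum, c) => sum + (c.score / 10) * c.weight, 0);
    const gated = criteria.some((c) => c.requiredMinScore !== undefined && c.score < c.requiredMinScore);
    return { score: totalWeight > 0 ? weighted / totalWeight : 0, verdict: gated ? 'fail' : 'pass' };
}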
@@ -1281,7 +1349,8 @@ interface RunEvaluationOptions {
     readonly cache?: EvaluationCache;
     readonly useCache?: boolean;
     readonly now?: () => Date;
-    readonly evalId?: string;
+    /** Filter eval cases by ID pattern (glob supported, e.g., "summary-*") */
+    readonly filter?: string;
     readonly verbose?: boolean;
     readonly maxConcurrency?: number;
     readonly evalCases?: readonly EvalCase[];
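The exact-match evalId option is replaced by a glob-capable filter in both LoadOptions and RunEvaluationOptions. The full signatures of loadEvalCases and runEvaluation are not part of this diff, so only the options shape is sketched, using Partial to sidestep fields that are not shown here:

import type { RunEvaluationOptions } from '@agentv/core';

// Previously: evalId: 'summary-basic' selected a single case by exact ID.
// Now a glob pattern can select one or many cases.
const options: Partial<RunEvaluationOptions> = {
    filter: 'summary-*',
    verbose: true,
    maxConcurrency: 4,
};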
@@ -1307,4 +1376,4 @@ type AgentKernel = {
 };
 declare function createAgentKernel(): AgentKernel;
 
- export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type ClaudeCodeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EXPLORATION_TOOLS, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, type ExecutionMetrics, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ProviderTokenUsage, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceSummary, type UserTestMessage, type VSCodeResolvedConfig, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildSearchRoots, clampScore, computeTraceSummary, consumeClaudeCodeLogEntries, consumeCodexLogEntries, consumePiLogEntries, createAgentKernel, createProvider, deepEqual, detectFormat, ensureVSCodeSubagents, executeScript, explorationRatio, extractJsonBlob, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, mergeExecutionMetrics, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, scoreToVerdict, subscribeToClaudeCodeLogEntries, subscribeToCodexLogEntries, subscribeToPiLogEntries, tokensPerTool };
+ export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type ClaudeCodeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EXPLORATION_TOOLS, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, type ExecutionMetrics, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ProviderTokenUsage, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceSummary, type UserTestMessage, type VSCodeResolvedConfig, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildSearchRoots, clampScore, computeTraceSummary, consumeClaudeCodeLogEntries, consumeCodexLogEntries, consumePiLogEntries, createAgentKernel, createProvider, deepEqual, detectFormat, ensureVSCodeSubagents, executeScript, explorationRatio, extractJsonBlob, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, mergeExecutionMetrics, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, scoreToVerdict, subscribeToClaudeCodeLogEntries, subscribeToCodexLogEntries, subscribeToPiLogEntries, tokensPerTool };
package/dist/index.d.ts CHANGED
@@ -60,6 +60,8 @@ interface ToolTrajectoryExpectedItem {
     readonly tool: string;
     /** Optional argument matching: 'any' skips validation, object performs partial deep equality */
     readonly args?: 'any' | Record<string, unknown>;
+    /** Optional maximum duration in milliseconds for latency assertions */
+    readonly maxDurationMs?: number;
 }
 /**
  * Simplified input type for computeTraceSummary.
@@ -226,19 +228,71 @@ type CodeEvaluatorConfig = {
     /** When present, enables target access for the script via local proxy */
     readonly target?: TargetAccessConfig;
 };
+/**
+ * Executable prompt template configuration.
+ * Matches code_judge pattern for consistency.
+ */
+type PromptScriptConfig = {
+    /** Command array to execute (e.g., ["bun", "run", "template.ts"]) */
+    readonly script: readonly string[];
+    /** Pass-through configuration for the prompt template */
+    readonly config?: Record<string, unknown>;
+};
 type LlmJudgeEvaluatorConfig = {
     readonly name: string;
     readonly type: 'llm_judge';
-    readonly prompt?: string;
+    /** Text prompt (inline or file path) or executable script config */
+    readonly prompt?: string | PromptScriptConfig;
     readonly promptPath?: string;
+    /** Resolved absolute path for prompt file (used for text template prompts) */
+    readonly resolvedPromptPath?: string;
+    /** Resolved script array for executable prompts (matches code_judge pattern) */
+    readonly resolvedPromptScript?: readonly string[];
     readonly rubrics?: readonly RubricItem[];
     readonly weight?: number;
+    /** Pass-through configuration for custom evaluator prompts (legacy, prefer prompt.config) */
+    readonly config?: Record<string, unknown>;
 };
+/**
+ * Score range definition for analytic rubric scoring.
+ * Each range maps an integer score band (0-10) to an expected outcome description.
+ */
+type ScoreRange = {
+    /** Inclusive integer range [min, max] within 0-10 */
+    readonly score_range: readonly [number, number];
+    /** Description of what this score range represents */
+    readonly expected_outcome: string;
+};
+/**
+ * Rubric item for LLM judge evaluation.
+ * Supports two modes:
+ * - Checklist mode: boolean satisfied/not-satisfied with `expected_outcome`
+ * - Score-range mode: 0-10 integer scoring with `score_ranges`
+ */
 type RubricItem = {
     readonly id: string;
-    readonly description: string;
+    /**
+     * For checklist rubrics: the expected outcome text (required).
+     * For score-range rubrics: optional overall criterion description.
+     */
+    readonly expected_outcome?: string;
     readonly weight: number;
-    readonly required: boolean;
+    /**
+     * Legacy boolean gating (deprecated, treated as required_min_score: 10).
+     * Use required_min_score instead for finer control.
+     */
+    readonly required?: boolean;
+    /**
+     * Minimum score (0-10) required to pass this criterion.
+     * If the criterion score is below this threshold, the overall verdict is 'fail'.
+     */
+    readonly required_min_score?: number;
+    /**
+     * Score range definitions for analytic rubric scoring.
+     * When present, the judge outputs an integer 0-10 score per criterion.
+     * Ranges must be non-overlapping and cover 0-10 inclusive.
+     */
+    readonly score_ranges?: readonly ScoreRange[];
 };
 type CompositeAggregatorConfig = {
     readonly type: 'weighted_average';
@@ -438,6 +492,8 @@ interface ToolCall {
     readonly id?: string;
     /** ISO 8601 timestamp */
     readonly timestamp?: string;
+    /** Duration of the tool call in milliseconds */
+    readonly durationMs?: number;
 }
 /**
  * An output message from agent execution.
@@ -454,6 +510,8 @@ interface OutputMessage {
     readonly toolCalls?: readonly ToolCall[];
     /** ISO 8601 timestamp */
     readonly timestamp?: string;
+    /** Duration of the message in milliseconds */
+    readonly durationMs?: number;
     /** Provider-specific metadata */
     readonly metadata?: Record<string, unknown>;
 }
@@ -608,7 +666,8 @@ declare function detectFormat(filePath: string): 'yaml' | 'jsonl';
 
 type LoadOptions = {
     readonly verbose?: boolean;
-    readonly evalId?: string;
+    /** Filter eval cases by ID pattern (glob supported, e.g., "summary-*") */
+    readonly filter?: string;
 };
 /**
  * Read metadata from a test suite file (like target name).
@@ -1192,6 +1251,15 @@ declare class LlmJudgeEvaluator implements Evaluator {
     evaluate(context: EvaluationContext): Promise<EvaluationScore>;
     private evaluateFreeform;
     private evaluateWithRubrics;
+    /**
+     * Evaluate using score-range rubrics (analytic rubric scoring).
+     * Each criterion is scored 0-10 and normalized to 0-1.
+     */
+    private evaluateWithScoreRanges;
+    /**
+     * Build prompt for score-range rubric evaluation.
+     */
+    private buildScoreRangePrompt;
     private buildRubricPrompt;
     private runWithRetry;
 }
@@ -1281,7 +1349,8 @@ interface RunEvaluationOptions {
     readonly cache?: EvaluationCache;
     readonly useCache?: boolean;
     readonly now?: () => Date;
-    readonly evalId?: string;
+    /** Filter eval cases by ID pattern (glob supported, e.g., "summary-*") */
+    readonly filter?: string;
     readonly verbose?: boolean;
     readonly maxConcurrency?: number;
     readonly evalCases?: readonly EvalCase[];
@@ -1307,4 +1376,4 @@ type AgentKernel = {
 };
 declare function createAgentKernel(): AgentKernel;
 
- export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type ClaudeCodeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EXPLORATION_TOOLS, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, type ExecutionMetrics, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ProviderTokenUsage, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceSummary, type UserTestMessage, type VSCodeResolvedConfig, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildSearchRoots, clampScore, computeTraceSummary, consumeClaudeCodeLogEntries, consumeCodexLogEntries, consumePiLogEntries, createAgentKernel, createProvider, deepEqual, detectFormat, ensureVSCodeSubagents, executeScript, explorationRatio, extractJsonBlob, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, mergeExecutionMetrics, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, scoreToVerdict, subscribeToClaudeCodeLogEntries, subscribeToCodexLogEntries, subscribeToPiLogEntries, tokensPerTool };
+ export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type ClaudeCodeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EXPLORATION_TOOLS, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, type ExecutionMetrics, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ProviderTokenUsage, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceSummary, type UserTestMessage, type VSCodeResolvedConfig, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildSearchRoots, clampScore, computeTraceSummary, consumeClaudeCodeLogEntries, consumeCodexLogEntries, consumePiLogEntries, createAgentKernel, createProvider, deepEqual, detectFormat, ensureVSCodeSubagents, executeScript, explorationRatio, extractJsonBlob, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, mergeExecutionMetrics, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, scoreToVerdict, subscribeToClaudeCodeLogEntries, subscribeToCodexLogEntries, subscribeToPiLogEntries, tokensPerTool };