@agentv/core 2.12.0 → 2.14.0-next.1

This diff shows the changes between two publicly available versions of a package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -257,7 +257,7 @@ interface TraceComputeResult {
257
257
  readonly endTime?: string;
258
258
  }
259
259
  /**
260
- * Argument matching mode for tool_trajectory expected items.
260
+ * Argument matching mode for tool-trajectory expected items.
261
261
  * - 'exact': bidirectional deep equality, no extra keys allowed (default)
262
262
  * - 'superset': actual args must contain all expected keys (extras OK)
263
263
  * - 'subset': actual args must be a subset of expected keys (no unexpected keys)
@@ -265,11 +265,11 @@ interface TraceComputeResult {
265
265
  */
266
266
  type ArgsMatchMode = 'exact' | 'ignore' | 'subset' | 'superset';
267
267
  /**
268
- * Configuration for tool_trajectory evaluator.
268
+ * Configuration for tool-trajectory evaluator.
269
269
  */
270
270
  interface ToolTrajectoryEvaluatorConfig {
271
271
  readonly name: string;
272
- readonly type: 'tool_trajectory';
272
+ readonly type: 'tool-trajectory';
273
273
  /** Matching mode */
274
274
  readonly mode: 'any_order' | 'in_order' | 'exact' | 'subset' | 'superset';
275
275
  /** Minimum call counts per tool (for any_order mode) */
@@ -453,11 +453,11 @@ declare function isJsonValue(value: unknown): value is JsonValue;
453
453
  * - Either content (string or array of objects) OR tool_calls (for assistant messages)
454
454
  */
455
455
  declare function isTestMessage(value: unknown): value is TestMessage;
456
- declare const EVALUATOR_KIND_VALUES: readonly ["code_judge", "llm_judge", "rubric", "composite", "tool_trajectory", "field_accuracy", "latency", "cost", "token_usage", "execution_metrics", "agent_judge", "contains", "contains_any", "contains_all", "icontains", "icontains_any", "icontains_all", "starts_with", "ends_with", "regex", "is_json", "equals", "rubrics"];
456
+ declare const EVALUATOR_KIND_VALUES: readonly ["code-judge", "llm-judge", "rubric", "composite", "tool-trajectory", "field-accuracy", "latency", "cost", "token-usage", "execution-metrics", "agent-judge", "contains", "contains-any", "contains-all", "icontains", "icontains-any", "icontains-all", "starts-with", "ends-with", "regex", "is-json", "equals", "rubrics"];
457
457
  type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
458
458
  declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
459
459
  /**
460
- * Configuration for enabling target access in code_judge evaluators.
460
+ * Configuration for enabling target access in code-judge evaluators.
461
461
  * When present, the runtime will start a local proxy server that allows
462
462
  * the script to invoke configured targets without direct credential access.
463
463
  */
@@ -539,7 +539,7 @@ type WorkspaceConfig = {
539
539
  };
540
540
  type CodeEvaluatorConfig = {
541
541
  readonly name: string;
542
- readonly type: 'code';
542
+ readonly type: 'code-judge';
543
543
  readonly command: readonly string[];
544
544
  /** @deprecated Use `command` instead */
545
545
  readonly script?: readonly string[];
@@ -550,14 +550,14 @@ type CodeEvaluatorConfig = {
550
550
  readonly required?: boolean | number;
551
551
  /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
552
552
  readonly negate?: boolean;
553
- /** Pass-through configuration for the code_judge (any unrecognized YAML properties) */
553
+ /** Pass-through configuration for the code-judge (any unrecognized YAML properties) */
554
554
  readonly config?: JsonObject;
555
555
  /** When present, enables target access via local proxy */
556
556
  readonly target?: TargetAccessConfig;
557
557
  };
558
558
  /**
559
559
  * Executable prompt template configuration.
560
- * Matches code_judge pattern for consistency.
560
+ * Matches code-judge pattern for consistency.
561
561
  */
562
562
  type PromptScriptConfig = {
563
563
  /** Command array to execute (e.g., ["bun", "run", "template.ts"]) */
@@ -569,13 +569,13 @@ type PromptScriptConfig = {
569
569
  };
570
570
  type LlmJudgeEvaluatorConfig = {
571
571
  readonly name: string;
572
- readonly type: 'llm_judge';
572
+ readonly type: 'llm-judge';
573
573
  /** Text prompt (inline or file path) or executable script config */
574
574
  readonly prompt?: string | PromptScriptConfig;
575
575
  readonly promptPath?: string;
576
576
  /** Resolved absolute path for prompt file (used for text template prompts) */
577
577
  readonly resolvedPromptPath?: string;
578
- /** Resolved script array for executable prompts (matches code_judge pattern) */
578
+ /** Resolved script array for executable prompts (matches code-judge pattern) */
579
579
  readonly resolvedPromptScript?: readonly string[];
580
580
  readonly rubrics?: readonly RubricItem[];
581
581
  readonly weight?: number;
@@ -630,11 +630,11 @@ type CompositeAggregatorConfig = {
630
630
  readonly type: 'weighted_average';
631
631
  readonly weights?: Record<string, number>;
632
632
  } | {
633
- readonly type: 'code_judge';
633
+ readonly type: 'code-judge';
634
634
  readonly path: string;
635
635
  readonly cwd?: string;
636
636
  } | {
637
- readonly type: 'llm_judge';
637
+ readonly type: 'llm-judge';
638
638
  readonly prompt?: string;
639
639
  readonly promptPath?: string;
640
640
  readonly model?: string;
@@ -654,7 +654,7 @@ type CompositeEvaluatorConfig = {
654
654
  };
655
655
  /**
656
656
  * Match type for field accuracy evaluation.
657
- * Note: For fuzzy string matching (Levenshtein, Jaro-Winkler, etc.), use a code_judge evaluator.
657
+ * Note: For fuzzy string matching (Levenshtein, Jaro-Winkler, etc.), use a code-judge evaluator.
658
658
  * See examples/features/document-extraction/fuzzy_match.ts for an example.
659
659
  */
660
660
  type FieldMatchType = 'exact' | 'numeric_tolerance' | 'date';
@@ -682,11 +682,11 @@ type FieldConfig = {
682
682
  readonly formats?: readonly string[];
683
683
  };
684
684
  /**
685
- * Configuration for the field_accuracy evaluator.
685
+ * Configuration for the field-accuracy evaluator.
686
686
  */
687
687
  type FieldAccuracyEvaluatorConfig = {
688
688
  readonly name: string;
689
- readonly type: 'field_accuracy';
689
+ readonly type: 'field-accuracy';
690
690
  /** Fields to compare between candidate and expected */
691
691
  readonly fields: readonly FieldConfig[];
692
692
  /** Strategy for combining field scores (default: weighted_average) */
@@ -725,12 +725,12 @@ type CostEvaluatorConfig = {
725
725
  readonly negate?: boolean;
726
726
  };
727
727
  /**
728
- * Configuration for the token_usage evaluator.
728
+ * Configuration for the token-usage evaluator.
729
729
  * Checks provider-reported token usage against configured limits.
730
730
  */
731
731
  type TokenUsageEvaluatorConfig = {
732
732
  readonly name: string;
733
- readonly type: 'token_usage';
733
+ readonly type: 'token-usage';
734
734
  /** Maximum allowed total tokens (input + output + cached, when present) */
735
735
  readonly max_total?: number;
736
736
  /** Maximum allowed input tokens (prompt) */
@@ -743,13 +743,13 @@ type TokenUsageEvaluatorConfig = {
743
743
  readonly negate?: boolean;
744
744
  };
745
745
  /**
746
- * Configuration for the execution_metrics evaluator.
746
+ * Configuration for the execution-metrics evaluator.
747
747
  * Provides declarative threshold-based checks on execution metrics.
748
748
  * Only specified thresholds are checked; omitted ones are ignored.
749
749
  */
750
750
  type ExecutionMetricsEvaluatorConfig = {
751
751
  readonly name: string;
752
- readonly type: 'execution_metrics';
752
+ readonly type: 'execution-metrics';
753
753
  /** Maximum allowed number of tool calls */
754
754
  readonly max_tool_calls?: number;
755
755
  /** Maximum allowed number of LLM calls (assistant messages) */
@@ -770,7 +770,7 @@ type ExecutionMetricsEvaluatorConfig = {
770
770
  readonly negate?: boolean;
771
771
  };
772
772
  /**
773
- * Configuration for the agent_judge evaluator.
773
+ * Configuration for the agent-judge evaluator.
774
774
  * Runs an agentic investigation loop to audit workspaces and verify criteria.
775
775
  * Two modes:
776
776
  * - Built-in: Uses AI SDK generateText() with sandboxed filesystem tools
@@ -778,13 +778,13 @@ type ExecutionMetricsEvaluatorConfig = {
778
778
  */
779
779
  type AgentJudgeEvaluatorConfig = {
780
780
  readonly name: string;
781
- readonly type: 'agent_judge';
781
+ readonly type: 'agent-judge';
782
782
  /** Custom evaluation prompt (inline text or file path) */
783
783
  readonly prompt?: string;
784
784
  readonly promptPath?: string;
785
785
  /** Resolved absolute path for prompt file */
786
786
  readonly resolvedPromptPath?: string;
787
- /** Rubric items for structured evaluation (reuses llm_judge rubric infra) */
787
+ /** Rubric items for structured evaluation (reuses llm-judge rubric infra) */
788
788
  readonly rubrics?: readonly RubricItem[];
789
789
  /** Maximum agent steps for built-in mode (default 10, max 50) */
790
790
  readonly max_steps?: number;
@@ -816,7 +816,7 @@ type ContainsEvaluatorConfig = {
816
816
  */
817
817
  type ContainsAnyEvaluatorConfig = {
818
818
  readonly name: string;
819
- readonly type: 'contains_any';
819
+ readonly type: 'contains-any';
820
820
  readonly value: readonly string[];
821
821
  readonly weight?: number;
822
822
  readonly required?: boolean | number;
@@ -829,7 +829,7 @@ type ContainsAnyEvaluatorConfig = {
829
829
  */
830
830
  type ContainsAllEvaluatorConfig = {
831
831
  readonly name: string;
832
- readonly type: 'contains_all';
832
+ readonly type: 'contains-all';
833
833
  readonly value: readonly string[];
834
834
  readonly weight?: number;
835
835
  readonly required?: boolean | number;
@@ -855,7 +855,7 @@ type IcontainsEvaluatorConfig = {
855
855
  */
856
856
  type IcontainsAnyEvaluatorConfig = {
857
857
  readonly name: string;
858
- readonly type: 'icontains_any';
858
+ readonly type: 'icontains-any';
859
859
  readonly value: readonly string[];
860
860
  readonly weight?: number;
861
861
  readonly required?: boolean | number;
@@ -868,7 +868,7 @@ type IcontainsAnyEvaluatorConfig = {
868
868
  */
869
869
  type IcontainsAllEvaluatorConfig = {
870
870
  readonly name: string;
871
- readonly type: 'icontains_all';
871
+ readonly type: 'icontains-all';
872
872
  readonly value: readonly string[];
873
873
  readonly weight?: number;
874
874
  readonly required?: boolean | number;
@@ -881,7 +881,7 @@ type IcontainsAllEvaluatorConfig = {
881
881
  */
882
882
  type StartsWithEvaluatorConfig = {
883
883
  readonly name: string;
884
- readonly type: 'starts_with';
884
+ readonly type: 'starts-with';
885
885
  readonly value: string;
886
886
  readonly weight?: number;
887
887
  readonly required?: boolean | number;
@@ -894,7 +894,7 @@ type StartsWithEvaluatorConfig = {
894
894
  */
895
895
  type EndsWithEvaluatorConfig = {
896
896
  readonly name: string;
897
- readonly type: 'ends_with';
897
+ readonly type: 'ends-with';
898
898
  readonly value: string;
899
899
  readonly weight?: number;
900
900
  readonly required?: boolean | number;
@@ -922,7 +922,7 @@ type RegexEvaluatorConfig = {
922
922
  */
923
923
  type IsJsonEvaluatorConfig = {
924
924
  readonly name: string;
925
- readonly type: 'is_json';
925
+ readonly type: 'is-json';
926
926
  readonly weight?: number;
927
927
  readonly required?: boolean | number;
928
928
  /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
@@ -1060,6 +1060,12 @@ interface ExecutionError {
1060
1060
  readonly message: string;
1061
1061
  readonly stage: FailureStage;
1062
1062
  }
1063
+ /**
1064
+ * Tolerance for execution errors in an eval run.
1065
+ * - `true`: halt on first execution error
1066
+ * - `false`: never halt on errors (default)
1067
+ */
1068
+ type FailOnError = boolean;
1063
1069
  /**
1064
1070
  * Evaluator scorecard for a single eval case run.
1065
1071
  */
@@ -1194,6 +1200,7 @@ type ExecutionDefaults = {
1194
1200
  readonly otel_file?: string;
1195
1201
  };
1196
1202
  type AgentVConfig$1 = {
1203
+ readonly required_version?: string;
1197
1204
  readonly guideline_patterns?: readonly string[];
1198
1205
  readonly eval_patterns?: readonly string[];
1199
1206
  readonly execution?: ExecutionDefaults;
@@ -1238,6 +1245,12 @@ interface CacheConfig {
1238
1245
  * Returns undefined when no cache config is specified.
1239
1246
  */
1240
1247
  declare function extractCacheConfig(suite: JsonObject): CacheConfig | undefined;
1248
+ /**
1249
+ * Extract `execution.fail_on_error` from parsed eval suite.
1250
+ * Accepts `true` or `false`.
1251
+ * Returns undefined when not specified.
1252
+ */
1253
+ declare function extractFailOnError(suite: JsonObject): FailOnError | undefined;
1241
1254
 
1242
1255
  /**
1243
1256
  * Formatting mode for segment content.
@@ -1297,6 +1310,8 @@ type EvalSuiteResult = {
1297
1310
  readonly metadata?: EvalMetadata;
1298
1311
  /** Suite-level total cost budget in USD */
1299
1312
  readonly totalBudgetUsd?: number;
1313
+ /** Execution error tolerance: true or false */
1314
+ readonly failOnError?: FailOnError;
1300
1315
  };
1301
1316
  /**
1302
1317
  * Load tests and suite metadata from a single parse.
@@ -1900,7 +1915,7 @@ interface CodeEvaluatorOptions {
1900
1915
  readonly target?: TargetAccessConfig;
1901
1916
  }
1902
1917
  declare class CodeEvaluator implements Evaluator {
1903
- readonly kind = "code";
1918
+ readonly kind = "code-judge";
1904
1919
  private readonly command;
1905
1920
  private readonly cwd?;
1906
1921
  private readonly agentTimeoutMs?;
@@ -1955,7 +1970,7 @@ interface ExecutionMetricsEvaluatorOptions {
1955
1970
  * Score is proportional: hits.length / (hits.length + misses.length)
1956
1971
  */
1957
1972
  declare class ExecutionMetricsEvaluator implements Evaluator {
1958
- readonly kind = "execution_metrics";
1973
+ readonly kind = "execution-metrics";
1959
1974
  private readonly config;
1960
1975
  constructor(options: ExecutionMetricsEvaluatorOptions);
1961
1976
  evaluate(context: EvaluationContext): EvaluationScore;
@@ -1971,7 +1986,7 @@ interface FieldAccuracyEvaluatorOptions {
1971
1986
  * with configurable matching strategies (exact, fuzzy, numeric_tolerance, date).
1972
1987
  */
1973
1988
  declare class FieldAccuracyEvaluator implements Evaluator {
1974
- readonly kind = "field_accuracy";
1989
+ readonly kind = "field-accuracy";
1975
1990
  private readonly config;
1976
1991
  constructor(options: FieldAccuracyEvaluatorOptions);
1977
1992
  evaluate(context: EvaluationContext): EvaluationScore;
@@ -2076,7 +2091,7 @@ declare const rubricEvaluationSchema: z.ZodObject<{
2076
2091
  }>;
2077
2092
 
2078
2093
  declare class LlmJudgeEvaluator implements Evaluator {
2079
- readonly kind = "llm_judge";
2094
+ readonly kind = "llm-judge";
2080
2095
  private readonly resolveJudgeProvider;
2081
2096
  private readonly maxOutputTokens?;
2082
2097
  private readonly temperature?;
@@ -2123,7 +2138,7 @@ interface AgentJudgeEvaluatorOptions {
2123
2138
  readonly judgeTargetProvider?: Provider;
2124
2139
  }
2125
2140
  declare class AgentJudgeEvaluator implements Evaluator {
2126
- readonly kind = "agent_judge";
2141
+ readonly kind = "agent-judge";
2127
2142
  private readonly resolveJudgeProvider;
2128
2143
  private readonly maxSteps;
2129
2144
  private readonly temperature;
@@ -2185,7 +2200,7 @@ interface TokenUsageEvaluatorOptions {
2185
2200
  * Uses tokenUsage from the evaluation context.
2186
2201
  */
2187
2202
  declare class TokenUsageEvaluator implements Evaluator {
2188
- readonly kind = "token_usage";
2203
+ readonly kind = "token-usage";
2189
2204
  private readonly config;
2190
2205
  constructor(options: TokenUsageEvaluatorOptions);
2191
2206
  evaluate(context: EvaluationContext): EvaluationScore;
@@ -2195,7 +2210,7 @@ interface ToolTrajectoryEvaluatorOptions {
2195
2210
  readonly config: ToolTrajectoryEvaluatorConfig;
2196
2211
  }
2197
2212
  declare class ToolTrajectoryEvaluator implements Evaluator {
2198
- readonly kind = "tool_trajectory";
2213
+ readonly kind = "tool-trajectory";
2199
2214
  private readonly config;
2200
2215
  constructor(options: ToolTrajectoryEvaluatorOptions);
2201
2216
  evaluate(context: EvaluationContext): EvaluationScore;
@@ -2320,7 +2335,7 @@ declare class EvaluatorRegistry {
2320
2335
  }
2321
2336
  /**
2322
2337
  * Adapter that wraps a synchronous assertion function as an Evaluator.
2323
- * Used for deterministic assertions (contains, regex, is_json, equals).
2338
+ * Used for deterministic assertions (contains, regex, is-json, equals).
2324
2339
  */
2325
2340
  declare class DeterministicAssertionEvaluator implements Evaluator {
2326
2341
  private readonly assertFn;
@@ -2368,7 +2383,7 @@ interface RunEvalCaseOptions {
2368
2383
  readonly provider: Provider;
2369
2384
  readonly target: ResolvedTarget;
2370
2385
  readonly evaluators: Partial<Record<string, Evaluator>> & {
2371
- readonly llm_judge: Evaluator;
2386
+ readonly 'llm-judge': Evaluator;
2372
2387
  };
2373
2388
  readonly now?: () => Date;
2374
2389
  readonly maxRetries?: number;
@@ -2440,6 +2455,8 @@ interface RunEvaluationOptions {
2440
2455
  readonly streamCallbacks?: ProviderStreamCallbacks;
2441
2456
  /** Suite-level total cost budget in USD (stops dispatching when exceeded) */
2442
2457
  readonly totalBudgetUsd?: number;
2458
+ /** Execution error tolerance: true halts on first error */
2459
+ readonly failOnError?: FailOnError;
2443
2460
  }
2444
2461
  declare function runEvaluation(options: RunEvaluationOptions): Promise<readonly EvaluationResult[]>;
2445
2462
  declare function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationResult>;
@@ -2507,7 +2524,7 @@ interface EvalTestInput {
2507
2524
  * Matches the YAML `assert` block structure.
2508
2525
  */
2509
2526
  interface EvalAssertionInput {
2510
- /** Assertion type (e.g., 'contains', 'llm_judge', 'code_judge') */
2527
+ /** Assertion type (e.g., 'contains', 'llm-judge', 'code-judge') */
2511
2528
  readonly type: string;
2512
2529
  /** Display name */
2513
2530
  readonly name?: string;
@@ -3135,7 +3152,7 @@ declare function createBuiltinRegistry(): EvaluatorRegistry;
3135
3152
  * Convention-based discovery of custom assertion scripts.
3136
3153
  *
3137
3154
  * Scans `.agentv/assertions/` for TypeScript/JavaScript files and registers
3138
- * them as code_judge evaluators in the registry. The file name (without
3155
+ * them as code-judge evaluators in the registry. The file name (without
3139
3156
  * extension) becomes the evaluator type name.
3140
3157
  *
3141
3158
  * Example: `.agentv/assertions/sentiment.ts` → type "sentiment" in EVAL.yaml
@@ -3156,4 +3173,4 @@ type AgentKernel = {
3156
3173
  };
3157
3174
  declare function createAgentKernel(): AgentKernel;
3158
3175
 
3159
- export { AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type 
IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResetConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, 
type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, getWorkspacePath, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, 
subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
3176
+ export { AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type 
GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResetConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type 
WorkspaceConfig, WorkspaceCreationError, type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, getWorkspacePath, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, 
subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };