@agentv/core 2.9.0-next.2 → 2.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-7Q4PH265.js → chunk-REN5PS7B.js} +15 -8
- package/dist/chunk-REN5PS7B.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +106 -8
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +96 -5
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +745 -170
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +173 -9
- package/dist/index.d.ts +173 -9
- package/dist/index.js +710 -150
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-7Q4PH265.js.map +0 -1
package/dist/index.d.cts
CHANGED
|
@@ -8,7 +8,7 @@ interface ChatMessage {
|
|
|
8
8
|
readonly name?: string;
|
|
9
9
|
}
|
|
10
10
|
type ChatPrompt = readonly ChatMessage[];
|
|
11
|
-
type ProviderKind = 'azure' | 'anthropic' | 'gemini' | 'codex' | 'copilot' | 'copilot-cli' | 'pi-coding-agent' | 'pi-agent-sdk' | 'claude' | 'cli' | 'mock' | 'vscode' | 'vscode-insiders';
|
|
11
|
+
type ProviderKind = 'azure' | 'anthropic' | 'gemini' | 'codex' | 'copilot-sdk' | 'copilot-cli' | 'pi-coding-agent' | 'pi-agent-sdk' | 'claude' | 'cli' | 'mock' | 'vscode' | 'vscode-insiders';
|
|
12
12
|
/** Callbacks for real-time observability during provider execution */
|
|
13
13
|
interface ProviderStreamCallbacks {
|
|
14
14
|
onToolCallStart?: (toolName: string, toolCallId?: string) => void;
|
|
@@ -453,7 +453,7 @@ declare function isJsonValue(value: unknown): value is JsonValue;
|
|
|
453
453
|
* - Either content (string or array of objects) OR tool_calls (for assistant messages)
|
|
454
454
|
*/
|
|
455
455
|
declare function isTestMessage(value: unknown): value is TestMessage;
|
|
456
|
-
declare const EVALUATOR_KIND_VALUES: readonly ["code_judge", "llm_judge", "rubric", "composite", "tool_trajectory", "field_accuracy", "latency", "cost", "token_usage", "execution_metrics", "agent_judge", "contains", "regex", "is_json", "equals", "rubrics"];
|
|
456
|
+
declare const EVALUATOR_KIND_VALUES: readonly ["code_judge", "llm_judge", "rubric", "composite", "tool_trajectory", "field_accuracy", "latency", "cost", "token_usage", "execution_metrics", "agent_judge", "contains", "contains_any", "contains_all", "icontains", "icontains_any", "icontains_all", "starts_with", "ends_with", "regex", "is_json", "equals", "rubrics"];
|
|
457
457
|
type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
|
|
458
458
|
declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
|
|
459
459
|
/**
|
|
@@ -491,10 +491,43 @@ type WorkspaceScriptConfig = {
|
|
|
491
491
|
* - before_each: runs before each test (optional)
|
|
492
492
|
* - after_each: runs after each test (e.g., reset git state)
|
|
493
493
|
*/
|
|
494
|
+
type RepoSource = {
|
|
495
|
+
readonly type: 'git';
|
|
496
|
+
readonly url: string;
|
|
497
|
+
} | {
|
|
498
|
+
readonly type: 'local';
|
|
499
|
+
readonly path: string;
|
|
500
|
+
};
|
|
501
|
+
type RepoCheckout = {
|
|
502
|
+
readonly ref?: string;
|
|
503
|
+
readonly resolve?: 'remote' | 'local';
|
|
504
|
+
readonly ancestor?: number;
|
|
505
|
+
};
|
|
506
|
+
type RepoClone = {
|
|
507
|
+
readonly depth?: number;
|
|
508
|
+
readonly filter?: string;
|
|
509
|
+
readonly sparse?: readonly string[];
|
|
510
|
+
};
|
|
511
|
+
type RepoConfig = {
|
|
512
|
+
readonly path: string;
|
|
513
|
+
readonly source: RepoSource;
|
|
514
|
+
readonly checkout?: RepoCheckout;
|
|
515
|
+
readonly clone?: RepoClone;
|
|
516
|
+
};
|
|
517
|
+
type ResetConfig = {
|
|
518
|
+
readonly strategy?: 'none' | 'hard' | 'recreate';
|
|
519
|
+
readonly after_each?: boolean;
|
|
520
|
+
};
|
|
494
521
|
type WorkspaceConfig = {
|
|
495
522
|
/** Template directory or .code-workspace file. Directories are copied to temp workspace.
|
|
496
523
|
* .code-workspace files are used by VS Code providers; CLI providers use the parent directory. */
|
|
497
524
|
readonly template?: string;
|
|
525
|
+
/** Isolation strategy for workspace: shared (default) or per_test */
|
|
526
|
+
readonly isolation?: 'shared' | 'per_test';
|
|
527
|
+
/** Repository definitions to clone/checkout into workspace */
|
|
528
|
+
readonly repos?: readonly RepoConfig[];
|
|
529
|
+
/** Reset configuration for repos between test runs */
|
|
530
|
+
readonly reset?: ResetConfig;
|
|
498
531
|
/** Command to run once before first test (after workspace creation, before git baseline) */
|
|
499
532
|
readonly before_all?: WorkspaceScriptConfig;
|
|
500
533
|
/** Command to run once after last test (before workspace cleanup) */
|
|
@@ -777,6 +810,97 @@ type ContainsEvaluatorConfig = {
|
|
|
777
810
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
778
811
|
readonly negate?: boolean;
|
|
779
812
|
};
|
|
813
|
+
/**
|
|
814
|
+
* Configuration for the contains_any assertion evaluator.
|
|
815
|
+
* Checks whether the candidate output contains ANY of the specified substrings.
|
|
816
|
+
*/
|
|
817
|
+
type ContainsAnyEvaluatorConfig = {
|
|
818
|
+
readonly name: string;
|
|
819
|
+
readonly type: 'contains_any';
|
|
820
|
+
readonly value: readonly string[];
|
|
821
|
+
readonly weight?: number;
|
|
822
|
+
readonly required?: boolean | number;
|
|
823
|
+
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
824
|
+
readonly negate?: boolean;
|
|
825
|
+
};
|
|
826
|
+
/**
|
|
827
|
+
* Configuration for the contains_all assertion evaluator.
|
|
828
|
+
* Checks whether the candidate output contains ALL of the specified substrings.
|
|
829
|
+
*/
|
|
830
|
+
type ContainsAllEvaluatorConfig = {
|
|
831
|
+
readonly name: string;
|
|
832
|
+
readonly type: 'contains_all';
|
|
833
|
+
readonly value: readonly string[];
|
|
834
|
+
readonly weight?: number;
|
|
835
|
+
readonly required?: boolean | number;
|
|
836
|
+
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
837
|
+
readonly negate?: boolean;
|
|
838
|
+
};
|
|
839
|
+
/**
|
|
840
|
+
* Configuration for the icontains assertion evaluator.
|
|
841
|
+
* Case-insensitive check whether the candidate output contains a specified substring.
|
|
842
|
+
*/
|
|
843
|
+
type IcontainsEvaluatorConfig = {
|
|
844
|
+
readonly name: string;
|
|
845
|
+
readonly type: 'icontains';
|
|
846
|
+
readonly value: string;
|
|
847
|
+
readonly weight?: number;
|
|
848
|
+
readonly required?: boolean | number;
|
|
849
|
+
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
850
|
+
readonly negate?: boolean;
|
|
851
|
+
};
|
|
852
|
+
/**
|
|
853
|
+
* Configuration for the icontains_any assertion evaluator.
|
|
854
|
+
* Case-insensitive check whether the candidate output contains ANY of the specified substrings.
|
|
855
|
+
*/
|
|
856
|
+
type IcontainsAnyEvaluatorConfig = {
|
|
857
|
+
readonly name: string;
|
|
858
|
+
readonly type: 'icontains_any';
|
|
859
|
+
readonly value: readonly string[];
|
|
860
|
+
readonly weight?: number;
|
|
861
|
+
readonly required?: boolean | number;
|
|
862
|
+
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
863
|
+
readonly negate?: boolean;
|
|
864
|
+
};
|
|
865
|
+
/**
|
|
866
|
+
* Configuration for the icontains_all assertion evaluator.
|
|
867
|
+
* Case-insensitive check whether the candidate output contains ALL of the specified substrings.
|
|
868
|
+
*/
|
|
869
|
+
type IcontainsAllEvaluatorConfig = {
|
|
870
|
+
readonly name: string;
|
|
871
|
+
readonly type: 'icontains_all';
|
|
872
|
+
readonly value: readonly string[];
|
|
873
|
+
readonly weight?: number;
|
|
874
|
+
readonly required?: boolean | number;
|
|
875
|
+
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
876
|
+
readonly negate?: boolean;
|
|
877
|
+
};
|
|
878
|
+
/**
|
|
879
|
+
* Configuration for the starts_with assertion evaluator.
|
|
880
|
+
* Checks whether the candidate output starts with a specified string (both trimmed).
|
|
881
|
+
*/
|
|
882
|
+
type StartsWithEvaluatorConfig = {
|
|
883
|
+
readonly name: string;
|
|
884
|
+
readonly type: 'starts_with';
|
|
885
|
+
readonly value: string;
|
|
886
|
+
readonly weight?: number;
|
|
887
|
+
readonly required?: boolean | number;
|
|
888
|
+
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
889
|
+
readonly negate?: boolean;
|
|
890
|
+
};
|
|
891
|
+
/**
|
|
892
|
+
* Configuration for the ends_with assertion evaluator.
|
|
893
|
+
* Checks whether the candidate output ends with a specified string (both trimmed).
|
|
894
|
+
*/
|
|
895
|
+
type EndsWithEvaluatorConfig = {
|
|
896
|
+
readonly name: string;
|
|
897
|
+
readonly type: 'ends_with';
|
|
898
|
+
readonly value: string;
|
|
899
|
+
readonly weight?: number;
|
|
900
|
+
readonly required?: boolean | number;
|
|
901
|
+
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
902
|
+
readonly negate?: boolean;
|
|
903
|
+
};
|
|
780
904
|
/**
|
|
781
905
|
* Configuration for the regex assertion evaluator.
|
|
782
906
|
* Checks whether the candidate output matches a regular expression pattern.
|
|
@@ -785,6 +909,8 @@ type RegexEvaluatorConfig = {
|
|
|
785
909
|
readonly name: string;
|
|
786
910
|
readonly type: 'regex';
|
|
787
911
|
readonly value: string;
|
|
912
|
+
/** Optional regex flags (e.g., "i" for case-insensitive, "m" for multiline) */
|
|
913
|
+
readonly flags?: string;
|
|
788
914
|
readonly weight?: number;
|
|
789
915
|
readonly required?: boolean | number;
|
|
790
916
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
@@ -828,7 +954,7 @@ type RubricsEvaluatorConfig = {
|
|
|
828
954
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
829
955
|
readonly negate?: boolean;
|
|
830
956
|
};
|
|
831
|
-
type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig | FieldAccuracyEvaluatorConfig | LatencyEvaluatorConfig | CostEvaluatorConfig | TokenUsageEvaluatorConfig | ExecutionMetricsEvaluatorConfig | AgentJudgeEvaluatorConfig | ContainsEvaluatorConfig | RegexEvaluatorConfig | IsJsonEvaluatorConfig | EqualsEvaluatorConfig | RubricsEvaluatorConfig;
|
|
957
|
+
type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig | FieldAccuracyEvaluatorConfig | LatencyEvaluatorConfig | CostEvaluatorConfig | TokenUsageEvaluatorConfig | ExecutionMetricsEvaluatorConfig | AgentJudgeEvaluatorConfig | ContainsEvaluatorConfig | ContainsAnyEvaluatorConfig | ContainsAllEvaluatorConfig | IcontainsEvaluatorConfig | IcontainsAnyEvaluatorConfig | IcontainsAllEvaluatorConfig | StartsWithEvaluatorConfig | EndsWithEvaluatorConfig | RegexEvaluatorConfig | IsJsonEvaluatorConfig | EqualsEvaluatorConfig | RubricsEvaluatorConfig;
|
|
832
958
|
/**
|
|
833
959
|
* Eval test definition sourced from AgentV specs.
|
|
834
960
|
*/
|
|
@@ -968,7 +1094,7 @@ interface EvaluationResult {
|
|
|
968
1094
|
/** Whether the evaluation was skipped due to suite-level budget exhaustion */
|
|
969
1095
|
readonly budgetExceeded?: boolean;
|
|
970
1096
|
}
|
|
971
|
-
type EvaluationVerdict = 'pass' | 'fail' | 'borderline';
|
|
1097
|
+
type EvaluationVerdict = 'pass' | 'fail' | 'borderline' | 'skip';
|
|
972
1098
|
interface EvaluatorResult {
|
|
973
1099
|
readonly name: string;
|
|
974
1100
|
readonly type: EvaluatorKind;
|
|
@@ -1427,7 +1553,7 @@ type ResolvedTarget = {
|
|
|
1427
1553
|
readonly providerBatching?: boolean;
|
|
1428
1554
|
readonly config: CodexResolvedConfig;
|
|
1429
1555
|
} | {
|
|
1430
|
-
readonly kind: 'copilot';
|
|
1556
|
+
readonly kind: 'copilot-sdk';
|
|
1431
1557
|
readonly name: string;
|
|
1432
1558
|
readonly judgeTarget?: string;
|
|
1433
1559
|
readonly workers?: number;
|
|
@@ -1941,7 +2067,7 @@ declare function buildRubricOutputSchema(): string;
|
|
|
1941
2067
|
declare function substituteVariables(template: string, variables: Record<string, string>): string;
|
|
1942
2068
|
declare function calculateRubricScore(result: z.infer<typeof rubricEvaluationSchema>, rubrics: readonly RubricItem[]): {
|
|
1943
2069
|
score: number;
|
|
1944
|
-
verdict:
|
|
2070
|
+
verdict: EvaluationVerdict;
|
|
1945
2071
|
hits: string[];
|
|
1946
2072
|
misses: string[];
|
|
1947
2073
|
};
|
|
@@ -2072,8 +2198,22 @@ type AssertionResult = {
|
|
|
2072
2198
|
};
|
|
2073
2199
|
/** Checks if `output` contains the given `value` substring. */
|
|
2074
2200
|
declare function runContainsAssertion(output: string, value: string): AssertionResult;
|
|
2075
|
-
/** Checks if `output`
|
|
2076
|
-
declare function
|
|
2201
|
+
/** Checks if `output` contains ANY of the given `values`. */
|
|
2202
|
+
declare function runContainsAnyAssertion(output: string, values: readonly string[]): AssertionResult;
|
|
2203
|
+
/** Checks if `output` contains ALL of the given `values`. */
|
|
2204
|
+
declare function runContainsAllAssertion(output: string, values: readonly string[]): AssertionResult;
|
|
2205
|
+
/** Case-insensitive check if `output` contains `value`. */
|
|
2206
|
+
declare function runIcontainsAssertion(output: string, value: string): AssertionResult;
|
|
2207
|
+
/** Case-insensitive check if `output` contains ANY of the given `values`. */
|
|
2208
|
+
declare function runIcontainsAnyAssertion(output: string, values: readonly string[]): AssertionResult;
|
|
2209
|
+
/** Case-insensitive check if `output` contains ALL of the given `values`. */
|
|
2210
|
+
declare function runIcontainsAllAssertion(output: string, values: readonly string[]): AssertionResult;
|
|
2211
|
+
/** Checks if `output` starts with `value` (both trimmed). */
|
|
2212
|
+
declare function runStartsWithAssertion(output: string, value: string): AssertionResult;
|
|
2213
|
+
/** Checks if `output` ends with `value` (both trimmed). */
|
|
2214
|
+
declare function runEndsWithAssertion(output: string, value: string): AssertionResult;
|
|
2215
|
+
/** Checks if `output` matches the given regex `pattern` with optional `flags`. */
|
|
2216
|
+
declare function runRegexAssertion(output: string, pattern: string, flags?: string): AssertionResult;
|
|
2077
2217
|
/** Checks if `output` is valid JSON. */
|
|
2078
2218
|
declare function runIsJsonAssertion(output: string): AssertionResult;
|
|
2079
2219
|
/** Checks if `output` exactly equals `value` (both trimmed). */
|
|
@@ -2150,6 +2290,28 @@ declare class DeterministicAssertionEvaluator implements Evaluator {
|
|
|
2150
2290
|
evaluate(context: EvaluationContext): EvaluationScore;
|
|
2151
2291
|
}
|
|
2152
2292
|
|
|
2293
|
+
declare class RepoManager {
|
|
2294
|
+
private readonly cacheDir;
|
|
2295
|
+
constructor(cacheDir?: string);
|
|
2296
|
+
/**
|
|
2297
|
+
* Ensure a bare mirror cache exists for the given source.
|
|
2298
|
+
* Creates on first access, fetches updates on subsequent calls.
|
|
2299
|
+
* Returns the absolute path to the cache directory.
|
|
2300
|
+
*/
|
|
2301
|
+
ensureCache(source: RepoSource): Promise<string>;
|
|
2302
|
+
/**
|
|
2303
|
+
* Clone a repo from cache into the workspace at the configured path.
|
|
2304
|
+
* Handles checkout, ref resolution, ancestor walking, shallow clone, sparse checkout.
|
|
2305
|
+
*/
|
|
2306
|
+
materialize(repo: RepoConfig, workspacePath: string): Promise<void>;
|
|
2307
|
+
/** Materialize all repos into the workspace. */
|
|
2308
|
+
materializeAll(repos: readonly RepoConfig[], workspacePath: string): Promise<void>;
|
|
2309
|
+
/** Reset repos in workspace to their checkout state. */
|
|
2310
|
+
reset(repos: readonly RepoConfig[], workspacePath: string, strategy: 'hard' | 'recreate'): Promise<void>;
|
|
2311
|
+
/** Remove the entire cache directory. */
|
|
2312
|
+
cleanCache(): Promise<void>;
|
|
2313
|
+
}
|
|
2314
|
+
|
|
2153
2315
|
type MaybePromise<T> = T | Promise<T>;
|
|
2154
2316
|
interface EvaluationCache {
|
|
2155
2317
|
get(key: string): MaybePromise<ProviderResponse | undefined>;
|
|
@@ -2189,6 +2351,8 @@ interface RunEvalCaseOptions {
|
|
|
2189
2351
|
readonly streamCallbacks?: ProviderStreamCallbacks;
|
|
2190
2352
|
/** Evaluator type registry (with custom assertions discovered) */
|
|
2191
2353
|
readonly typeRegistry?: EvaluatorRegistry;
|
|
2354
|
+
/** RepoManager instance for repo lifecycle (shared workspace mode) */
|
|
2355
|
+
readonly repoManager?: RepoManager;
|
|
2192
2356
|
}
|
|
2193
2357
|
interface ProgressEvent {
|
|
2194
2358
|
readonly workerId: number;
|
|
@@ -2918,4 +3082,4 @@ type AgentKernel = {
|
|
|
2918
3082
|
};
|
|
2919
3083
|
declare function createAgentKernel(): AgentKernel;
|
|
2920
3084
|
|
|
2921
|
-
export { AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, getWorkspacePath, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIsJsonAssertion, runRegexAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
|
|
3085
|
+
export { AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResetConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, getWorkspacePath, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
|
package/dist/index.d.ts
CHANGED
|
@@ -8,7 +8,7 @@ interface ChatMessage {
|
|
|
8
8
|
readonly name?: string;
|
|
9
9
|
}
|
|
10
10
|
type ChatPrompt = readonly ChatMessage[];
|
|
11
|
-
type ProviderKind = 'azure' | 'anthropic' | 'gemini' | 'codex' | 'copilot' | 'copilot-cli' | 'pi-coding-agent' | 'pi-agent-sdk' | 'claude' | 'cli' | 'mock' | 'vscode' | 'vscode-insiders';
|
|
11
|
+
type ProviderKind = 'azure' | 'anthropic' | 'gemini' | 'codex' | 'copilot-sdk' | 'copilot-cli' | 'pi-coding-agent' | 'pi-agent-sdk' | 'claude' | 'cli' | 'mock' | 'vscode' | 'vscode-insiders';
|
|
12
12
|
/** Callbacks for real-time observability during provider execution */
|
|
13
13
|
interface ProviderStreamCallbacks {
|
|
14
14
|
onToolCallStart?: (toolName: string, toolCallId?: string) => void;
|
|
@@ -453,7 +453,7 @@ declare function isJsonValue(value: unknown): value is JsonValue;
|
|
|
453
453
|
* - Either content (string or array of objects) OR tool_calls (for assistant messages)
|
|
454
454
|
*/
|
|
455
455
|
declare function isTestMessage(value: unknown): value is TestMessage;
|
|
456
|
-
declare const EVALUATOR_KIND_VALUES: readonly ["code_judge", "llm_judge", "rubric", "composite", "tool_trajectory", "field_accuracy", "latency", "cost", "token_usage", "execution_metrics", "agent_judge", "contains", "regex", "is_json", "equals", "rubrics"];
|
|
456
|
+
declare const EVALUATOR_KIND_VALUES: readonly ["code_judge", "llm_judge", "rubric", "composite", "tool_trajectory", "field_accuracy", "latency", "cost", "token_usage", "execution_metrics", "agent_judge", "contains", "contains_any", "contains_all", "icontains", "icontains_any", "icontains_all", "starts_with", "ends_with", "regex", "is_json", "equals", "rubrics"];
|
|
457
457
|
type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
|
|
458
458
|
declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
|
|
459
459
|
/**
|
|
@@ -491,10 +491,43 @@ type WorkspaceScriptConfig = {
|
|
|
491
491
|
* - before_each: runs before each test (optional)
|
|
492
492
|
* - after_each: runs after each test (e.g., reset git state)
|
|
493
493
|
*/
|
|
494
|
+
type RepoSource = {
|
|
495
|
+
readonly type: 'git';
|
|
496
|
+
readonly url: string;
|
|
497
|
+
} | {
|
|
498
|
+
readonly type: 'local';
|
|
499
|
+
readonly path: string;
|
|
500
|
+
};
|
|
501
|
+
type RepoCheckout = {
|
|
502
|
+
readonly ref?: string;
|
|
503
|
+
readonly resolve?: 'remote' | 'local';
|
|
504
|
+
readonly ancestor?: number;
|
|
505
|
+
};
|
|
506
|
+
type RepoClone = {
|
|
507
|
+
readonly depth?: number;
|
|
508
|
+
readonly filter?: string;
|
|
509
|
+
readonly sparse?: readonly string[];
|
|
510
|
+
};
|
|
511
|
+
type RepoConfig = {
|
|
512
|
+
readonly path: string;
|
|
513
|
+
readonly source: RepoSource;
|
|
514
|
+
readonly checkout?: RepoCheckout;
|
|
515
|
+
readonly clone?: RepoClone;
|
|
516
|
+
};
|
|
517
|
+
type ResetConfig = {
|
|
518
|
+
readonly strategy?: 'none' | 'hard' | 'recreate';
|
|
519
|
+
readonly after_each?: boolean;
|
|
520
|
+
};
|
|
494
521
|
type WorkspaceConfig = {
|
|
495
522
|
/** Template directory or .code-workspace file. Directories are copied to temp workspace.
|
|
496
523
|
* .code-workspace files are used by VS Code providers; CLI providers use the parent directory. */
|
|
497
524
|
readonly template?: string;
|
|
525
|
+
/** Isolation strategy for workspace: shared (default) or per_test */
|
|
526
|
+
readonly isolation?: 'shared' | 'per_test';
|
|
527
|
+
/** Repository definitions to clone/checkout into workspace */
|
|
528
|
+
readonly repos?: readonly RepoConfig[];
|
|
529
|
+
/** Reset configuration for repos between test runs */
|
|
530
|
+
readonly reset?: ResetConfig;
|
|
498
531
|
/** Command to run once before first test (after workspace creation, before git baseline) */
|
|
499
532
|
readonly before_all?: WorkspaceScriptConfig;
|
|
500
533
|
/** Command to run once after last test (before workspace cleanup) */
|
|
@@ -777,6 +810,97 @@ type ContainsEvaluatorConfig = {
|
|
|
777
810
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
778
811
|
readonly negate?: boolean;
|
|
779
812
|
};
|
|
813
|
+
/**
|
|
814
|
+
* Configuration for the contains_any assertion evaluator.
|
|
815
|
+
* Checks whether the candidate output contains ANY of the specified substrings.
|
|
816
|
+
*/
|
|
817
|
+
type ContainsAnyEvaluatorConfig = {
|
|
818
|
+
readonly name: string;
|
|
819
|
+
readonly type: 'contains_any';
|
|
820
|
+
readonly value: readonly string[];
|
|
821
|
+
readonly weight?: number;
|
|
822
|
+
readonly required?: boolean | number;
|
|
823
|
+
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
824
|
+
readonly negate?: boolean;
|
|
825
|
+
};
|
|
826
|
+
/**
|
|
827
|
+
* Configuration for the contains_all assertion evaluator.
|
|
828
|
+
* Checks whether the candidate output contains ALL of the specified substrings.
|
|
829
|
+
*/
|
|
830
|
+
type ContainsAllEvaluatorConfig = {
|
|
831
|
+
readonly name: string;
|
|
832
|
+
readonly type: 'contains_all';
|
|
833
|
+
readonly value: readonly string[];
|
|
834
|
+
readonly weight?: number;
|
|
835
|
+
readonly required?: boolean | number;
|
|
836
|
+
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
837
|
+
readonly negate?: boolean;
|
|
838
|
+
};
|
|
839
|
+
/**
|
|
840
|
+
* Configuration for the icontains assertion evaluator.
|
|
841
|
+
* Case-insensitive check whether the candidate output contains a specified substring.
|
|
842
|
+
*/
|
|
843
|
+
type IcontainsEvaluatorConfig = {
|
|
844
|
+
readonly name: string;
|
|
845
|
+
readonly type: 'icontains';
|
|
846
|
+
readonly value: string;
|
|
847
|
+
readonly weight?: number;
|
|
848
|
+
readonly required?: boolean | number;
|
|
849
|
+
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
850
|
+
readonly negate?: boolean;
|
|
851
|
+
};
|
|
852
|
+
/**
|
|
853
|
+
* Configuration for the icontains_any assertion evaluator.
|
|
854
|
+
* Case-insensitive check whether the candidate output contains ANY of the specified substrings.
|
|
855
|
+
*/
|
|
856
|
+
type IcontainsAnyEvaluatorConfig = {
|
|
857
|
+
readonly name: string;
|
|
858
|
+
readonly type: 'icontains_any';
|
|
859
|
+
readonly value: readonly string[];
|
|
860
|
+
readonly weight?: number;
|
|
861
|
+
readonly required?: boolean | number;
|
|
862
|
+
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
863
|
+
readonly negate?: boolean;
|
|
864
|
+
};
|
|
865
|
+
/**
|
|
866
|
+
* Configuration for the icontains_all assertion evaluator.
|
|
867
|
+
* Case-insensitive check whether the candidate output contains ALL of the specified substrings.
|
|
868
|
+
*/
|
|
869
|
+
type IcontainsAllEvaluatorConfig = {
|
|
870
|
+
readonly name: string;
|
|
871
|
+
readonly type: 'icontains_all';
|
|
872
|
+
readonly value: readonly string[];
|
|
873
|
+
readonly weight?: number;
|
|
874
|
+
readonly required?: boolean | number;
|
|
875
|
+
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
876
|
+
readonly negate?: boolean;
|
|
877
|
+
};
|
|
878
|
+
/**
|
|
879
|
+
* Configuration for the starts_with assertion evaluator.
|
|
880
|
+
* Checks whether the candidate output starts with a specified string (both trimmed).
|
|
881
|
+
*/
|
|
882
|
+
type StartsWithEvaluatorConfig = {
|
|
883
|
+
readonly name: string;
|
|
884
|
+
readonly type: 'starts_with';
|
|
885
|
+
readonly value: string;
|
|
886
|
+
readonly weight?: number;
|
|
887
|
+
readonly required?: boolean | number;
|
|
888
|
+
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
889
|
+
readonly negate?: boolean;
|
|
890
|
+
};
|
|
891
|
+
/**
|
|
892
|
+
* Configuration for the ends_with assertion evaluator.
|
|
893
|
+
* Checks whether the candidate output ends with a specified string (both trimmed).
|
|
894
|
+
*/
|
|
895
|
+
type EndsWithEvaluatorConfig = {
|
|
896
|
+
readonly name: string;
|
|
897
|
+
readonly type: 'ends_with';
|
|
898
|
+
readonly value: string;
|
|
899
|
+
readonly weight?: number;
|
|
900
|
+
readonly required?: boolean | number;
|
|
901
|
+
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
902
|
+
readonly negate?: boolean;
|
|
903
|
+
};
|
|
780
904
|
/**
|
|
781
905
|
* Configuration for the regex assertion evaluator.
|
|
782
906
|
* Checks whether the candidate output matches a regular expression pattern.
|
|
@@ -785,6 +909,8 @@ type RegexEvaluatorConfig = {
|
|
|
785
909
|
readonly name: string;
|
|
786
910
|
readonly type: 'regex';
|
|
787
911
|
readonly value: string;
|
|
912
|
+
/** Optional regex flags (e.g., "i" for case-insensitive, "m" for multiline) */
|
|
913
|
+
readonly flags?: string;
|
|
788
914
|
readonly weight?: number;
|
|
789
915
|
readonly required?: boolean | number;
|
|
790
916
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
@@ -828,7 +954,7 @@ type RubricsEvaluatorConfig = {
|
|
|
828
954
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
829
955
|
readonly negate?: boolean;
|
|
830
956
|
};
|
|
831
|
-
type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig | FieldAccuracyEvaluatorConfig | LatencyEvaluatorConfig | CostEvaluatorConfig | TokenUsageEvaluatorConfig | ExecutionMetricsEvaluatorConfig | AgentJudgeEvaluatorConfig | ContainsEvaluatorConfig | RegexEvaluatorConfig | IsJsonEvaluatorConfig | EqualsEvaluatorConfig | RubricsEvaluatorConfig;
|
|
957
|
+
type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig | FieldAccuracyEvaluatorConfig | LatencyEvaluatorConfig | CostEvaluatorConfig | TokenUsageEvaluatorConfig | ExecutionMetricsEvaluatorConfig | AgentJudgeEvaluatorConfig | ContainsEvaluatorConfig | ContainsAnyEvaluatorConfig | ContainsAllEvaluatorConfig | IcontainsEvaluatorConfig | IcontainsAnyEvaluatorConfig | IcontainsAllEvaluatorConfig | StartsWithEvaluatorConfig | EndsWithEvaluatorConfig | RegexEvaluatorConfig | IsJsonEvaluatorConfig | EqualsEvaluatorConfig | RubricsEvaluatorConfig;
|
|
832
958
|
/**
|
|
833
959
|
* Eval test definition sourced from AgentV specs.
|
|
834
960
|
*/
|
|
@@ -968,7 +1094,7 @@ interface EvaluationResult {
|
|
|
968
1094
|
/** Whether the evaluation was skipped due to suite-level budget exhaustion */
|
|
969
1095
|
readonly budgetExceeded?: boolean;
|
|
970
1096
|
}
|
|
971
|
-
type EvaluationVerdict = 'pass' | 'fail' | 'borderline';
|
|
1097
|
+
type EvaluationVerdict = 'pass' | 'fail' | 'borderline' | 'skip';
|
|
972
1098
|
interface EvaluatorResult {
|
|
973
1099
|
readonly name: string;
|
|
974
1100
|
readonly type: EvaluatorKind;
|
|
@@ -1427,7 +1553,7 @@ type ResolvedTarget = {
|
|
|
1427
1553
|
readonly providerBatching?: boolean;
|
|
1428
1554
|
readonly config: CodexResolvedConfig;
|
|
1429
1555
|
} | {
|
|
1430
|
-
readonly kind: 'copilot';
|
|
1556
|
+
readonly kind: 'copilot-sdk';
|
|
1431
1557
|
readonly name: string;
|
|
1432
1558
|
readonly judgeTarget?: string;
|
|
1433
1559
|
readonly workers?: number;
|
|
@@ -1941,7 +2067,7 @@ declare function buildRubricOutputSchema(): string;
|
|
|
1941
2067
|
declare function substituteVariables(template: string, variables: Record<string, string>): string;
|
|
1942
2068
|
declare function calculateRubricScore(result: z.infer<typeof rubricEvaluationSchema>, rubrics: readonly RubricItem[]): {
|
|
1943
2069
|
score: number;
|
|
1944
|
-
verdict:
|
|
2070
|
+
verdict: EvaluationVerdict;
|
|
1945
2071
|
hits: string[];
|
|
1946
2072
|
misses: string[];
|
|
1947
2073
|
};
|
|
@@ -2072,8 +2198,22 @@ type AssertionResult = {
|
|
|
2072
2198
|
};
|
|
2073
2199
|
/** Checks if `output` contains the given `value` substring. */
|
|
2074
2200
|
declare function runContainsAssertion(output: string, value: string): AssertionResult;
|
|
2075
|
-
/** Checks if `output`
|
|
2076
|
-
declare function
|
|
2201
|
+
/** Checks if `output` contains ANY of the given `values`. */
|
|
2202
|
+
declare function runContainsAnyAssertion(output: string, values: readonly string[]): AssertionResult;
|
|
2203
|
+
/** Checks if `output` contains ALL of the given `values`. */
|
|
2204
|
+
declare function runContainsAllAssertion(output: string, values: readonly string[]): AssertionResult;
|
|
2205
|
+
/** Case-insensitive check if `output` contains `value`. */
|
|
2206
|
+
declare function runIcontainsAssertion(output: string, value: string): AssertionResult;
|
|
2207
|
+
/** Case-insensitive check if `output` contains ANY of the given `values`. */
|
|
2208
|
+
declare function runIcontainsAnyAssertion(output: string, values: readonly string[]): AssertionResult;
|
|
2209
|
+
/** Case-insensitive check if `output` contains ALL of the given `values`. */
|
|
2210
|
+
declare function runIcontainsAllAssertion(output: string, values: readonly string[]): AssertionResult;
|
|
2211
|
+
/** Checks if `output` starts with `value` (both trimmed). */
|
|
2212
|
+
declare function runStartsWithAssertion(output: string, value: string): AssertionResult;
|
|
2213
|
+
/** Checks if `output` ends with `value` (both trimmed). */
|
|
2214
|
+
declare function runEndsWithAssertion(output: string, value: string): AssertionResult;
|
|
2215
|
+
/** Checks if `output` matches the given regex `pattern` with optional `flags`. */
|
|
2216
|
+
declare function runRegexAssertion(output: string, pattern: string, flags?: string): AssertionResult;
|
|
2077
2217
|
/** Checks if `output` is valid JSON. */
|
|
2078
2218
|
declare function runIsJsonAssertion(output: string): AssertionResult;
|
|
2079
2219
|
/** Checks if `output` exactly equals `value` (both trimmed). */
|
|
@@ -2150,6 +2290,28 @@ declare class DeterministicAssertionEvaluator implements Evaluator {
|
|
|
2150
2290
|
evaluate(context: EvaluationContext): EvaluationScore;
|
|
2151
2291
|
}
|
|
2152
2292
|
|
|
2293
|
+
declare class RepoManager {
|
|
2294
|
+
private readonly cacheDir;
|
|
2295
|
+
constructor(cacheDir?: string);
|
|
2296
|
+
/**
|
|
2297
|
+
* Ensure a bare mirror cache exists for the given source.
|
|
2298
|
+
* Creates on first access, fetches updates on subsequent calls.
|
|
2299
|
+
* Returns the absolute path to the cache directory.
|
|
2300
|
+
*/
|
|
2301
|
+
ensureCache(source: RepoSource): Promise<string>;
|
|
2302
|
+
/**
|
|
2303
|
+
* Clone a repo from cache into the workspace at the configured path.
|
|
2304
|
+
* Handles checkout, ref resolution, ancestor walking, shallow clone, sparse checkout.
|
|
2305
|
+
*/
|
|
2306
|
+
materialize(repo: RepoConfig, workspacePath: string): Promise<void>;
|
|
2307
|
+
/** Materialize all repos into the workspace. */
|
|
2308
|
+
materializeAll(repos: readonly RepoConfig[], workspacePath: string): Promise<void>;
|
|
2309
|
+
/** Reset repos in workspace to their checkout state. */
|
|
2310
|
+
reset(repos: readonly RepoConfig[], workspacePath: string, strategy: 'hard' | 'recreate'): Promise<void>;
|
|
2311
|
+
/** Remove the entire cache directory. */
|
|
2312
|
+
cleanCache(): Promise<void>;
|
|
2313
|
+
}
|
|
2314
|
+
|
|
2153
2315
|
type MaybePromise<T> = T | Promise<T>;
|
|
2154
2316
|
interface EvaluationCache {
|
|
2155
2317
|
get(key: string): MaybePromise<ProviderResponse | undefined>;
|
|
@@ -2189,6 +2351,8 @@ interface RunEvalCaseOptions {
|
|
|
2189
2351
|
readonly streamCallbacks?: ProviderStreamCallbacks;
|
|
2190
2352
|
/** Evaluator type registry (with custom assertions discovered) */
|
|
2191
2353
|
readonly typeRegistry?: EvaluatorRegistry;
|
|
2354
|
+
/** RepoManager instance for repo lifecycle (shared workspace mode) */
|
|
2355
|
+
readonly repoManager?: RepoManager;
|
|
2192
2356
|
}
|
|
2193
2357
|
interface ProgressEvent {
|
|
2194
2358
|
readonly workerId: number;
|
|
@@ -2918,4 +3082,4 @@ type AgentKernel = {
|
|
|
2918
3082
|
};
|
|
2919
3083
|
declare function createAgentKernel(): AgentKernel;
|
|
2920
3084
|
|
|
2921
|
-
export { AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, getWorkspacePath, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIsJsonAssertion, runRegexAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
|
|
3085
|
+
export { AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResetConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, getWorkspacePath, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
|