@agentv/core 4.9.1 → 4.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/chunk-3WGHC7LC.js +149 -0
- package/dist/chunk-3WGHC7LC.js.map +1 -0
- package/dist/{chunk-VCVVKCC4.js → chunk-5POFMJJ7.js} +1 -1
- package/dist/chunk-5POFMJJ7.js.map +1 -0
- package/dist/chunk-SDIANPEY.js +181 -0
- package/dist/chunk-SDIANPEY.js.map +1 -0
- package/dist/docker-workspace-RPPXBT27.js +9 -0
- package/dist/docker-workspace-RPPXBT27.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +70 -3
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +71 -4
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/exec-AR6JUUN5.js +9 -0
- package/dist/exec-AR6JUUN5.js.map +1 -0
- package/dist/index.cjs +1932 -858
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +210 -9
- package/dist/index.d.ts +210 -9
- package/dist/index.js +1366 -651
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-VCVVKCC4.js.map +0 -1
package/dist/index.d.cts
CHANGED
|
@@ -571,6 +571,8 @@ type RepoSource = {
|
|
|
571
571
|
};
|
|
572
572
|
type RepoCheckout = {
|
|
573
573
|
readonly ref?: string;
|
|
574
|
+
/** SWE-bench-friendly alias for ref when pinning a dataset snapshot commit */
|
|
575
|
+
readonly base_commit?: string;
|
|
574
576
|
readonly resolve?: 'remote' | 'local';
|
|
575
577
|
readonly ancestor?: number;
|
|
576
578
|
};
|
|
@@ -580,8 +582,10 @@ type RepoClone = {
|
|
|
580
582
|
readonly sparse?: readonly string[];
|
|
581
583
|
};
|
|
582
584
|
type RepoConfig = {
|
|
583
|
-
readonly path: string;
|
|
584
|
-
readonly source: RepoSource;
|
|
585
|
+
/** Target path inside the workspace. Optional for Docker repos targeting the container's working directory. */
|
|
586
|
+
readonly path?: string;
|
|
587
|
+
/** Clone source. Optional for Docker prebuilt images where repos exist inside the container. */
|
|
588
|
+
readonly source?: RepoSource;
|
|
585
589
|
readonly checkout?: RepoCheckout;
|
|
586
590
|
readonly clone?: RepoClone;
|
|
587
591
|
};
|
|
@@ -610,6 +614,21 @@ type WorkspaceHooksConfig = {
|
|
|
610
614
|
/** Runs once after final test in the workspace lifecycle */
|
|
611
615
|
readonly after_all?: WorkspaceHookConfig;
|
|
612
616
|
};
|
|
617
|
+
/**
|
|
618
|
+
* Docker-based workspace configuration.
|
|
619
|
+
* When present, code-grader commands run inside a Docker container
|
|
620
|
+
* instead of on the host.
|
|
621
|
+
*/
|
|
622
|
+
type DockerWorkspaceConfig = {
|
|
623
|
+
/** Docker image to use (e.g. 'swebench/sweb.eval.x86_64.django__django-15180') */
|
|
624
|
+
readonly image: string;
|
|
625
|
+
/** Container execution timeout in seconds (default: 1800) */
|
|
626
|
+
readonly timeout?: number;
|
|
627
|
+
/** Memory limit (e.g. '4g', '512m') */
|
|
628
|
+
readonly memory?: string;
|
|
629
|
+
/** CPU limit (e.g. 2, 0.5) */
|
|
630
|
+
readonly cpus?: number;
|
|
631
|
+
};
|
|
613
632
|
type WorkspaceConfig = {
|
|
614
633
|
/** Template directory or .code-workspace file. Directories are copied to temp workspace.
|
|
615
634
|
* .code-workspace files are used by VS Code providers; CLI providers use the parent directory. */
|
|
@@ -624,6 +643,8 @@ type WorkspaceConfig = {
|
|
|
624
643
|
readonly mode?: 'pooled' | 'temp' | 'static';
|
|
625
644
|
/** Required when mode=static: use this existing directory directly */
|
|
626
645
|
readonly path?: string;
|
|
646
|
+
/** Docker-based workspace: run grader commands inside a container */
|
|
647
|
+
readonly docker?: DockerWorkspaceConfig;
|
|
627
648
|
};
|
|
628
649
|
type CodeEvaluatorConfig = {
|
|
629
650
|
readonly name: string;
|
|
@@ -644,6 +665,8 @@ type CodeEvaluatorConfig = {
|
|
|
644
665
|
readonly config?: JsonObject;
|
|
645
666
|
/** When present, enables target access via local proxy */
|
|
646
667
|
readonly target?: TargetAccessConfig;
|
|
668
|
+
/** Optional content preprocessors inherited from suite/evaluator config */
|
|
669
|
+
readonly preprocessors?: readonly ContentPreprocessorConfig[];
|
|
647
670
|
};
|
|
648
671
|
/**
|
|
649
672
|
* Executable prompt template configuration.
|
|
@@ -657,6 +680,14 @@ type PromptScriptConfig = {
|
|
|
657
680
|
/** Pass-through configuration for the prompt template */
|
|
658
681
|
readonly config?: Record<string, unknown>;
|
|
659
682
|
};
|
|
683
|
+
type ContentPreprocessorConfig = {
|
|
684
|
+
/** MIME type or short alias such as "xlsx" or "html" */
|
|
685
|
+
readonly type: string;
|
|
686
|
+
/** Command array to execute (stdin JSON payload -> stdout text) */
|
|
687
|
+
readonly command: readonly string[];
|
|
688
|
+
/** Resolved absolute path for the command script (last argv element) */
|
|
689
|
+
readonly resolvedCommand?: readonly string[];
|
|
690
|
+
};
|
|
660
691
|
type LlmGraderEvaluatorConfig = {
|
|
661
692
|
readonly name: string;
|
|
662
693
|
readonly type: 'llm-grader';
|
|
@@ -682,6 +713,8 @@ type LlmGraderEvaluatorConfig = {
|
|
|
682
713
|
readonly max_steps?: number;
|
|
683
714
|
/** Temperature override for grader calls */
|
|
684
715
|
readonly temperature?: number;
|
|
716
|
+
/** Optional content preprocessors for ContentFile blocks in assistant output */
|
|
717
|
+
readonly preprocessors?: readonly ContentPreprocessorConfig[];
|
|
685
718
|
};
|
|
686
719
|
/** @deprecated Use `LlmGraderEvaluatorConfig` instead */
|
|
687
720
|
type LlmJudgeEvaluatorConfig = LlmGraderEvaluatorConfig;
|
|
@@ -1115,6 +1148,8 @@ interface EvalTest {
|
|
|
1115
1148
|
readonly criteria: string;
|
|
1116
1149
|
readonly evaluator?: EvaluatorKind;
|
|
1117
1150
|
readonly assertions?: readonly EvaluatorConfig[];
|
|
1151
|
+
/** Suite-level preprocessors used by the implicit default llm-grader. */
|
|
1152
|
+
readonly preprocessors?: readonly ContentPreprocessorConfig[];
|
|
1118
1153
|
/** Workspace configuration (merged from suite-level and case-level) */
|
|
1119
1154
|
readonly workspace?: WorkspaceConfig;
|
|
1120
1155
|
/** Arbitrary metadata passed to workspace scripts via stdin */
|
|
@@ -1358,10 +1393,19 @@ type ExecutionDefaults = {
|
|
|
1358
1393
|
readonly pool_workspaces?: boolean;
|
|
1359
1394
|
readonly pool_slots?: number;
|
|
1360
1395
|
};
|
|
1396
|
+
type ResultsExportConfig = {
|
|
1397
|
+
readonly repo: string;
|
|
1398
|
+
readonly path: string;
|
|
1399
|
+
readonly auto_push?: boolean;
|
|
1400
|
+
readonly branch_prefix?: string;
|
|
1401
|
+
};
|
|
1361
1402
|
type AgentVConfig$1 = {
|
|
1362
1403
|
readonly required_version?: string;
|
|
1363
1404
|
readonly eval_patterns?: readonly string[];
|
|
1364
1405
|
readonly execution?: ExecutionDefaults;
|
|
1406
|
+
readonly results?: {
|
|
1407
|
+
readonly export?: ResultsExportConfig;
|
|
1408
|
+
};
|
|
1365
1409
|
};
|
|
1366
1410
|
/**
|
|
1367
1411
|
* Load optional .agentv/config.yaml configuration file.
|
|
@@ -2199,6 +2243,8 @@ interface EvaluationContext {
|
|
|
2199
2243
|
readonly fileChanges?: string;
|
|
2200
2244
|
/** Absolute path to the workspace directory (when workspace_template is configured) */
|
|
2201
2245
|
readonly workspacePath?: string;
|
|
2246
|
+
/** Docker workspace config: when present, code-grader commands run inside a container */
|
|
2247
|
+
readonly dockerConfig?: DockerWorkspaceConfig;
|
|
2202
2248
|
}
|
|
2203
2249
|
interface EvaluationScore {
|
|
2204
2250
|
readonly score: number;
|
|
@@ -2492,6 +2538,7 @@ declare class LlmGraderEvaluator implements Evaluator {
|
|
|
2492
2538
|
private readonly graderTargetProvider?;
|
|
2493
2539
|
constructor(options: LlmGraderEvaluatorOptions);
|
|
2494
2540
|
evaluate(context: EvaluationContext): Promise<EvaluationScore>;
|
|
2541
|
+
private prepareContext;
|
|
2495
2542
|
private evaluateFreeform;
|
|
2496
2543
|
private evaluateWithRubrics;
|
|
2497
2544
|
/**
|
|
@@ -2798,9 +2845,9 @@ declare class RepoManager {
|
|
|
2798
2845
|
* Handles checkout, ref resolution, ancestor walking, shallow clone, sparse checkout.
|
|
2799
2846
|
*/
|
|
2800
2847
|
materialize(repo: RepoConfig, workspacePath: string): Promise<void>;
|
|
2801
|
-
/** Materialize all repos into the workspace. */
|
|
2848
|
+
/** Materialize all repos into the workspace. Skips repos without source (Docker-only repos). */
|
|
2802
2849
|
materializeAll(repos: readonly RepoConfig[], workspacePath: string): Promise<void>;
|
|
2803
|
-
/** Reset repos in workspace to their checkout state. */
|
|
2850
|
+
/** Reset repos in workspace to their checkout state. Skips repos without path or source. */
|
|
2804
2851
|
reset(repos: readonly RepoConfig[], workspacePath: string, reset: 'fast' | 'strict'): Promise<void>;
|
|
2805
2852
|
}
|
|
2806
2853
|
|
|
@@ -3232,10 +3279,10 @@ declare const AgentVConfigSchema: z.ZodObject<{
|
|
|
3232
3279
|
dir: z.ZodOptional<z.ZodString>;
|
|
3233
3280
|
}, "strip", z.ZodTypeAny, {
|
|
3234
3281
|
dir?: string | undefined;
|
|
3235
|
-
format?: "
|
|
3282
|
+
format?: "json" | "xml" | "yaml" | "jsonl" | undefined;
|
|
3236
3283
|
}, {
|
|
3237
3284
|
dir?: string | undefined;
|
|
3238
|
-
format?: "
|
|
3285
|
+
format?: "json" | "xml" | "yaml" | "jsonl" | undefined;
|
|
3239
3286
|
}>>;
|
|
3240
3287
|
/** Response caching */
|
|
3241
3288
|
cache: z.ZodOptional<z.ZodObject<{
|
|
@@ -3278,7 +3325,7 @@ declare const AgentVConfigSchema: z.ZodObject<{
|
|
|
3278
3325
|
} | undefined;
|
|
3279
3326
|
output?: {
|
|
3280
3327
|
dir?: string | undefined;
|
|
3281
|
-
format?: "
|
|
3328
|
+
format?: "json" | "xml" | "yaml" | "jsonl" | undefined;
|
|
3282
3329
|
} | undefined;
|
|
3283
3330
|
limits?: {
|
|
3284
3331
|
maxDurationMs?: number | undefined;
|
|
@@ -3299,7 +3346,7 @@ declare const AgentVConfigSchema: z.ZodObject<{
|
|
|
3299
3346
|
} | undefined;
|
|
3300
3347
|
output?: {
|
|
3301
3348
|
dir?: string | undefined;
|
|
3302
|
-
format?: "
|
|
3349
|
+
format?: "json" | "xml" | "yaml" | "jsonl" | undefined;
|
|
3303
3350
|
} | undefined;
|
|
3304
3351
|
limits?: {
|
|
3305
3352
|
maxDurationMs?: number | undefined;
|
|
@@ -3591,6 +3638,106 @@ interface DepsScanResult {
|
|
|
3591
3638
|
*/
|
|
3592
3639
|
declare function scanRepoDeps(evalFilePaths: readonly string[]): Promise<DepsScanResult>;
|
|
3593
3640
|
|
|
3641
|
+
interface RepoCheckoutTarget {
|
|
3642
|
+
readonly path?: string;
|
|
3643
|
+
readonly ref: string;
|
|
3644
|
+
}
|
|
3645
|
+
|
|
3646
|
+
/**
|
|
3647
|
+
* Docker workspace provider — manages Docker container lifecycle for eval grading.
|
|
3648
|
+
*
|
|
3649
|
+
* Flow: pull image → create container → copy files in → exec grader → parse output → destroy container.
|
|
3650
|
+
* All Docker commands use `execFile` (no shell) for security.
|
|
3651
|
+
*
|
|
3652
|
+
* To add a new Docker command: add a method that calls `this.exec(...)` with the appropriate argv.
|
|
3653
|
+
*
|
|
3654
|
+
* Design decisions:
|
|
3655
|
+
* - CommandExecutor interface for testability (mock `execFile` in tests)
|
|
3656
|
+
* - Always `docker rm -f` in cleanup, even on errors (try/finally)
|
|
3657
|
+
* - Lazy-loaded: non-Docker evals never import this module
|
|
3658
|
+
*/
|
|
3659
|
+
|
|
3660
|
+
/** Result of a command execution */
|
|
3661
|
+
interface ExecResult {
|
|
3662
|
+
readonly stdout: string;
|
|
3663
|
+
readonly stderr: string;
|
|
3664
|
+
readonly exitCode: number;
|
|
3665
|
+
}
|
|
3666
|
+
/** Abstraction over process execution for testability */
|
|
3667
|
+
interface CommandExecutor {
|
|
3668
|
+
exec(argv: readonly string[], options?: {
|
|
3669
|
+
timeoutMs?: number;
|
|
3670
|
+
stdin?: string;
|
|
3671
|
+
}): Promise<ExecResult>;
|
|
3672
|
+
}
|
|
3673
|
+
/** Options for creating a Docker container */
|
|
3674
|
+
interface CreateContainerOptions {
|
|
3675
|
+
readonly image: string;
|
|
3676
|
+
readonly memory?: string;
|
|
3677
|
+
readonly cpus?: number;
|
|
3678
|
+
}
|
|
3679
|
+
/** Options for executing a command inside a container */
|
|
3680
|
+
interface ExecInContainerOptions {
|
|
3681
|
+
readonly containerId: string;
|
|
3682
|
+
readonly command: readonly string[];
|
|
3683
|
+
readonly timeoutMs?: number;
|
|
3684
|
+
readonly stdin?: string;
|
|
3685
|
+
}
|
|
3686
|
+
/**
|
|
3687
|
+
* Manages Docker container lifecycle for workspace-based evaluations.
|
|
3688
|
+
*
|
|
3689
|
+
* Usage:
|
|
3690
|
+
* const docker = new DockerWorkspaceProvider(config);
|
|
3691
|
+
* await docker.pullImage();
|
|
3692
|
+
* const containerId = await docker.createContainer();
|
|
3693
|
+
* try {
|
|
3694
|
+
* await docker.copyToContainer(containerId, localPath, containerPath);
|
|
3695
|
+
* const output = await docker.execInContainer({ containerId, command: [...] });
|
|
3696
|
+
* // parse output...
|
|
3697
|
+
* } finally {
|
|
3698
|
+
* await docker.removeContainer(containerId);
|
|
3699
|
+
* }
|
|
3700
|
+
*/
|
|
3701
|
+
declare class DockerWorkspaceProvider {
|
|
3702
|
+
private readonly config;
|
|
3703
|
+
private readonly executor;
|
|
3704
|
+
private readonly timeoutMs;
|
|
3705
|
+
constructor(config: DockerWorkspaceConfig, executor?: CommandExecutor);
|
|
3706
|
+
/** Check whether the Docker CLI is available on the host. */
|
|
3707
|
+
isDockerAvailable(): Promise<boolean>;
|
|
3708
|
+
/** Pull the configured Docker image. No-op if already cached locally. */
|
|
3709
|
+
pullImage(): Promise<void>;
|
|
3710
|
+
/** Create a stopped container from the configured image with resource limits. Returns container ID. */
|
|
3711
|
+
createContainer(): Promise<string>;
|
|
3712
|
+
/** Start a previously created container. */
|
|
3713
|
+
startContainer(containerId: string): Promise<void>;
|
|
3714
|
+
/**
|
|
3715
|
+
* Reset the container checkout to the specified target refs, if any.
|
|
3716
|
+
* This is used for SWE-bench images where the repo state must match the
|
|
3717
|
+
* dataset's base snapshot before grading begins.
|
|
3718
|
+
*/
|
|
3719
|
+
resetContainerCheckout(containerId: string, repoCheckouts?: readonly RepoCheckoutTarget[]): Promise<void>;
|
|
3720
|
+
/** Copy a local file or directory into a running container. */
|
|
3721
|
+
copyToContainer(containerId: string, localPath: string, containerPath: string): Promise<void>;
|
|
3722
|
+
/**
|
|
3723
|
+
* Execute a command inside a running container.
|
|
3724
|
+
* If stdin is provided, it is piped via `docker exec -i`.
|
|
3725
|
+
*/
|
|
3726
|
+
execInContainer(options: ExecInContainerOptions): Promise<ExecResult>;
|
|
3727
|
+
/** Force-remove a container (always succeeds, even if container doesn't exist). */
|
|
3728
|
+
removeContainer(containerId: string): Promise<void>;
|
|
3729
|
+
/** Full lifecycle: create → start → exec → cleanup. Convenience for single-command grading. */
|
|
3730
|
+
runGraderInContainer(options: {
|
|
3731
|
+
readonly command: readonly string[];
|
|
3732
|
+
readonly stdin?: string;
|
|
3733
|
+
readonly copyFiles?: ReadonlyArray<{
|
|
3734
|
+
localPath: string;
|
|
3735
|
+
containerPath: string;
|
|
3736
|
+
}>;
|
|
3737
|
+
readonly repoCheckouts?: readonly RepoCheckoutTarget[];
|
|
3738
|
+
}): Promise<ExecResult>;
|
|
3739
|
+
}
|
|
3740
|
+
|
|
3594
3741
|
/**
|
|
3595
3742
|
* File-based LLM response cache.
|
|
3596
3743
|
* Stores provider responses as JSON files keyed by SHA-256 hash.
|
|
@@ -3647,6 +3794,60 @@ declare function toSnakeCaseDeep(obj: unknown): unknown;
|
|
|
3647
3794
|
*/
|
|
3648
3795
|
declare function toCamelCaseDeep(obj: unknown): unknown;
|
|
3649
3796
|
|
|
3797
|
+
interface ResultsRepoCachePaths {
|
|
3798
|
+
readonly rootDir: string;
|
|
3799
|
+
readonly repoDir: string;
|
|
3800
|
+
readonly statusFile: string;
|
|
3801
|
+
}
|
|
3802
|
+
interface ResultsRepoStatus {
|
|
3803
|
+
readonly configured: boolean;
|
|
3804
|
+
readonly available: boolean;
|
|
3805
|
+
readonly repo?: string;
|
|
3806
|
+
readonly path?: string;
|
|
3807
|
+
readonly auto_push?: boolean;
|
|
3808
|
+
readonly branch_prefix?: string;
|
|
3809
|
+
readonly cache_dir?: string;
|
|
3810
|
+
readonly last_synced_at?: string;
|
|
3811
|
+
readonly last_error?: string;
|
|
3812
|
+
}
|
|
3813
|
+
interface CheckedOutResultsRepoBranch {
|
|
3814
|
+
readonly branchName: string;
|
|
3815
|
+
readonly baseBranch: string;
|
|
3816
|
+
readonly repoDir: string;
|
|
3817
|
+
}
|
|
3818
|
+
interface PreparedResultsRepoBranch extends CheckedOutResultsRepoBranch {
|
|
3819
|
+
readonly cleanup: () => Promise<void>;
|
|
3820
|
+
}
|
|
3821
|
+
declare function normalizeResultsExportConfig(config: ResultsExportConfig): Required<ResultsExportConfig>;
|
|
3822
|
+
declare function resolveResultsRepoUrl(repo: string): string;
|
|
3823
|
+
declare function getResultsRepoCachePaths(repo: string): ResultsRepoCachePaths;
|
|
3824
|
+
declare function ensureResultsRepoClone(config: ResultsExportConfig): Promise<string>;
|
|
3825
|
+
declare function getResultsRepoStatus(config?: ResultsExportConfig): ResultsRepoStatus;
|
|
3826
|
+
declare function syncResultsRepo(config: ResultsExportConfig): Promise<ResultsRepoStatus>;
|
|
3827
|
+
declare function checkoutResultsRepoBranch(config: ResultsExportConfig, branchName: string): Promise<CheckedOutResultsRepoBranch>;
|
|
3828
|
+
declare function prepareResultsRepoBranch(config: ResultsExportConfig, branchName: string): Promise<PreparedResultsRepoBranch>;
|
|
3829
|
+
declare function stageResultsArtifacts(params: {
|
|
3830
|
+
readonly repoDir: string;
|
|
3831
|
+
readonly sourceDir: string;
|
|
3832
|
+
readonly destinationDir: string;
|
|
3833
|
+
}): Promise<void>;
|
|
3834
|
+
declare function resolveResultsRepoRunsDir(config: ResultsExportConfig): string;
|
|
3835
|
+
declare function directorySizeBytes(targetPath: string): Promise<number>;
|
|
3836
|
+
declare function commitAndPushResultsBranch(params: {
|
|
3837
|
+
readonly repoDir: string;
|
|
3838
|
+
readonly branchName: string;
|
|
3839
|
+
readonly commitMessage: string;
|
|
3840
|
+
}): Promise<boolean>;
|
|
3841
|
+
declare function pushResultsRepoBranch(config: ResultsExportConfig, branchName: string, cwd?: string): Promise<void>;
|
|
3842
|
+
declare function createDraftResultsPr(params: {
|
|
3843
|
+
readonly repo: string;
|
|
3844
|
+
readonly repoDir: string;
|
|
3845
|
+
readonly baseBranch: string;
|
|
3846
|
+
readonly branchName: string;
|
|
3847
|
+
readonly title: string;
|
|
3848
|
+
readonly body: string;
|
|
3849
|
+
}): Promise<string>;
|
|
3850
|
+
|
|
3650
3851
|
declare function getAgentvHome(): string;
|
|
3651
3852
|
declare function getWorkspacesRoot(): string;
|
|
3652
3853
|
declare function getSubagentsRoot(): string;
|
|
@@ -4171,4 +4372,4 @@ type AgentKernel = {
|
|
|
4171
4372
|
};
|
|
4172
4373
|
declare function createAgentKernel(): AgentKernel;
|
|
4173
4374
|
|
|
4174
|
-
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, COMMON_TARGET_SETTINGS, type CacheConfig, type ChildEvaluatorResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CodexDiscoverOptions, type CodexSession, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type Content, type ContentFile, type ContentImage, type ContentText, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_CATEGORY, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_THRESHOLD, type DepsScanResult, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type 
EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type ProjectEntry, type ProjectRegistry, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, 
ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addProject, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveCategory, deriveProjectId, detectFormat, discoverAssertions, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverGraders as discoverJudges, 
discoverProjects, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getProject, getProjectsRegistryPath, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadProjectRegistry, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeProject, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveProjectRegistry, scanRepoDeps, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, 
toTranscriptJsonLine, tokensPerTool, touchProject, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
|
|
4375
|
+
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildEvaluatorResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_THRESHOLD, type DepsScanResult, DeterministicAssertionEvaluator, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type 
EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type PreparedResultsRepoBranch, type ProgressEvent, type ProjectEntry, type ProjectRegistry, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, 
ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addProject, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, 
consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveCategory, deriveProjectId, detectFormat, directorySizeBytes, discoverAssertions, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverGraders as discoverJudges, discoverProjects, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getProject, getProjectsRegistryPath, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadProjectRegistry, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseJsonFromText, parseJsonSafe, prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeProject, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, 
resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveProjectRegistry, scanRepoDeps, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLine, tokensPerTool, touchProject, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
|