@agentv/core 4.9.1 → 4.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -571,6 +571,8 @@ type RepoSource = {
571
571
  };
572
572
  type RepoCheckout = {
573
573
  readonly ref?: string;
574
+ /** SWE-bench-friendly alias for ref when pinning a dataset snapshot commit */
575
+ readonly base_commit?: string;
574
576
  readonly resolve?: 'remote' | 'local';
575
577
  readonly ancestor?: number;
576
578
  };
@@ -580,8 +582,10 @@ type RepoClone = {
580
582
  readonly sparse?: readonly string[];
581
583
  };
582
584
  type RepoConfig = {
583
- readonly path: string;
584
- readonly source: RepoSource;
585
+ /** Target path inside the workspace. Optional for Docker repos targeting the container's working directory. */
586
+ readonly path?: string;
587
+ /** Clone source. Optional for Docker prebuilt images where repos exist inside the container. */
588
+ readonly source?: RepoSource;
585
589
  readonly checkout?: RepoCheckout;
586
590
  readonly clone?: RepoClone;
587
591
  };
@@ -610,6 +614,21 @@ type WorkspaceHooksConfig = {
610
614
  /** Runs once after final test in the workspace lifecycle */
611
615
  readonly after_all?: WorkspaceHookConfig;
612
616
  };
617
+ /**
618
+ * Docker-based workspace configuration.
619
+ * When present, code-grader commands run inside a Docker container
620
+ * instead of on the host.
621
+ */
622
+ type DockerWorkspaceConfig = {
623
+ /** Docker image to use (e.g. 'swebench/sweb.eval.x86_64.django__django-15180') */
624
+ readonly image: string;
625
+ /** Container execution timeout in seconds (default: 1800) */
626
+ readonly timeout?: number;
627
+ /** Memory limit (e.g. '4g', '512m') */
628
+ readonly memory?: string;
629
+ /** CPU limit (e.g. 2, 0.5) */
630
+ readonly cpus?: number;
631
+ };
613
632
  type WorkspaceConfig = {
614
633
  /** Template directory or .code-workspace file. Directories are copied to temp workspace.
615
634
  * .code-workspace files are used by VS Code providers; CLI providers use the parent directory. */
@@ -624,6 +643,8 @@ type WorkspaceConfig = {
624
643
  readonly mode?: 'pooled' | 'temp' | 'static';
625
644
  /** Required when mode=static: use this existing directory directly */
626
645
  readonly path?: string;
646
+ /** Docker-based workspace: run grader commands inside a container */
647
+ readonly docker?: DockerWorkspaceConfig;
627
648
  };
628
649
  type CodeEvaluatorConfig = {
629
650
  readonly name: string;
@@ -644,6 +665,8 @@ type CodeEvaluatorConfig = {
644
665
  readonly config?: JsonObject;
645
666
  /** When present, enables target access via local proxy */
646
667
  readonly target?: TargetAccessConfig;
668
+ /** Optional content preprocessors inherited from suite/evaluator config */
669
+ readonly preprocessors?: readonly ContentPreprocessorConfig[];
647
670
  };
648
671
  /**
649
672
  * Executable prompt template configuration.
@@ -657,6 +680,14 @@ type PromptScriptConfig = {
657
680
  /** Pass-through configuration for the prompt template */
658
681
  readonly config?: Record<string, unknown>;
659
682
  };
683
+ type ContentPreprocessorConfig = {
684
+ /** MIME type or short alias such as "xlsx" or "html" */
685
+ readonly type: string;
686
+ /** Command array to execute (stdin JSON payload -> stdout text) */
687
+ readonly command: readonly string[];
688
+ /** Resolved absolute path for the command script (last argv element) */
689
+ readonly resolvedCommand?: readonly string[];
690
+ };
660
691
  type LlmGraderEvaluatorConfig = {
661
692
  readonly name: string;
662
693
  readonly type: 'llm-grader';
@@ -682,6 +713,8 @@ type LlmGraderEvaluatorConfig = {
682
713
  readonly max_steps?: number;
683
714
  /** Temperature override for grader calls */
684
715
  readonly temperature?: number;
716
+ /** Optional content preprocessors for ContentFile blocks in assistant output */
717
+ readonly preprocessors?: readonly ContentPreprocessorConfig[];
685
718
  };
686
719
  /** @deprecated Use `LlmGraderEvaluatorConfig` instead */
687
720
  type LlmJudgeEvaluatorConfig = LlmGraderEvaluatorConfig;
@@ -1115,6 +1148,8 @@ interface EvalTest {
1115
1148
  readonly criteria: string;
1116
1149
  readonly evaluator?: EvaluatorKind;
1117
1150
  readonly assertions?: readonly EvaluatorConfig[];
1151
+ /** Suite-level preprocessors used by the implicit default llm-grader. */
1152
+ readonly preprocessors?: readonly ContentPreprocessorConfig[];
1118
1153
  /** Workspace configuration (merged from suite-level and case-level) */
1119
1154
  readonly workspace?: WorkspaceConfig;
1120
1155
  /** Arbitrary metadata passed to workspace scripts via stdin */
@@ -1358,10 +1393,19 @@ type ExecutionDefaults = {
1358
1393
  readonly pool_workspaces?: boolean;
1359
1394
  readonly pool_slots?: number;
1360
1395
  };
1396
+ type ResultsExportConfig = {
1397
+ readonly repo: string;
1398
+ readonly path: string;
1399
+ readonly auto_push?: boolean;
1400
+ readonly branch_prefix?: string;
1401
+ };
1361
1402
  type AgentVConfig$1 = {
1362
1403
  readonly required_version?: string;
1363
1404
  readonly eval_patterns?: readonly string[];
1364
1405
  readonly execution?: ExecutionDefaults;
1406
+ readonly results?: {
1407
+ readonly export?: ResultsExportConfig;
1408
+ };
1365
1409
  };
1366
1410
  /**
1367
1411
  * Load optional .agentv/config.yaml configuration file.
@@ -2199,6 +2243,8 @@ interface EvaluationContext {
2199
2243
  readonly fileChanges?: string;
2200
2244
  /** Absolute path to the workspace directory (when workspace_template is configured) */
2201
2245
  readonly workspacePath?: string;
2246
+ /** Docker workspace config: when present, code-grader commands run inside a container */
2247
+ readonly dockerConfig?: DockerWorkspaceConfig;
2202
2248
  }
2203
2249
  interface EvaluationScore {
2204
2250
  readonly score: number;
@@ -2492,6 +2538,7 @@ declare class LlmGraderEvaluator implements Evaluator {
2492
2538
  private readonly graderTargetProvider?;
2493
2539
  constructor(options: LlmGraderEvaluatorOptions);
2494
2540
  evaluate(context: EvaluationContext): Promise<EvaluationScore>;
2541
+ private prepareContext;
2495
2542
  private evaluateFreeform;
2496
2543
  private evaluateWithRubrics;
2497
2544
  /**
@@ -2798,9 +2845,9 @@ declare class RepoManager {
2798
2845
  * Handles checkout, ref resolution, ancestor walking, shallow clone, sparse checkout.
2799
2846
  */
2800
2847
  materialize(repo: RepoConfig, workspacePath: string): Promise<void>;
2801
- /** Materialize all repos into the workspace. */
2848
+ /** Materialize all repos into the workspace. Skips repos without source (Docker-only repos). */
2802
2849
  materializeAll(repos: readonly RepoConfig[], workspacePath: string): Promise<void>;
2803
- /** Reset repos in workspace to their checkout state. */
2850
+ /** Reset repos in workspace to their checkout state. Skips repos without path or source. */
2804
2851
  reset(repos: readonly RepoConfig[], workspacePath: string, reset: 'fast' | 'strict'): Promise<void>;
2805
2852
  }
2806
2853
 
@@ -3232,10 +3279,10 @@ declare const AgentVConfigSchema: z.ZodObject<{
3232
3279
  dir: z.ZodOptional<z.ZodString>;
3233
3280
  }, "strip", z.ZodTypeAny, {
3234
3281
  dir?: string | undefined;
3235
- format?: "yaml" | "jsonl" | "json" | "xml" | undefined;
3282
+ format?: "json" | "xml" | "yaml" | "jsonl" | undefined;
3236
3283
  }, {
3237
3284
  dir?: string | undefined;
3238
- format?: "yaml" | "jsonl" | "json" | "xml" | undefined;
3285
+ format?: "json" | "xml" | "yaml" | "jsonl" | undefined;
3239
3286
  }>>;
3240
3287
  /** Response caching */
3241
3288
  cache: z.ZodOptional<z.ZodObject<{
@@ -3278,7 +3325,7 @@ declare const AgentVConfigSchema: z.ZodObject<{
3278
3325
  } | undefined;
3279
3326
  output?: {
3280
3327
  dir?: string | undefined;
3281
- format?: "yaml" | "jsonl" | "json" | "xml" | undefined;
3328
+ format?: "json" | "xml" | "yaml" | "jsonl" | undefined;
3282
3329
  } | undefined;
3283
3330
  limits?: {
3284
3331
  maxDurationMs?: number | undefined;
@@ -3299,7 +3346,7 @@ declare const AgentVConfigSchema: z.ZodObject<{
3299
3346
  } | undefined;
3300
3347
  output?: {
3301
3348
  dir?: string | undefined;
3302
- format?: "yaml" | "jsonl" | "json" | "xml" | undefined;
3349
+ format?: "json" | "xml" | "yaml" | "jsonl" | undefined;
3303
3350
  } | undefined;
3304
3351
  limits?: {
3305
3352
  maxDurationMs?: number | undefined;
@@ -3591,6 +3638,106 @@ interface DepsScanResult {
3591
3638
  */
3592
3639
  declare function scanRepoDeps(evalFilePaths: readonly string[]): Promise<DepsScanResult>;
3593
3640
 
3641
+ interface RepoCheckoutTarget {
3642
+ readonly path?: string;
3643
+ readonly ref: string;
3644
+ }
3645
+
3646
+ /**
3647
+ * Docker workspace provider — manages Docker container lifecycle for eval grading.
3648
+ *
3649
+ * Flow: pull image → create container → copy files in → exec grader → parse output → destroy container.
3650
+ * All Docker commands use `execFile` (no shell) for security.
3651
+ *
3652
+ * To add a new Docker command: add a method that calls `this.exec(...)` with the appropriate argv.
3653
+ *
3654
+ * Design decisions:
3655
+ * - CommandExecutor interface for testability (mock `execFile` in tests)
3656
+ * - Always `docker rm -f` in cleanup, even on errors (try/finally)
3657
+ * - Lazy-loaded: non-Docker evals never import this module
3658
+ */
3659
+
3660
+ /** Result of a command execution */
3661
+ interface ExecResult {
3662
+ readonly stdout: string;
3663
+ readonly stderr: string;
3664
+ readonly exitCode: number;
3665
+ }
3666
+ /** Abstraction over process execution for testability */
3667
+ interface CommandExecutor {
3668
+ exec(argv: readonly string[], options?: {
3669
+ timeoutMs?: number;
3670
+ stdin?: string;
3671
+ }): Promise<ExecResult>;
3672
+ }
3673
+ /** Options for creating a Docker container */
3674
+ interface CreateContainerOptions {
3675
+ readonly image: string;
3676
+ readonly memory?: string;
3677
+ readonly cpus?: number;
3678
+ }
3679
+ /** Options for executing a command inside a container */
3680
+ interface ExecInContainerOptions {
3681
+ readonly containerId: string;
3682
+ readonly command: readonly string[];
3683
+ readonly timeoutMs?: number;
3684
+ readonly stdin?: string;
3685
+ }
3686
+ /**
3687
+ * Manages Docker container lifecycle for workspace-based evaluations.
3688
+ *
3689
+ * Usage:
3690
+ * const docker = new DockerWorkspaceProvider(config);
3691
+ * await docker.pullImage();
3692
+ * const containerId = await docker.createContainer();
3693
+ * try {
3694
+ * await docker.copyToContainer(containerId, localPath, containerPath);
3695
+ * const output = await docker.execInContainer({ containerId, command: [...] });
3696
+ * // parse output...
3697
+ * } finally {
3698
+ * await docker.removeContainer(containerId);
3699
+ * }
3700
+ */
3701
+ declare class DockerWorkspaceProvider {
3702
+ private readonly config;
3703
+ private readonly executor;
3704
+ private readonly timeoutMs;
3705
+ constructor(config: DockerWorkspaceConfig, executor?: CommandExecutor);
3706
+ /** Check whether the Docker CLI is available on the host. */
3707
+ isDockerAvailable(): Promise<boolean>;
3708
+ /** Pull the configured Docker image. No-op if already cached locally. */
3709
+ pullImage(): Promise<void>;
3710
+ /** Create a stopped container from the configured image with resource limits. Returns container ID. */
3711
+ createContainer(): Promise<string>;
3712
+ /** Start a previously created container. */
3713
+ startContainer(containerId: string): Promise<void>;
3714
+ /**
3715
+ * Reset the container checkout to the specified target refs, if any.
3716
+ * This is used for SWE-bench images where the repo state must match the
3717
+ * dataset's base snapshot before grading begins.
3718
+ */
3719
+ resetContainerCheckout(containerId: string, repoCheckouts?: readonly RepoCheckoutTarget[]): Promise<void>;
3720
+ /** Copy a local file or directory into a running container. */
3721
+ copyToContainer(containerId: string, localPath: string, containerPath: string): Promise<void>;
3722
+ /**
3723
+ * Execute a command inside a running container.
3724
+ * If stdin is provided, it is piped via `docker exec -i`.
3725
+ */
3726
+ execInContainer(options: ExecInContainerOptions): Promise<ExecResult>;
3727
+ /** Force-remove a container (always succeeds, even if container doesn't exist). */
3728
+ removeContainer(containerId: string): Promise<void>;
3729
+ /** Full lifecycle: create → start → exec → cleanup. Convenience for single-command grading. */
3730
+ runGraderInContainer(options: {
3731
+ readonly command: readonly string[];
3732
+ readonly stdin?: string;
3733
+ readonly copyFiles?: ReadonlyArray<{
3734
+ localPath: string;
3735
+ containerPath: string;
3736
+ }>;
3737
+ readonly repoCheckouts?: readonly RepoCheckoutTarget[];
3738
+ }): Promise<ExecResult>;
3739
+ }
3740
+
3594
3741
  /**
3595
3742
  * File-based LLM response cache.
3596
3743
  * Stores provider responses as JSON files keyed by SHA-256 hash.
@@ -3647,6 +3794,60 @@ declare function toSnakeCaseDeep(obj: unknown): unknown;
3647
3794
  */
3648
3795
  declare function toCamelCaseDeep(obj: unknown): unknown;
3649
3796
 
3797
+ interface ResultsRepoCachePaths {
3798
+ readonly rootDir: string;
3799
+ readonly repoDir: string;
3800
+ readonly statusFile: string;
3801
+ }
3802
+ interface ResultsRepoStatus {
3803
+ readonly configured: boolean;
3804
+ readonly available: boolean;
3805
+ readonly repo?: string;
3806
+ readonly path?: string;
3807
+ readonly auto_push?: boolean;
3808
+ readonly branch_prefix?: string;
3809
+ readonly cache_dir?: string;
3810
+ readonly last_synced_at?: string;
3811
+ readonly last_error?: string;
3812
+ }
3813
+ interface CheckedOutResultsRepoBranch {
3814
+ readonly branchName: string;
3815
+ readonly baseBranch: string;
3816
+ readonly repoDir: string;
3817
+ }
3818
+ interface PreparedResultsRepoBranch extends CheckedOutResultsRepoBranch {
3819
+ readonly cleanup: () => Promise<void>;
3820
+ }
3821
+ declare function normalizeResultsExportConfig(config: ResultsExportConfig): Required<ResultsExportConfig>;
3822
+ declare function resolveResultsRepoUrl(repo: string): string;
3823
+ declare function getResultsRepoCachePaths(repo: string): ResultsRepoCachePaths;
3824
+ declare function ensureResultsRepoClone(config: ResultsExportConfig): Promise<string>;
3825
+ declare function getResultsRepoStatus(config?: ResultsExportConfig): ResultsRepoStatus;
3826
+ declare function syncResultsRepo(config: ResultsExportConfig): Promise<ResultsRepoStatus>;
3827
+ declare function checkoutResultsRepoBranch(config: ResultsExportConfig, branchName: string): Promise<CheckedOutResultsRepoBranch>;
3828
+ declare function prepareResultsRepoBranch(config: ResultsExportConfig, branchName: string): Promise<PreparedResultsRepoBranch>;
3829
+ declare function stageResultsArtifacts(params: {
3830
+ readonly repoDir: string;
3831
+ readonly sourceDir: string;
3832
+ readonly destinationDir: string;
3833
+ }): Promise<void>;
3834
+ declare function resolveResultsRepoRunsDir(config: ResultsExportConfig): string;
3835
+ declare function directorySizeBytes(targetPath: string): Promise<number>;
3836
+ declare function commitAndPushResultsBranch(params: {
3837
+ readonly repoDir: string;
3838
+ readonly branchName: string;
3839
+ readonly commitMessage: string;
3840
+ }): Promise<boolean>;
3841
+ declare function pushResultsRepoBranch(config: ResultsExportConfig, branchName: string, cwd?: string): Promise<void>;
3842
+ declare function createDraftResultsPr(params: {
3843
+ readonly repo: string;
3844
+ readonly repoDir: string;
3845
+ readonly baseBranch: string;
3846
+ readonly branchName: string;
3847
+ readonly title: string;
3848
+ readonly body: string;
3849
+ }): Promise<string>;
3850
+
3650
3851
  declare function getAgentvHome(): string;
3651
3852
  declare function getWorkspacesRoot(): string;
3652
3853
  declare function getSubagentsRoot(): string;
@@ -4171,4 +4372,4 @@ type AgentKernel = {
4171
4372
  };
4172
4373
  declare function createAgentKernel(): AgentKernel;
4173
4374
 
4174
- export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, COMMON_TARGET_SETTINGS, type CacheConfig, type ChildEvaluatorResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CodexDiscoverOptions, type CodexSession, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type Content, type ContentFile, type ContentImage, type ContentText, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_CATEGORY, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_THRESHOLD, type DepsScanResult, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type ProjectEntry, type ProjectRegistry, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addProject, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveCategory, deriveProjectId, detectFormat, discoverAssertions, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverGraders as discoverJudges, discoverProjects, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getProject, getProjectsRegistryPath, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadProjectRegistry, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeProject, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveProjectRegistry, scanRepoDeps, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLine, tokensPerTool, touchProject, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
4375
+ export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildEvaluatorResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_THRESHOLD, type DepsScanResult, DeterministicAssertionEvaluator, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type PreparedResultsRepoBranch, type ProgressEvent, type ProjectEntry, type ProjectRegistry, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addProject, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveCategory, deriveProjectId, detectFormat, directorySizeBytes, discoverAssertions, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverGraders as discoverJudges, discoverProjects, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getProject, getProjectsRegistryPath, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadProjectRegistry, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseJsonFromText, parseJsonSafe, prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeProject, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveProjectRegistry, scanRepoDeps, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLine, tokensPerTool, touchProject, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };