@agentv/core 2.14.3 → 2.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -514,9 +514,34 @@ type RepoConfig = {
514
514
  readonly checkout?: RepoCheckout;
515
515
  readonly clone?: RepoClone;
516
516
  };
517
- type ResetConfig = {
518
- readonly strategy?: 'none' | 'hard' | 'recreate';
519
- readonly after_each?: boolean;
517
+ type WorkspaceHookConfig = {
518
+ /** Optional command array to execute (e.g., ["bun", "run", "setup.ts"]) */
519
+ readonly command?: readonly string[];
520
+ /** @deprecated Use `command` instead */
521
+ readonly script?: readonly string[];
522
+ /** Optional timeout in milliseconds */
523
+ readonly timeout_ms?: number;
524
+ readonly timeoutMs?: number;
525
+ /** Optional working directory for command execution */
526
+ readonly cwd?: string;
527
+ /** Optional reset policy for this hook */
528
+ readonly reset?: 'none' | 'fast' | 'strict';
529
+ /** Optional cleanup policy for this hook */
530
+ readonly clean?: 'always' | 'on_success' | 'on_failure' | 'never';
531
+ };
532
+ type WorkspaceHooksConfig = {
533
+ /** Runs once before first test in the workspace lifecycle */
534
+ readonly before_all_tests?: WorkspaceHookConfig;
535
+ /** Runs before each test case */
536
+ readonly before_each_test?: WorkspaceHookConfig;
537
+ /** Runs after each test case */
538
+ readonly after_each_test?: WorkspaceHookConfig;
539
+ /** Runs once after final test in the workspace lifecycle */
540
+ readonly after_all_tests?: WorkspaceHookConfig;
541
+ /** Runs when reusing a pooled workspace slot */
542
+ readonly on_reuse?: WorkspaceHookConfig;
543
+ /** Runs/controls behavior when workspace lifecycle finishes */
544
+ readonly on_finish?: WorkspaceHookConfig;
520
545
  };
521
546
  type WorkspaceConfig = {
522
547
  /** Template directory or .code-workspace file. Directories are copied to temp workspace.
@@ -526,16 +551,14 @@ type WorkspaceConfig = {
526
551
  readonly isolation?: 'shared' | 'per_test';
527
552
  /** Repository definitions to clone/checkout into workspace */
528
553
  readonly repos?: readonly RepoConfig[];
529
- /** Reset configuration for repos between test runs */
530
- readonly reset?: ResetConfig;
531
- /** Command to run once before first test (after workspace creation, before git baseline) */
532
- readonly before_all?: WorkspaceScriptConfig;
533
- /** Command to run once after last test (before workspace cleanup) */
534
- readonly after_all?: WorkspaceScriptConfig;
535
- /** Command to run before each test */
536
- readonly before_each?: WorkspaceScriptConfig;
537
- /** Command to run after each test (e.g., git reset for workspace reuse) */
538
- readonly after_each?: WorkspaceScriptConfig;
554
+ /** Workspace lifecycle hooks */
555
+ readonly hooks?: WorkspaceHooksConfig;
556
+ /** Workspace materialization mode */
557
+ readonly mode?: 'pooled' | 'ephemeral' | 'static';
558
+ /** Required when mode=static: use this existing directory directly */
559
+ readonly static_path?: string;
560
+ /** @deprecated Use mode=pooled|ephemeral|static */
561
+ readonly pool?: boolean;
539
562
  };
540
563
  type CodeEvaluatorConfig = {
541
564
  readonly name: string;
@@ -1198,6 +1221,8 @@ type ExecutionDefaults = {
1198
1221
  readonly trace_file?: string;
1199
1222
  readonly keep_workspaces?: boolean;
1200
1223
  readonly otel_file?: string;
1224
+ readonly pool_workspaces?: boolean;
1225
+ readonly pool_slots?: number;
1201
1226
  };
1202
1227
  type AgentVConfig$1 = {
1203
1228
  readonly required_version?: string;
@@ -2345,34 +2370,18 @@ declare class DeterministicAssertionEvaluator implements Evaluator {
2345
2370
  }
2346
2371
 
2347
2372
  declare class RepoManager {
2348
- private readonly cacheDir;
2349
2373
  private readonly verbose;
2350
- constructor(cacheDir?: string, verbose?: boolean);
2374
+ constructor(verbose?: boolean);
2351
2375
  private runGit;
2352
2376
  /**
2353
- * Ensure a bare mirror cache exists for the given source.
2354
- * Creates on first access, fetches updates on subsequent calls.
2355
- * Returns the absolute path to the cache directory.
2356
- */
2357
- ensureCache(source: RepoSource, depth?: number, resolve?: 'remote' | 'local'): Promise<string>;
2358
- /**
2359
- * Clone a repo from cache into the workspace at the configured path.
2377
+ * Clone a repo directly from source into the workspace at the configured path.
2360
2378
  * Handles checkout, ref resolution, ancestor walking, shallow clone, sparse checkout.
2361
2379
  */
2362
2380
  materialize(repo: RepoConfig, workspacePath: string): Promise<void>;
2363
2381
  /** Materialize all repos into the workspace. */
2364
2382
  materializeAll(repos: readonly RepoConfig[], workspacePath: string): Promise<void>;
2365
2383
  /** Reset repos in workspace to their checkout state. */
2366
- reset(repos: readonly RepoConfig[], workspacePath: string, strategy: 'hard' | 'recreate'): Promise<void>;
2367
- /**
2368
- * Seed the cache from a local repository, setting the remote to a given URL.
2369
- * Useful for avoiding slow network clones when a local clone already exists.
2370
- */
2371
- seedCache(localPath: string, remoteUrl: string, opts?: {
2372
- force?: boolean;
2373
- }): Promise<string>;
2374
- /** Remove the entire cache directory. */
2375
- cleanCache(): Promise<void>;
2384
+ reset(repos: readonly RepoConfig[], workspacePath: string, reset: 'fast' | 'strict'): Promise<void>;
2376
2385
  }
2377
2386
 
2378
2387
  type MaybePromise<T> = T | Promise<T>;
@@ -2404,6 +2413,10 @@ interface RunEvalCaseOptions {
2404
2413
  readonly keepWorkspaces?: boolean;
2405
2414
  /** Force cleanup of workspaces even on failure */
2406
2415
  readonly cleanupWorkspaces?: boolean;
2416
+ /** Retention policy for temp workspaces on successful cases */
2417
+ readonly retainOnSuccess?: 'keep' | 'cleanup';
2418
+ /** Retention policy for temp workspaces on failed cases */
2419
+ readonly retainOnFailure?: 'keep' | 'cleanup';
2407
2420
  /** Pre-created shared workspace path (shared across tests in a suite) */
2408
2421
  readonly sharedWorkspacePath?: string;
2409
2422
  /** Pre-initialized baseline commit for shared workspace */
@@ -2459,6 +2472,22 @@ interface RunEvaluationOptions {
2459
2472
  readonly totalBudgetUsd?: number;
2460
2473
  /** Execution error tolerance: true halts on first error */
2461
2474
  readonly failOnError?: FailOnError;
2475
+ /** Workspace pooling: true (default) enables pool, false disables, undefined defaults to true */
2476
+ readonly poolWorkspaces?: boolean;
2477
+ /** Maximum number of pool slots on disk (default: 10, max: 50) */
2478
+ readonly poolMaxSlots?: number;
2479
+ /** Pre-existing workspace directory to use directly (skips clone/copy/pool) */
2480
+ readonly workspace?: string;
2481
+ /** Workspace materialization mode override */
2482
+ readonly workspaceMode?: 'pooled' | 'ephemeral' | 'static';
2483
+ /** Static workspace path override (used when workspaceMode=static) */
2484
+ readonly workspacePath?: string;
2485
+ /** Workspace clean policy override for pooled reset */
2486
+ readonly workspaceClean?: 'standard' | 'full';
2487
+ /** Retention policy override for successful cases */
2488
+ readonly retainOnSuccess?: 'keep' | 'cleanup';
2489
+ /** Retention policy override for failed cases */
2490
+ readonly retainOnFailure?: 'keep' | 'cleanup';
2462
2491
  }
2463
2492
  declare function runEvaluation(options: RunEvaluationOptions): Promise<readonly EvaluationResult[]>;
2464
2493
  declare function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationResult>;
@@ -2965,6 +2994,82 @@ interface ResolvedWorkspaceTemplate {
2965
2994
  */
2966
2995
  declare function resolveWorkspaceTemplate(templatePath: string | undefined): Promise<ResolvedWorkspaceTemplate | undefined>;
2967
2996
 
2997
+ interface AcquireWorkspaceOptions {
2998
+ templatePath?: string;
2999
+ repos: readonly RepoConfig[];
3000
+ maxSlots: number;
3001
+ repoManager: RepoManager;
3002
+ poolReset?: 'none' | 'fast' | 'strict';
3003
+ }
3004
+ interface PoolSlot {
3005
+ readonly index: number;
3006
+ readonly path: string;
3007
+ readonly isExisting: boolean;
3008
+ readonly lockPath: string;
3009
+ readonly fingerprint: string;
3010
+ readonly poolDir: string;
3011
+ }
3012
+ /**
3013
+ * Compute a deterministic SHA-256 fingerprint for a workspace configuration.
3014
+ * The fingerprint captures template path and all repo configs in a canonical order.
3015
+ */
3016
+ declare function computeWorkspaceFingerprint(templatePath: string | undefined | null, repos: readonly RepoConfig[]): string;
3017
+ /**
3018
+ * Pools entire workspaces (template files + git repos) for reuse across eval runs.
3019
+ *
3020
+ * Pool structure:
3021
+ * ```
3022
+ * {poolRoot}/
3023
+ * {fingerprint}/
3024
+ * metadata.json # fingerprint inputs, creation timestamp
3025
+ * slot-0/ # complete workspace (template files + repos)
3026
+ * slot-0.lock # PID-based lock file
3027
+ * slot-1/ # created on concurrent demand
3028
+ * slot-1.lock
3029
+ * ```
3030
+ */
3031
+ declare class WorkspacePoolManager {
3032
+ private readonly poolRoot;
3033
+ constructor(poolRoot?: string);
3034
+ /**
3035
+ * Acquire a workspace slot from the pool.
3036
+ *
3037
+ * 1. Compute fingerprint from template + repos
3038
+ * 2. Check drift (compare stored metadata.json fingerprint vs computed)
3039
+ * 3. If drift: warn, remove all slots, rematerialize
3040
+ * 4. Acquire a slot (try-lock slot-0, slot-1, ..., up to maxSlots)
3041
+ * 5. If slot exists: reset repos, re-copy template files (skip repo directories)
3042
+ * 6. If new slot: copy template, materialize all repos, write metadata.json
3043
+ * 7. Return the slot (with path, index, isExisting)
3044
+ */
3045
+ acquireWorkspace(options: AcquireWorkspaceOptions): Promise<PoolSlot>;
3046
+ /** Remove lock file to release a slot. */
3047
+ releaseSlot(slot: PoolSlot): Promise<void>;
3048
+ /**
3049
+ * Try to acquire a PID-based lock file.
3050
+ * On EEXIST, read PID and check if process is alive. If dead, stale lock — remove and retry.
3051
+ * Returns true if lock acquired, false if slot is actively locked.
3052
+ * Uses a bounded loop (max 3 attempts) to avoid unbounded recursion.
3053
+ */
3054
+ private tryLock;
3055
+ /**
3056
+ * Check if the stored fingerprint in metadata.json differs from the computed one.
3057
+ * Returns true if drifted, false otherwise.
3058
+ * Returns false (no drift) if metadata.json doesn't exist (first use).
3059
+ */
3060
+ private checkDrift;
3061
+ /** Write metadata.json with fingerprint, inputs, and timestamp. */
3062
+ private writeMetadata;
3063
+ /** Remove all slot directories and their lock files from a pool directory. */
3064
+ private removeAllSlots;
3065
+ /**
3066
+ * Reset an existing slot for reuse:
3067
+ * 1. Reset repos (git reset --hard {ref} && git clean -fd per repo)
3068
+ * 2. Re-copy template files (skip repo directories)
3069
+ */
3070
+ private resetSlot;
3071
+ }
3072
+
2968
3073
  /**
2969
3074
  * File-based LLM response cache.
2970
3075
  * Stores provider responses as JSON files keyed by SHA-256 hash.
@@ -3023,9 +3128,9 @@ declare function toCamelCaseDeep(obj: unknown): unknown;
3023
3128
 
3024
3129
  declare function getAgentvHome(): string;
3025
3130
  declare function getWorkspacesRoot(): string;
3026
- declare function getGitCacheRoot(): string;
3027
3131
  declare function getSubagentsRoot(): string;
3028
3132
  declare function getTraceStateRoot(): string;
3133
+ declare function getWorkspacePoolRoot(): string;
3029
3134
 
3030
3135
  /**
3031
3136
  * Trims an EvaluationResult for baseline storage.
@@ -3181,4 +3286,4 @@ type AgentKernel = {
3181
3286
  };
3182
3287
  declare function createAgentKernel(): AgentKernel;
3183
3288
 
3184
- export { AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResetConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getGitCacheRoot, getHitCount, getSubagentsRoot, getTraceStateRoot, getWorkspacePath, getWorkspacesRoot, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
3289
+ export { type AcquireWorkspaceOptions, AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getHitCount, getSubagentsRoot, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
package/dist/index.d.ts CHANGED
@@ -514,9 +514,34 @@ type RepoConfig = {
514
514
  readonly checkout?: RepoCheckout;
515
515
  readonly clone?: RepoClone;
516
516
  };
517
- type ResetConfig = {
518
- readonly strategy?: 'none' | 'hard' | 'recreate';
519
- readonly after_each?: boolean;
517
+ type WorkspaceHookConfig = {
518
+ /** Optional command array to execute (e.g., ["bun", "run", "setup.ts"]) */
519
+ readonly command?: readonly string[];
520
+ /** @deprecated Use `command` instead */
521
+ readonly script?: readonly string[];
522
+ /** Optional timeout in milliseconds */
523
+ readonly timeout_ms?: number;
524
+ readonly timeoutMs?: number;
525
+ /** Optional working directory for command execution */
526
+ readonly cwd?: string;
527
+ /** Optional reset policy for this hook */
528
+ readonly reset?: 'none' | 'fast' | 'strict';
529
+ /** Optional cleanup policy for this hook */
530
+ readonly clean?: 'always' | 'on_success' | 'on_failure' | 'never';
531
+ };
532
+ type WorkspaceHooksConfig = {
533
+ /** Runs once before first test in the workspace lifecycle */
534
+ readonly before_all_tests?: WorkspaceHookConfig;
535
+ /** Runs before each test case */
536
+ readonly before_each_test?: WorkspaceHookConfig;
537
+ /** Runs after each test case */
538
+ readonly after_each_test?: WorkspaceHookConfig;
539
+ /** Runs once after final test in the workspace lifecycle */
540
+ readonly after_all_tests?: WorkspaceHookConfig;
541
+ /** Runs when reusing a pooled workspace slot */
542
+ readonly on_reuse?: WorkspaceHookConfig;
543
+ /** Runs/controls behavior when workspace lifecycle finishes */
544
+ readonly on_finish?: WorkspaceHookConfig;
520
545
  };
521
546
  type WorkspaceConfig = {
522
547
  /** Template directory or .code-workspace file. Directories are copied to temp workspace.
@@ -526,16 +551,14 @@ type WorkspaceConfig = {
526
551
  readonly isolation?: 'shared' | 'per_test';
527
552
  /** Repository definitions to clone/checkout into workspace */
528
553
  readonly repos?: readonly RepoConfig[];
529
- /** Reset configuration for repos between test runs */
530
- readonly reset?: ResetConfig;
531
- /** Command to run once before first test (after workspace creation, before git baseline) */
532
- readonly before_all?: WorkspaceScriptConfig;
533
- /** Command to run once after last test (before workspace cleanup) */
534
- readonly after_all?: WorkspaceScriptConfig;
535
- /** Command to run before each test */
536
- readonly before_each?: WorkspaceScriptConfig;
537
- /** Command to run after each test (e.g., git reset for workspace reuse) */
538
- readonly after_each?: WorkspaceScriptConfig;
554
+ /** Workspace lifecycle hooks */
555
+ readonly hooks?: WorkspaceHooksConfig;
556
+ /** Workspace materialization mode */
557
+ readonly mode?: 'pooled' | 'ephemeral' | 'static';
558
+ /** Required when mode=static: use this existing directory directly */
559
+ readonly static_path?: string;
560
+ /** @deprecated Use mode=pooled|ephemeral|static */
561
+ readonly pool?: boolean;
539
562
  };
540
563
  type CodeEvaluatorConfig = {
541
564
  readonly name: string;
@@ -1198,6 +1221,8 @@ type ExecutionDefaults = {
1198
1221
  readonly trace_file?: string;
1199
1222
  readonly keep_workspaces?: boolean;
1200
1223
  readonly otel_file?: string;
1224
+ readonly pool_workspaces?: boolean;
1225
+ readonly pool_slots?: number;
1201
1226
  };
1202
1227
  type AgentVConfig$1 = {
1203
1228
  readonly required_version?: string;
@@ -2345,34 +2370,18 @@ declare class DeterministicAssertionEvaluator implements Evaluator {
2345
2370
  }
2346
2371
 
2347
2372
  declare class RepoManager {
2348
- private readonly cacheDir;
2349
2373
  private readonly verbose;
2350
- constructor(cacheDir?: string, verbose?: boolean);
2374
+ constructor(verbose?: boolean);
2351
2375
  private runGit;
2352
2376
  /**
2353
- * Ensure a bare mirror cache exists for the given source.
2354
- * Creates on first access, fetches updates on subsequent calls.
2355
- * Returns the absolute path to the cache directory.
2356
- */
2357
- ensureCache(source: RepoSource, depth?: number, resolve?: 'remote' | 'local'): Promise<string>;
2358
- /**
2359
- * Clone a repo from cache into the workspace at the configured path.
2377
+ * Clone a repo directly from source into the workspace at the configured path.
2360
2378
  * Handles checkout, ref resolution, ancestor walking, shallow clone, sparse checkout.
2361
2379
  */
2362
2380
  materialize(repo: RepoConfig, workspacePath: string): Promise<void>;
2363
2381
  /** Materialize all repos into the workspace. */
2364
2382
  materializeAll(repos: readonly RepoConfig[], workspacePath: string): Promise<void>;
2365
2383
  /** Reset repos in workspace to their checkout state. */
2366
- reset(repos: readonly RepoConfig[], workspacePath: string, strategy: 'hard' | 'recreate'): Promise<void>;
2367
- /**
2368
- * Seed the cache from a local repository, setting the remote to a given URL.
2369
- * Useful for avoiding slow network clones when a local clone already exists.
2370
- */
2371
- seedCache(localPath: string, remoteUrl: string, opts?: {
2372
- force?: boolean;
2373
- }): Promise<string>;
2374
- /** Remove the entire cache directory. */
2375
- cleanCache(): Promise<void>;
2384
+ reset(repos: readonly RepoConfig[], workspacePath: string, reset: 'fast' | 'strict'): Promise<void>;
2376
2385
  }
2377
2386
 
2378
2387
  type MaybePromise<T> = T | Promise<T>;
@@ -2404,6 +2413,10 @@ interface RunEvalCaseOptions {
2404
2413
  readonly keepWorkspaces?: boolean;
2405
2414
  /** Force cleanup of workspaces even on failure */
2406
2415
  readonly cleanupWorkspaces?: boolean;
2416
+ /** Retention policy for temp workspaces on successful cases */
2417
+ readonly retainOnSuccess?: 'keep' | 'cleanup';
2418
+ /** Retention policy for temp workspaces on failed cases */
2419
+ readonly retainOnFailure?: 'keep' | 'cleanup';
2407
2420
  /** Pre-created shared workspace path (shared across tests in a suite) */
2408
2421
  readonly sharedWorkspacePath?: string;
2409
2422
  /** Pre-initialized baseline commit for shared workspace */
@@ -2459,6 +2472,22 @@ interface RunEvaluationOptions {
2459
2472
  readonly totalBudgetUsd?: number;
2460
2473
  /** Execution error tolerance: true halts on first error */
2461
2474
  readonly failOnError?: FailOnError;
2475
+ /** Workspace pooling: true (default) enables pool, false disables, undefined defaults to true */
2476
+ readonly poolWorkspaces?: boolean;
2477
+ /** Maximum number of pool slots on disk (default: 10, max: 50) */
2478
+ readonly poolMaxSlots?: number;
2479
+ /** Pre-existing workspace directory to use directly (skips clone/copy/pool) */
2480
+ readonly workspace?: string;
2481
+ /** Workspace materialization mode override */
2482
+ readonly workspaceMode?: 'pooled' | 'ephemeral' | 'static';
2483
+ /** Static workspace path override (used when workspaceMode=static) */
2484
+ readonly workspacePath?: string;
2485
+ /** Workspace clean policy override for pooled reset */
2486
+ readonly workspaceClean?: 'standard' | 'full';
2487
+ /** Retention policy override for successful cases */
2488
+ readonly retainOnSuccess?: 'keep' | 'cleanup';
2489
+ /** Retention policy override for failed cases */
2490
+ readonly retainOnFailure?: 'keep' | 'cleanup';
2462
2491
  }
2463
2492
  declare function runEvaluation(options: RunEvaluationOptions): Promise<readonly EvaluationResult[]>;
2464
2493
  declare function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationResult>;
@@ -2965,6 +2994,82 @@ interface ResolvedWorkspaceTemplate {
2965
2994
  */
2966
2995
  declare function resolveWorkspaceTemplate(templatePath: string | undefined): Promise<ResolvedWorkspaceTemplate | undefined>;
2967
2996
 
2997
+ interface AcquireWorkspaceOptions {
2998
+ templatePath?: string;
2999
+ repos: readonly RepoConfig[];
3000
+ maxSlots: number;
3001
+ repoManager: RepoManager;
3002
+ poolReset?: 'none' | 'fast' | 'strict';
3003
+ }
3004
+ interface PoolSlot {
3005
+ readonly index: number;
3006
+ readonly path: string;
3007
+ readonly isExisting: boolean;
3008
+ readonly lockPath: string;
3009
+ readonly fingerprint: string;
3010
+ readonly poolDir: string;
3011
+ }
3012
+ /**
3013
+ * Compute a deterministic SHA-256 fingerprint for a workspace configuration.
3014
+ * The fingerprint captures template path and all repo configs in a canonical order.
3015
+ */
3016
+ declare function computeWorkspaceFingerprint(templatePath: string | undefined | null, repos: readonly RepoConfig[]): string;
3017
+ /**
3018
+ * Pools entire workspaces (template files + git repos) for reuse across eval runs.
3019
+ *
3020
+ * Pool structure:
3021
+ * ```
3022
+ * {poolRoot}/
3023
+ * {fingerprint}/
3024
+ * metadata.json # fingerprint inputs, creation timestamp
3025
+ * slot-0/ # complete workspace (template files + repos)
3026
+ * slot-0.lock # PID-based lock file
3027
+ * slot-1/ # created on concurrent demand
3028
+ * slot-1.lock
3029
+ * ```
3030
+ */
3031
+ declare class WorkspacePoolManager {
3032
+ private readonly poolRoot;
3033
+ constructor(poolRoot?: string);
3034
+ /**
3035
+ * Acquire a workspace slot from the pool.
3036
+ *
3037
+ * 1. Compute fingerprint from template + repos
3038
+ * 2. Check drift (compare stored metadata.json fingerprint vs computed)
3039
+ * 3. If drift: warn, remove all slots, rematerialize
3040
+ * 4. Acquire a slot (try-lock slot-0, slot-1, ..., up to maxSlots)
3041
+ * 5. If slot exists: reset repos, re-copy template files (skip repo directories)
3042
+ * 6. If new slot: copy template, materialize all repos, write metadata.json
3043
+ * 7. Return the slot (with path, index, isExisting)
3044
+ */
3045
+ acquireWorkspace(options: AcquireWorkspaceOptions): Promise<PoolSlot>;
3046
+ /** Remove lock file to release a slot. */
3047
+ releaseSlot(slot: PoolSlot): Promise<void>;
3048
+ /**
3049
+ * Try to acquire a PID-based lock file.
3050
+ * On EEXIST, read PID and check if process is alive. If dead, stale lock — remove and retry.
3051
+ * Returns true if lock acquired, false if slot is actively locked.
3052
+ * Uses a bounded loop (max 3 attempts) to avoid unbounded recursion.
3053
+ */
3054
+ private tryLock;
3055
+ /**
3056
+ * Check if the stored fingerprint in metadata.json differs from the computed one.
3057
+ * Returns true if drifted, false otherwise.
3058
+ * Returns false (no drift) if metadata.json doesn't exist (first use).
3059
+ */
3060
+ private checkDrift;
3061
+ /** Write metadata.json with fingerprint, inputs, and timestamp. */
3062
+ private writeMetadata;
3063
+ /** Remove all slot directories and their lock files from a pool directory. */
3064
+ private removeAllSlots;
3065
+ /**
3066
+ * Reset an existing slot for reuse:
3067
+ * 1. Reset repos (git reset --hard {ref} && git clean -fd per repo)
3068
+ * 2. Re-copy template files (skip repo directories)
3069
+ */
3070
+ private resetSlot;
3071
+ }
3072
+
2968
3073
  /**
2969
3074
  * File-based LLM response cache.
2970
3075
  * Stores provider responses as JSON files keyed by SHA-256 hash.
@@ -3023,9 +3128,9 @@ declare function toCamelCaseDeep(obj: unknown): unknown;
3023
3128
 
3024
3129
  declare function getAgentvHome(): string;
3025
3130
  declare function getWorkspacesRoot(): string;
3026
- declare function getGitCacheRoot(): string;
3027
3131
  declare function getSubagentsRoot(): string;
3028
3132
  declare function getTraceStateRoot(): string;
3133
+ declare function getWorkspacePoolRoot(): string;
3029
3134
 
3030
3135
  /**
3031
3136
  * Trims an EvaluationResult for baseline storage.
@@ -3181,4 +3286,4 @@ type AgentKernel = {
3181
3286
  };
3182
3287
  declare function createAgentKernel(): AgentKernel;
3183
3288
 
3184
- export { AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResetConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getGitCacheRoot, getHitCount, getSubagentsRoot, getTraceStateRoot, getWorkspacePath, getWorkspacesRoot, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
3289
+ export { type AcquireWorkspaceOptions, AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getHitCount, getSubagentsRoot, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };