@agentv/core 2.14.2 → 2.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +1270 -604
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +94 -2
- package/dist/index.d.ts +94 -2
- package/dist/index.js +1242 -584
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.d.cts
CHANGED
|
@@ -1198,6 +1198,8 @@ type ExecutionDefaults = {
|
|
|
1198
1198
|
readonly trace_file?: string;
|
|
1199
1199
|
readonly keep_workspaces?: boolean;
|
|
1200
1200
|
readonly otel_file?: string;
|
|
1201
|
+
readonly pool_workspaces?: boolean;
|
|
1202
|
+
readonly pool_slots?: number;
|
|
1201
1203
|
};
|
|
1202
1204
|
type AgentVConfig$1 = {
|
|
1203
1205
|
readonly required_version?: string;
|
|
@@ -2346,7 +2348,9 @@ declare class DeterministicAssertionEvaluator implements Evaluator {
|
|
|
2346
2348
|
|
|
2347
2349
|
declare class RepoManager {
|
|
2348
2350
|
private readonly cacheDir;
|
|
2349
|
-
|
|
2351
|
+
private readonly verbose;
|
|
2352
|
+
constructor(cacheDir?: string, verbose?: boolean);
|
|
2353
|
+
private runGit;
|
|
2350
2354
|
/**
|
|
2351
2355
|
* Ensure a bare mirror cache exists for the given source.
|
|
2352
2356
|
* Creates on first access, fetches updates on subsequent calls.
|
|
@@ -2457,6 +2461,12 @@ interface RunEvaluationOptions {
|
|
|
2457
2461
|
readonly totalBudgetUsd?: number;
|
|
2458
2462
|
/** Execution error tolerance: true halts on first error */
|
|
2459
2463
|
readonly failOnError?: FailOnError;
|
|
2464
|
+
/** Opt-in: reuse materialized workspaces across eval runs */
|
|
2465
|
+
readonly poolWorkspaces?: boolean;
|
|
2466
|
+
/** Maximum number of pool slots on disk (default: 10, max: 50) */
|
|
2467
|
+
readonly poolMaxSlots?: number;
|
|
2468
|
+
/** Pre-existing workspace directory to use directly (skips clone/copy/pool) */
|
|
2469
|
+
readonly workspace?: string;
|
|
2460
2470
|
}
|
|
2461
2471
|
declare function runEvaluation(options: RunEvaluationOptions): Promise<readonly EvaluationResult[]>;
|
|
2462
2472
|
declare function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationResult>;
|
|
@@ -2963,6 +2973,81 @@ interface ResolvedWorkspaceTemplate {
|
|
|
2963
2973
|
*/
|
|
2964
2974
|
declare function resolveWorkspaceTemplate(templatePath: string | undefined): Promise<ResolvedWorkspaceTemplate | undefined>;
|
|
2965
2975
|
|
|
2976
|
+
interface AcquireWorkspaceOptions {
|
|
2977
|
+
templatePath?: string;
|
|
2978
|
+
repos: readonly RepoConfig[];
|
|
2979
|
+
maxSlots: number;
|
|
2980
|
+
repoManager: RepoManager;
|
|
2981
|
+
}
|
|
2982
|
+
interface PoolSlot {
|
|
2983
|
+
readonly index: number;
|
|
2984
|
+
readonly path: string;
|
|
2985
|
+
readonly isExisting: boolean;
|
|
2986
|
+
readonly lockPath: string;
|
|
2987
|
+
readonly fingerprint: string;
|
|
2988
|
+
readonly poolDir: string;
|
|
2989
|
+
}
|
|
2990
|
+
/**
|
|
2991
|
+
* Compute a deterministic SHA-256 fingerprint for a workspace configuration.
|
|
2992
|
+
* The fingerprint captures template path and all repo configs in a canonical order.
|
|
2993
|
+
*/
|
|
2994
|
+
declare function computeWorkspaceFingerprint(templatePath: string | undefined | null, repos: readonly RepoConfig[]): string;
|
|
2995
|
+
/**
|
|
2996
|
+
* Pools entire workspaces (template files + git repos) for reuse across eval runs.
|
|
2997
|
+
*
|
|
2998
|
+
* Pool structure:
|
|
2999
|
+
* ```
|
|
3000
|
+
* {poolRoot}/
|
|
3001
|
+
* {fingerprint}/
|
|
3002
|
+
* metadata.json # fingerprint inputs, creation timestamp
|
|
3003
|
+
* slot-0/ # complete workspace (template files + repos)
|
|
3004
|
+
* slot-0.lock # PID-based lock file
|
|
3005
|
+
* slot-1/ # created on concurrent demand
|
|
3006
|
+
* slot-1.lock
|
|
3007
|
+
* ```
|
|
3008
|
+
*/
|
|
3009
|
+
declare class WorkspacePoolManager {
|
|
3010
|
+
private readonly poolRoot;
|
|
3011
|
+
constructor(poolRoot?: string);
|
|
3012
|
+
/**
|
|
3013
|
+
* Acquire a workspace slot from the pool.
|
|
3014
|
+
*
|
|
3015
|
+
* 1. Compute fingerprint from template + repos
|
|
3016
|
+
* 2. Check drift (compare stored metadata.json fingerprint vs computed)
|
|
3017
|
+
* 3. If drift: warn, remove all slots, rematerialize
|
|
3018
|
+
* 4. Acquire a slot (try-lock slot-0, slot-1, ..., up to maxSlots)
|
|
3019
|
+
* 5. If slot exists: reset repos, re-copy template files (skip repo directories)
|
|
3020
|
+
* 6. If new slot: copy template, materialize all repos, write metadata.json
|
|
3021
|
+
* 7. Return the slot (with path, index, isExisting)
|
|
3022
|
+
*/
|
|
3023
|
+
acquireWorkspace(options: AcquireWorkspaceOptions): Promise<PoolSlot>;
|
|
3024
|
+
/** Remove lock file to release a slot. */
|
|
3025
|
+
releaseSlot(slot: PoolSlot): Promise<void>;
|
|
3026
|
+
/**
|
|
3027
|
+
* Try to acquire a PID-based lock file.
|
|
3028
|
+
* On EEXIST, read PID and check if process is alive. If dead, stale lock — remove and retry.
|
|
3029
|
+
* Returns true if lock acquired, false if slot is actively locked.
|
|
3030
|
+
* Uses a bounded loop (max 3 attempts) to avoid unbounded recursion.
|
|
3031
|
+
*/
|
|
3032
|
+
private tryLock;
|
|
3033
|
+
/**
|
|
3034
|
+
* Check if the stored fingerprint in metadata.json differs from the computed one.
|
|
3035
|
+
* Returns true if drifted, false otherwise.
|
|
3036
|
+
* Returns false (no drift) if metadata.json doesn't exist (first use).
|
|
3037
|
+
*/
|
|
3038
|
+
private checkDrift;
|
|
3039
|
+
/** Write metadata.json with fingerprint, inputs, and timestamp. */
|
|
3040
|
+
private writeMetadata;
|
|
3041
|
+
/** Remove all slot directories and their lock files from a pool directory. */
|
|
3042
|
+
private removeAllSlots;
|
|
3043
|
+
/**
|
|
3044
|
+
* Reset an existing slot for reuse:
|
|
3045
|
+
* 1. Reset repos (git reset --hard {ref} && git clean -fd per repo)
|
|
3046
|
+
* 2. Re-copy template files (skip repo directories)
|
|
3047
|
+
*/
|
|
3048
|
+
private resetSlot;
|
|
3049
|
+
}
|
|
3050
|
+
|
|
2966
3051
|
/**
|
|
2967
3052
|
* File-based LLM response cache.
|
|
2968
3053
|
* Stores provider responses as JSON files keyed by SHA-256 hash.
|
|
@@ -3019,6 +3104,13 @@ declare function toSnakeCaseDeep(obj: unknown): unknown;
|
|
|
3019
3104
|
*/
|
|
3020
3105
|
declare function toCamelCaseDeep(obj: unknown): unknown;
|
|
3021
3106
|
|
|
3107
|
+
declare function getAgentvHome(): string;
|
|
3108
|
+
declare function getWorkspacesRoot(): string;
|
|
3109
|
+
declare function getGitCacheRoot(): string;
|
|
3110
|
+
declare function getSubagentsRoot(): string;
|
|
3111
|
+
declare function getTraceStateRoot(): string;
|
|
3112
|
+
declare function getWorkspacePoolRoot(): string;
|
|
3113
|
+
|
|
3022
3114
|
/**
|
|
3023
3115
|
* Trims an EvaluationResult for baseline storage.
|
|
3024
3116
|
* Strips large debug/audit fields (denylist approach) while preserving
|
|
@@ -3173,4 +3265,4 @@ type AgentKernel = {
|
|
|
3173
3265
|
};
|
|
3174
3266
|
declare function createAgentKernel(): AgentKernel;
|
|
3175
3267
|
|
|
3176
|
-
export { AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResetConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, getWorkspacePath, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
|
|
3268
|
+
export { type AcquireWorkspaceOptions, AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResetConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, WorkspacePoolManager, type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getGitCacheRoot, getHitCount, getSubagentsRoot, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
|
package/dist/index.d.ts
CHANGED
|
@@ -1198,6 +1198,8 @@ type ExecutionDefaults = {
|
|
|
1198
1198
|
readonly trace_file?: string;
|
|
1199
1199
|
readonly keep_workspaces?: boolean;
|
|
1200
1200
|
readonly otel_file?: string;
|
|
1201
|
+
readonly pool_workspaces?: boolean;
|
|
1202
|
+
readonly pool_slots?: number;
|
|
1201
1203
|
};
|
|
1202
1204
|
type AgentVConfig$1 = {
|
|
1203
1205
|
readonly required_version?: string;
|
|
@@ -2346,7 +2348,9 @@ declare class DeterministicAssertionEvaluator implements Evaluator {
|
|
|
2346
2348
|
|
|
2347
2349
|
declare class RepoManager {
|
|
2348
2350
|
private readonly cacheDir;
|
|
2349
|
-
|
|
2351
|
+
private readonly verbose;
|
|
2352
|
+
constructor(cacheDir?: string, verbose?: boolean);
|
|
2353
|
+
private runGit;
|
|
2350
2354
|
/**
|
|
2351
2355
|
* Ensure a bare mirror cache exists for the given source.
|
|
2352
2356
|
* Creates on first access, fetches updates on subsequent calls.
|
|
@@ -2457,6 +2461,12 @@ interface RunEvaluationOptions {
|
|
|
2457
2461
|
readonly totalBudgetUsd?: number;
|
|
2458
2462
|
/** Execution error tolerance: true halts on first error */
|
|
2459
2463
|
readonly failOnError?: FailOnError;
|
|
2464
|
+
/** Opt-in: reuse materialized workspaces across eval runs */
|
|
2465
|
+
readonly poolWorkspaces?: boolean;
|
|
2466
|
+
/** Maximum number of pool slots on disk (default: 10, max: 50) */
|
|
2467
|
+
readonly poolMaxSlots?: number;
|
|
2468
|
+
/** Pre-existing workspace directory to use directly (skips clone/copy/pool) */
|
|
2469
|
+
readonly workspace?: string;
|
|
2460
2470
|
}
|
|
2461
2471
|
declare function runEvaluation(options: RunEvaluationOptions): Promise<readonly EvaluationResult[]>;
|
|
2462
2472
|
declare function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationResult>;
|
|
@@ -2963,6 +2973,81 @@ interface ResolvedWorkspaceTemplate {
|
|
|
2963
2973
|
*/
|
|
2964
2974
|
declare function resolveWorkspaceTemplate(templatePath: string | undefined): Promise<ResolvedWorkspaceTemplate | undefined>;
|
|
2965
2975
|
|
|
2976
|
+
interface AcquireWorkspaceOptions {
|
|
2977
|
+
templatePath?: string;
|
|
2978
|
+
repos: readonly RepoConfig[];
|
|
2979
|
+
maxSlots: number;
|
|
2980
|
+
repoManager: RepoManager;
|
|
2981
|
+
}
|
|
2982
|
+
interface PoolSlot {
|
|
2983
|
+
readonly index: number;
|
|
2984
|
+
readonly path: string;
|
|
2985
|
+
readonly isExisting: boolean;
|
|
2986
|
+
readonly lockPath: string;
|
|
2987
|
+
readonly fingerprint: string;
|
|
2988
|
+
readonly poolDir: string;
|
|
2989
|
+
}
|
|
2990
|
+
/**
|
|
2991
|
+
* Compute a deterministic SHA-256 fingerprint for a workspace configuration.
|
|
2992
|
+
* The fingerprint captures template path and all repo configs in a canonical order.
|
|
2993
|
+
*/
|
|
2994
|
+
declare function computeWorkspaceFingerprint(templatePath: string | undefined | null, repos: readonly RepoConfig[]): string;
|
|
2995
|
+
/**
|
|
2996
|
+
* Pools entire workspaces (template files + git repos) for reuse across eval runs.
|
|
2997
|
+
*
|
|
2998
|
+
* Pool structure:
|
|
2999
|
+
* ```
|
|
3000
|
+
* {poolRoot}/
|
|
3001
|
+
* {fingerprint}/
|
|
3002
|
+
* metadata.json # fingerprint inputs, creation timestamp
|
|
3003
|
+
* slot-0/ # complete workspace (template files + repos)
|
|
3004
|
+
* slot-0.lock # PID-based lock file
|
|
3005
|
+
* slot-1/ # created on concurrent demand
|
|
3006
|
+
* slot-1.lock
|
|
3007
|
+
* ```
|
|
3008
|
+
*/
|
|
3009
|
+
declare class WorkspacePoolManager {
|
|
3010
|
+
private readonly poolRoot;
|
|
3011
|
+
constructor(poolRoot?: string);
|
|
3012
|
+
/**
|
|
3013
|
+
* Acquire a workspace slot from the pool.
|
|
3014
|
+
*
|
|
3015
|
+
* 1. Compute fingerprint from template + repos
|
|
3016
|
+
* 2. Check drift (compare stored metadata.json fingerprint vs computed)
|
|
3017
|
+
* 3. If drift: warn, remove all slots, rematerialize
|
|
3018
|
+
* 4. Acquire a slot (try-lock slot-0, slot-1, ..., up to maxSlots)
|
|
3019
|
+
* 5. If slot exists: reset repos, re-copy template files (skip repo directories)
|
|
3020
|
+
* 6. If new slot: copy template, materialize all repos, write metadata.json
|
|
3021
|
+
* 7. Return the slot (with path, index, isExisting)
|
|
3022
|
+
*/
|
|
3023
|
+
acquireWorkspace(options: AcquireWorkspaceOptions): Promise<PoolSlot>;
|
|
3024
|
+
/** Remove lock file to release a slot. */
|
|
3025
|
+
releaseSlot(slot: PoolSlot): Promise<void>;
|
|
3026
|
+
/**
|
|
3027
|
+
* Try to acquire a PID-based lock file.
|
|
3028
|
+
* On EEXIST, read PID and check if process is alive. If dead, stale lock — remove and retry.
|
|
3029
|
+
* Returns true if lock acquired, false if slot is actively locked.
|
|
3030
|
+
* Uses a bounded loop (max 3 attempts) to avoid unbounded recursion.
|
|
3031
|
+
*/
|
|
3032
|
+
private tryLock;
|
|
3033
|
+
/**
|
|
3034
|
+
* Check if the stored fingerprint in metadata.json differs from the computed one.
|
|
3035
|
+
* Returns true if drifted, false otherwise.
|
|
3036
|
+
* Returns false (no drift) if metadata.json doesn't exist (first use).
|
|
3037
|
+
*/
|
|
3038
|
+
private checkDrift;
|
|
3039
|
+
/** Write metadata.json with fingerprint, inputs, and timestamp. */
|
|
3040
|
+
private writeMetadata;
|
|
3041
|
+
/** Remove all slot directories and their lock files from a pool directory. */
|
|
3042
|
+
private removeAllSlots;
|
|
3043
|
+
/**
|
|
3044
|
+
* Reset an existing slot for reuse:
|
|
3045
|
+
* 1. Reset repos (git reset --hard {ref} && git clean -fd per repo)
|
|
3046
|
+
* 2. Re-copy template files (skip repo directories)
|
|
3047
|
+
*/
|
|
3048
|
+
private resetSlot;
|
|
3049
|
+
}
|
|
3050
|
+
|
|
2966
3051
|
/**
|
|
2967
3052
|
* File-based LLM response cache.
|
|
2968
3053
|
* Stores provider responses as JSON files keyed by SHA-256 hash.
|
|
@@ -3019,6 +3104,13 @@ declare function toSnakeCaseDeep(obj: unknown): unknown;
|
|
|
3019
3104
|
*/
|
|
3020
3105
|
declare function toCamelCaseDeep(obj: unknown): unknown;
|
|
3021
3106
|
|
|
3107
|
+
declare function getAgentvHome(): string;
|
|
3108
|
+
declare function getWorkspacesRoot(): string;
|
|
3109
|
+
declare function getGitCacheRoot(): string;
|
|
3110
|
+
declare function getSubagentsRoot(): string;
|
|
3111
|
+
declare function getTraceStateRoot(): string;
|
|
3112
|
+
declare function getWorkspacePoolRoot(): string;
|
|
3113
|
+
|
|
3022
3114
|
/**
|
|
3023
3115
|
* Trims an EvaluationResult for baseline storage.
|
|
3024
3116
|
* Strips large debug/audit fields (denylist approach) while preserving
|
|
@@ -3173,4 +3265,4 @@ type AgentKernel = {
|
|
|
3173
3265
|
};
|
|
3174
3266
|
declare function createAgentKernel(): AgentKernel;
|
|
3175
3267
|
|
|
3176
|
-
export { AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResetConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, getWorkspacePath, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
|
|
3268
|
+
export { type AcquireWorkspaceOptions, AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResetConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, WorkspacePoolManager, type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getGitCacheRoot, getHitCount, getSubagentsRoot, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
|