@agentv/core 4.11.2 → 4.12.0-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -3855,63 +3855,63 @@ declare function getTraceStateRoot(): string;
3855
3855
  declare function getWorkspacePoolRoot(): string;
3856
3856
 
3857
3857
  /**
3858
- * Project registry for AgentV Studio multi-project support.
3858
+ * Benchmark registry for AgentV Studio multi-benchmark support.
3859
3859
  *
3860
- * A Project = any directory containing a `.agentv/` folder.
3861
- * The registry lives at `~/.agentv/projects.yaml` and tracks registered projects.
3860
+ * A Benchmark = any directory containing a `.agentv/` folder.
3861
+ * The registry lives at `~/.agentv/projects.yaml` and tracks registered benchmarks.
3862
3862
  *
3863
3863
  * YAML format:
3864
- * projects:
3864
+ * benchmarks:
3865
3865
  * - id: my-app
3866
3866
  * name: My App
3867
3867
  * path: /home/user/projects/my-app
3868
3868
  * addedAt: "2026-03-20T10:00:00Z"
3869
3869
  * lastOpenedAt: "2026-03-30T14:00:00Z"
3870
3870
  *
3871
- * To extend: use loadProjectRegistry() / saveProjectRegistry() for CRUD,
3872
- * discoverProjects() to scan a directory tree for `.agentv/` directories.
3871
+ * To extend: use loadBenchmarkRegistry() / saveBenchmarkRegistry() for CRUD,
3872
+ * discoverBenchmarks() to scan a directory tree for `.agentv/` directories.
3873
3873
  */
3874
- interface ProjectEntry {
3874
+ interface BenchmarkEntry {
3875
3875
  id: string;
3876
3876
  name: string;
3877
3877
  path: string;
3878
3878
  addedAt: string;
3879
3879
  lastOpenedAt: string;
3880
3880
  }
3881
- interface ProjectRegistry {
3882
- projects: ProjectEntry[];
3881
+ interface BenchmarkRegistry {
3882
+ benchmarks: BenchmarkEntry[];
3883
3883
  }
3884
- declare function getProjectsRegistryPath(): string;
3885
- declare function loadProjectRegistry(): ProjectRegistry;
3886
- declare function saveProjectRegistry(registry: ProjectRegistry): void;
3884
+ declare function getBenchmarksRegistryPath(): string;
3885
+ declare function loadBenchmarkRegistry(): BenchmarkRegistry;
3886
+ declare function saveBenchmarkRegistry(registry: BenchmarkRegistry): void;
3887
3887
  /**
3888
- * Derive a URL-safe project ID from a directory path.
3888
+ * Derive a URL-safe benchmark ID from a directory path.
3889
3889
  * Uses the directory basename, lowercased, with non-alphanumeric chars replaced by hyphens.
3890
3890
  * Appends a numeric suffix if the ID already exists in the registry.
3891
3891
  */
3892
- declare function deriveProjectId(dirPath: string, existingIds: string[]): string;
3892
+ declare function deriveBenchmarkId(dirPath: string, existingIds: string[]): string;
3893
3893
  /**
3894
- * Register a project by path. Returns the new entry, or the existing one if already registered.
3894
+ * Register a benchmark by path. Returns the new entry, or the existing one if already registered.
3895
3895
  * Validates that the path exists and contains a `.agentv/` directory.
3896
3896
  */
3897
- declare function addProject(projectPath: string): ProjectEntry;
3897
+ declare function addBenchmark(benchmarkPath: string): BenchmarkEntry;
3898
3898
  /**
3899
- * Remove a project by ID. Returns true if removed, false if not found.
3899
+ * Remove a benchmark by ID. Returns true if removed, false if not found.
3900
3900
  */
3901
- declare function removeProject(projectId: string): boolean;
3901
+ declare function removeBenchmark(benchmarkId: string): boolean;
3902
3902
  /**
3903
- * Look up a project by ID. Returns undefined if not found.
3903
+ * Look up a benchmark by ID. Returns undefined if not found.
3904
3904
  */
3905
- declare function getProject(projectId: string): ProjectEntry | undefined;
3905
+ declare function getBenchmark(benchmarkId: string): BenchmarkEntry | undefined;
3906
3906
  /**
3907
- * Update lastOpenedAt for a project.
3907
+ * Update lastOpenedAt for a benchmark.
3908
3908
  */
3909
- declare function touchProject(projectId: string): void;
3909
+ declare function touchBenchmark(benchmarkId: string): void;
3910
3910
  /**
3911
3911
  * Scan a directory tree (up to maxDepth levels) for directories containing `.agentv/`.
3912
- * Returns absolute paths of discovered project directories.
3912
+ * Returns absolute paths of discovered benchmark directories.
3913
3913
  */
3914
- declare function discoverProjects(rootDir: string, maxDepth?: number): string[];
3914
+ declare function discoverBenchmarks(rootDir: string, maxDepth?: number): string[];
3915
3915
 
3916
3916
  /**
3917
3917
  * Trims an EvaluationResult for baseline storage.
@@ -4372,4 +4372,4 @@ type AgentKernel = {
4372
4372
  };
4373
4373
  declare function createAgentKernel(): AgentKernel;
4374
4374
 
4375
- export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildEvaluatorResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_THRESHOLD, type DepsScanResult, DeterministicAssertionEvaluator, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type PreparedResultsRepoBranch, type ProgressEvent, type ProjectEntry, type ProjectRegistry, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addProject, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveCategory, deriveProjectId, detectFormat, directorySizeBytes, discoverAssertions, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverGraders as discoverJudges, discoverProjects, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getProject, getProjectsRegistryPath, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadProjectRegistry, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseJsonFromText, parseJsonSafe, prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeProject, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveProjectRegistry, scanRepoDeps, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLine, tokensPerTool, touchProject, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
4375
+ export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type BenchmarkEntry, type BenchmarkRegistry, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildEvaluatorResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_THRESHOLD, type DepsScanResult, DeterministicAssertionEvaluator, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type PreparedResultsRepoBranch, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addBenchmark, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveBenchmarkId, deriveCategory, detectFormat, directorySizeBytes, discoverAssertions, discoverBenchmarks, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getBenchmark, getBenchmarksRegistryPath, getOutputFilenames, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadBenchmarkRegistry, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseJsonFromText, parseJsonSafe, prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeBenchmark, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveBenchmarkRegistry, scanRepoDeps, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLine, tokensPerTool, touchBenchmark, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
package/dist/index.d.ts CHANGED
@@ -3855,63 +3855,63 @@ declare function getTraceStateRoot(): string;
3855
3855
  declare function getWorkspacePoolRoot(): string;
3856
3856
 
3857
3857
  /**
3858
- * Project registry for AgentV Studio multi-project support.
3858
+ * Benchmark registry for AgentV Studio multi-benchmark support.
3859
3859
  *
3860
- * A Project = any directory containing a `.agentv/` folder.
3861
- * The registry lives at `~/.agentv/projects.yaml` and tracks registered projects.
3860
+ * A Benchmark = any directory containing a `.agentv/` folder.
3861
+ * The registry lives at `~/.agentv/projects.yaml` and tracks registered benchmarks.
3862
3862
  *
3863
3863
  * YAML format:
3864
- * projects:
3864
+ * benchmarks:
3865
3865
  * - id: my-app
3866
3866
  * name: My App
3867
3867
  * path: /home/user/projects/my-app
3868
3868
  * addedAt: "2026-03-20T10:00:00Z"
3869
3869
  * lastOpenedAt: "2026-03-30T14:00:00Z"
3870
3870
  *
3871
- * To extend: use loadProjectRegistry() / saveProjectRegistry() for CRUD,
3872
- * discoverProjects() to scan a directory tree for `.agentv/` directories.
3871
+ * To extend: use loadBenchmarkRegistry() / saveBenchmarkRegistry() for CRUD,
3872
+ * discoverBenchmarks() to scan a directory tree for `.agentv/` directories.
3873
3873
  */
3874
- interface ProjectEntry {
3874
+ interface BenchmarkEntry {
3875
3875
  id: string;
3876
3876
  name: string;
3877
3877
  path: string;
3878
3878
  addedAt: string;
3879
3879
  lastOpenedAt: string;
3880
3880
  }
3881
- interface ProjectRegistry {
3882
- projects: ProjectEntry[];
3881
+ interface BenchmarkRegistry {
3882
+ benchmarks: BenchmarkEntry[];
3883
3883
  }
3884
- declare function getProjectsRegistryPath(): string;
3885
- declare function loadProjectRegistry(): ProjectRegistry;
3886
- declare function saveProjectRegistry(registry: ProjectRegistry): void;
3884
+ declare function getBenchmarksRegistryPath(): string;
3885
+ declare function loadBenchmarkRegistry(): BenchmarkRegistry;
3886
+ declare function saveBenchmarkRegistry(registry: BenchmarkRegistry): void;
3887
3887
  /**
3888
- * Derive a URL-safe project ID from a directory path.
3888
+ * Derive a URL-safe benchmark ID from a directory path.
3889
3889
  * Uses the directory basename, lowercased, with non-alphanumeric chars replaced by hyphens.
3890
3890
  * Appends a numeric suffix if the ID already exists in the registry.
3891
3891
  */
3892
- declare function deriveProjectId(dirPath: string, existingIds: string[]): string;
3892
+ declare function deriveBenchmarkId(dirPath: string, existingIds: string[]): string;
3893
3893
  /**
3894
- * Register a project by path. Returns the new entry, or the existing one if already registered.
3894
+ * Register a benchmark by path. Returns the new entry, or the existing one if already registered.
3895
3895
  * Validates that the path exists and contains a `.agentv/` directory.
3896
3896
  */
3897
- declare function addProject(projectPath: string): ProjectEntry;
3897
+ declare function addBenchmark(benchmarkPath: string): BenchmarkEntry;
3898
3898
  /**
3899
- * Remove a project by ID. Returns true if removed, false if not found.
3899
+ * Remove a benchmark by ID. Returns true if removed, false if not found.
3900
3900
  */
3901
- declare function removeProject(projectId: string): boolean;
3901
+ declare function removeBenchmark(benchmarkId: string): boolean;
3902
3902
  /**
3903
- * Look up a project by ID. Returns undefined if not found.
3903
+ * Look up a benchmark by ID. Returns undefined if not found.
3904
3904
  */
3905
- declare function getProject(projectId: string): ProjectEntry | undefined;
3905
+ declare function getBenchmark(benchmarkId: string): BenchmarkEntry | undefined;
3906
3906
  /**
3907
- * Update lastOpenedAt for a project.
3907
+ * Update lastOpenedAt for a benchmark.
3908
3908
  */
3909
- declare function touchProject(projectId: string): void;
3909
+ declare function touchBenchmark(benchmarkId: string): void;
3910
3910
  /**
3911
3911
  * Scan a directory tree (up to maxDepth levels) for directories containing `.agentv/`.
3912
- * Returns absolute paths of discovered project directories.
3912
+ * Returns absolute paths of discovered benchmark directories.
3913
3913
  */
3914
- declare function discoverProjects(rootDir: string, maxDepth?: number): string[];
3914
+ declare function discoverBenchmarks(rootDir: string, maxDepth?: number): string[];
3915
3915
 
3916
3916
  /**
3917
3917
  * Trims an EvaluationResult for baseline storage.
@@ -4372,4 +4372,4 @@ type AgentKernel = {
4372
4372
  };
4373
4373
  declare function createAgentKernel(): AgentKernel;
4374
4374
 
4375
- export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildEvaluatorResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_THRESHOLD, type DepsScanResult, DeterministicAssertionEvaluator, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type PreparedResultsRepoBranch, type ProgressEvent, type ProjectEntry, type ProjectRegistry, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addProject, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveCategory, deriveProjectId, detectFormat, directorySizeBytes, discoverAssertions, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverGraders as discoverJudges, discoverProjects, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getProject, getProjectsRegistryPath, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadProjectRegistry, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseJsonFromText, parseJsonSafe, prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeProject, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveProjectRegistry, scanRepoDeps, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLine, tokensPerTool, touchProject, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
4375
+ export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type BenchmarkEntry, type BenchmarkRegistry, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildEvaluatorResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_THRESHOLD, type DepsScanResult, DeterministicAssertionEvaluator, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type PreparedResultsRepoBranch, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addBenchmark, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveBenchmarkId, deriveCategory, detectFormat, directorySizeBytes, discoverAssertions, discoverBenchmarks, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getBenchmark, getBenchmarksRegistryPath, getOutputFilenames, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadBenchmarkRegistry, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseJsonFromText, parseJsonSafe, prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeBenchmark, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveBenchmarkRegistry, scanRepoDeps, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLine, tokensPerTool, touchBenchmark, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
package/dist/index.js CHANGED
@@ -19179,40 +19179,40 @@ async function createDraftResultsPr(params) {
19179
19179
  return stdout.trim();
19180
19180
  }
19181
19181
 
19182
- // src/projects.ts
19182
+ // src/benchmarks.ts
19183
19183
  import { existsSync as existsSync8, mkdirSync as mkdirSync3, readFileSync as readFileSync4, readdirSync as readdirSync3, statSync as statSync2, writeFileSync as writeFileSync2 } from "node:fs";
19184
19184
  import path50 from "node:path";
19185
19185
  import { parse as parseYaml3, stringify as stringifyYaml } from "yaml";
19186
- function getProjectsRegistryPath() {
19186
+ function getBenchmarksRegistryPath() {
19187
19187
  return path50.join(getAgentvHome(), "projects.yaml");
19188
19188
  }
19189
- function loadProjectRegistry() {
19190
- const registryPath = getProjectsRegistryPath();
19189
+ function loadBenchmarkRegistry() {
19190
+ const registryPath = getBenchmarksRegistryPath();
19191
19191
  if (!existsSync8(registryPath)) {
19192
- return { projects: [] };
19192
+ return { benchmarks: [] };
19193
19193
  }
19194
19194
  try {
19195
19195
  const raw = readFileSync4(registryPath, "utf-8");
19196
19196
  const parsed = parseYaml3(raw);
19197
- if (!parsed || !Array.isArray(parsed.projects)) {
19198
- return { projects: [] };
19197
+ if (!parsed || !Array.isArray(parsed.benchmarks)) {
19198
+ return { benchmarks: [] };
19199
19199
  }
19200
- return { projects: parsed.projects };
19200
+ return { benchmarks: parsed.benchmarks };
19201
19201
  } catch {
19202
- return { projects: [] };
19202
+ return { benchmarks: [] };
19203
19203
  }
19204
19204
  }
19205
- function saveProjectRegistry(registry) {
19206
- const registryPath = getProjectsRegistryPath();
19205
+ function saveBenchmarkRegistry(registry) {
19206
+ const registryPath = getBenchmarksRegistryPath();
19207
19207
  const dir = path50.dirname(registryPath);
19208
19208
  if (!existsSync8(dir)) {
19209
19209
  mkdirSync3(dir, { recursive: true });
19210
19210
  }
19211
- writeFileSync2(registryPath, stringifyYaml(registry), "utf-8");
19211
+ writeFileSync2(registryPath, stringifyYaml({ benchmarks: registry.benchmarks }), "utf-8");
19212
19212
  }
19213
- function deriveProjectId(dirPath, existingIds) {
19213
+ function deriveBenchmarkId(dirPath, existingIds) {
19214
19214
  const base = path50.basename(dirPath).toLowerCase().replace(/[^a-z0-9-]/g, "-").replace(/-+/g, "-").replace(/^-|-$/g, "");
19215
- let candidate = base || "project";
19215
+ let candidate = base || "benchmark";
19216
19216
  let suffix = 2;
19217
19217
  while (existingIds.includes(candidate)) {
19218
19218
  candidate = `${base}-${suffix}`;
@@ -19220,54 +19220,54 @@ function deriveProjectId(dirPath, existingIds) {
19220
19220
  }
19221
19221
  return candidate;
19222
19222
  }
19223
- function addProject(projectPath) {
19224
- const absPath = path50.resolve(projectPath);
19223
+ function addBenchmark(benchmarkPath) {
19224
+ const absPath = path50.resolve(benchmarkPath);
19225
19225
  if (!existsSync8(absPath)) {
19226
19226
  throw new Error(`Directory not found: ${absPath}`);
19227
19227
  }
19228
19228
  if (!existsSync8(path50.join(absPath, ".agentv"))) {
19229
19229
  throw new Error(`No .agentv/ directory found in ${absPath}. Run an evaluation first.`);
19230
19230
  }
19231
- const registry = loadProjectRegistry();
19232
- const existing = registry.projects.find((p) => p.path === absPath);
19231
+ const registry = loadBenchmarkRegistry();
19232
+ const existing = registry.benchmarks.find((p) => p.path === absPath);
19233
19233
  if (existing) {
19234
19234
  return existing;
19235
19235
  }
19236
19236
  const now = (/* @__PURE__ */ new Date()).toISOString();
19237
19237
  const entry = {
19238
- id: deriveProjectId(
19238
+ id: deriveBenchmarkId(
19239
19239
  absPath,
19240
- registry.projects.map((p) => p.id)
19240
+ registry.benchmarks.map((p) => p.id)
19241
19241
  ),
19242
19242
  name: path50.basename(absPath),
19243
19243
  path: absPath,
19244
19244
  addedAt: now,
19245
19245
  lastOpenedAt: now
19246
19246
  };
19247
- registry.projects.push(entry);
19248
- saveProjectRegistry(registry);
19247
+ registry.benchmarks.push(entry);
19248
+ saveBenchmarkRegistry(registry);
19249
19249
  return entry;
19250
19250
  }
19251
- function removeProject(projectId) {
19252
- const registry = loadProjectRegistry();
19253
- const idx = registry.projects.findIndex((p) => p.id === projectId);
19251
+ function removeBenchmark(benchmarkId) {
19252
+ const registry = loadBenchmarkRegistry();
19253
+ const idx = registry.benchmarks.findIndex((p) => p.id === benchmarkId);
19254
19254
  if (idx < 0) return false;
19255
- registry.projects.splice(idx, 1);
19256
- saveProjectRegistry(registry);
19255
+ registry.benchmarks.splice(idx, 1);
19256
+ saveBenchmarkRegistry(registry);
19257
19257
  return true;
19258
19258
  }
19259
- function getProject(projectId) {
19260
- return loadProjectRegistry().projects.find((p) => p.id === projectId);
19259
+ function getBenchmark(benchmarkId) {
19260
+ return loadBenchmarkRegistry().benchmarks.find((p) => p.id === benchmarkId);
19261
19261
  }
19262
- function touchProject(projectId) {
19263
- const registry = loadProjectRegistry();
19264
- const entry = registry.projects.find((p) => p.id === projectId);
19262
+ function touchBenchmark(benchmarkId) {
19263
+ const registry = loadBenchmarkRegistry();
19264
+ const entry = registry.benchmarks.find((p) => p.id === benchmarkId);
19265
19265
  if (entry) {
19266
19266
  entry.lastOpenedAt = (/* @__PURE__ */ new Date()).toISOString();
19267
- saveProjectRegistry(registry);
19267
+ saveBenchmarkRegistry(registry);
19268
19268
  }
19269
19269
  }
19270
- function discoverProjects(rootDir, maxDepth = 2) {
19270
+ function discoverBenchmarks(rootDir, maxDepth = 2) {
19271
19271
  const absRoot = path50.resolve(rootDir);
19272
19272
  if (!existsSync8(absRoot) || !statSync2(absRoot).isDirectory()) {
19273
19273
  return [];
@@ -20434,7 +20434,7 @@ export {
20434
20434
  TranscriptProvider,
20435
20435
  WorkspaceCreationError,
20436
20436
  WorkspacePoolManager,
20437
- addProject,
20437
+ addBenchmark,
20438
20438
  assembleLlmGraderPrompt,
20439
20439
  assembleLlmGraderPrompt as assembleLlmJudgePrompt,
20440
20440
  avgToolDurationMs,
@@ -20466,17 +20466,17 @@ export {
20466
20466
  createTempWorkspace,
20467
20467
  deepEqual,
20468
20468
  defineConfig,
20469
+ deriveBenchmarkId,
20469
20470
  deriveCategory,
20470
- deriveProjectId,
20471
20471
  detectFormat,
20472
20472
  directorySizeBytes,
20473
20473
  discoverAssertions,
20474
+ discoverBenchmarks,
20474
20475
  discoverClaudeSessions,
20475
20476
  discoverCodexSessions,
20476
20477
  discoverCopilotSessions,
20477
20478
  discoverGraders,
20478
20479
  discoverGraders as discoverJudges,
20479
- discoverProjects,
20480
20480
  discoverProviders,
20481
20481
  ensureResultsRepoClone,
20482
20482
  ensureVSCodeSubagents,
@@ -20500,9 +20500,9 @@ export {
20500
20500
  freeformEvaluationSchema,
20501
20501
  generateRubrics,
20502
20502
  getAgentvHome,
20503
+ getBenchmark,
20504
+ getBenchmarksRegistryPath,
20503
20505
  getOutputFilenames,
20504
- getProject,
20505
- getProjectsRegistryPath,
20506
20506
  getResultsRepoCachePaths,
20507
20507
  getResultsRepoStatus,
20508
20508
  getSubagentsRoot,
@@ -20522,11 +20522,11 @@ export {
20522
20522
  isTestMessage,
20523
20523
  isTestMessageRole,
20524
20524
  listTargetNames,
20525
+ loadBenchmarkRegistry,
20525
20526
  loadConfig,
20526
20527
  loadEvalCaseById,
20527
20528
  loadEvalCases,
20528
20529
  loadEvalSuite,
20529
- loadProjectRegistry,
20530
20530
  loadTestById,
20531
20531
  loadTestSuite,
20532
20532
  loadTests,
@@ -20549,7 +20549,7 @@ export {
20549
20549
  readTextFile,
20550
20550
  readTranscriptFile,
20551
20551
  readTranscriptJsonl,
20552
- removeProject,
20552
+ removeBenchmark,
20553
20553
  resolveAndCreateProvider,
20554
20554
  resolveDelegatedTargetDefinition,
20555
20555
  resolveFileReference,
@@ -20571,7 +20571,7 @@ export {
20571
20571
  runIsJsonAssertion,
20572
20572
  runRegexAssertion,
20573
20573
  runStartsWithAssertion,
20574
- saveProjectRegistry,
20574
+ saveBenchmarkRegistry,
20575
20575
  scanRepoDeps,
20576
20576
  scoreToVerdict,
20577
20577
  shouldEnableCache,
@@ -20588,7 +20588,7 @@ export {
20588
20588
  toSnakeCaseDeep,
20589
20589
  toTranscriptJsonLine,
20590
20590
  tokensPerTool,
20591
- touchProject,
20591
+ touchBenchmark,
20592
20592
  transpileEvalYaml,
20593
20593
  transpileEvalYamlFile,
20594
20594
  trimBaselineResult