@agentv/core 4.28.0 → 4.29.0-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -4257,118 +4257,127 @@ declare function getTraceStateRoot(): string;
4257
4257
  declare function getWorkspacePoolRoot(): string;
4258
4258
 
4259
4259
  /**
4260
- * Benchmark registry for AgentV Studio multi-benchmark support.
4260
+ * Project registry for AgentV Studio multi-project support.
4261
4261
  *
4262
- * A Benchmark = any directory containing a `.agentv/` folder.
4263
- * The registry lives at `~/.agentv/benchmarks.yaml` and is the single source of
4264
- * truth for which benchmarks Studio shows. Studio re-reads the file on every
4265
- * `/api/benchmarks` request, so edits (direct, via POST /api/benchmarks, via
4262
+ * A Project = any directory containing a `.agentv/` folder. Projects hold
4263
+ * eval runs, and (incrementally) traces, spans, and other telemetry,
4264
+ * matching the "project" terminology used by Arize Phoenix, Langfuse,
4265
+ * Braintrust, W&B Weave, and LangSmith.
4266
+ *
4267
+ * The registry lives at `~/.agentv/projects.yaml` and is the single source
4268
+ * of truth for which projects Studio shows. Studio re-reads the file on every
4269
+ * `/api/projects` request, so edits (direct, via POST /api/projects, via
4266
4270
  * the CLI's --add/--remove, or via a Kubernetes ConfigMap mount) are reflected
4267
4271
  * without restarting `agentv serve`.
4268
4272
  *
4269
4273
  * YAML format (all keys snake_case per AGENTS.md §"Wire Format Convention"):
4270
- * benchmarks:
4274
+ * projects:
4271
4275
  * - id: my-app
4272
4276
  * name: My App
4273
4277
  * path: /home/user/projects/my-app
4274
4278
  * source:
4275
- * url: ${{ BENCHMARK_REPO_URL }}
4276
- * ref: ${{ BENCHMARK_REPO_REF:-main }}
4279
+ * url: ${{ PROJECT_REPO_URL }}
4280
+ * ref: ${{ PROJECT_REPO_REF:-main }}
4277
4281
  * added_at: "2026-03-20T10:00:00Z"
4278
4282
  * last_opened_at: "2026-03-30T14:00:00Z"
4279
4283
  *
4280
- * The optional `source` field enables remote sync via syncBenchmarks():
4284
+ * The optional `source` field enables remote sync via syncProjects():
4281
4285
  * first run — git clone --depth 1 --filter=blob:none
4282
4286
  * subsequent runs — git pull --ff-only
4283
4287
  *
4284
4288
  * Concurrency: the registry assumes a single writer. All mutating calls
4285
- * (add/remove/touchBenchmark) do read-modify-write on benchmarks.yaml
4289
+ * (add/remove/touchProject) do read-modify-write on projects.yaml
4286
4290
  * without a lock. Studio's HTTP handlers are serialized by Node's
4287
4291
  * single-threaded event loop, which satisfies the 24/7 deployment case.
4288
4292
  * Run only one `agentv` process against a given home at a time.
4289
4293
  *
4294
+ * Legacy registry filename: the registry used to be called `benchmarks.yaml`
4295
+ * with a top-level `benchmarks:` key. On first load, a one-time migration
4296
+ * detects the old file, rewrites the top-level key to `projects:`, and
4297
+ * atomically renames the file. See migrateLegacyBenchmarksFile() below.
4298
+ *
4290
4299
  * To extend:
4291
- * - CRUD: loadBenchmarkRegistry() / saveBenchmarkRegistry() + the
4300
+ * - CRUD: loadProjectRegistry() / saveProjectRegistry() + the
4292
4301
  * add/remove/touch helpers.
4293
- * - discoverBenchmarks() is a one-shot filesystem utility for bulk
4302
+ * - discoverProjects() is a one-shot filesystem utility for bulk
4294
4303
  * registration; it does not run in the request path.
4295
4304
  */
4296
- interface BenchmarkSource {
4305
+ interface ProjectSource {
4297
4306
  url: string;
4298
4307
  ref: string;
4299
4308
  }
4300
- interface BenchmarkEntry {
4309
+ interface ProjectEntry {
4301
4310
  id: string;
4302
4311
  name: string;
4303
4312
  path: string;
4304
4313
  addedAt: string;
4305
4314
  lastOpenedAt: string;
4306
- source?: BenchmarkSource;
4315
+ source?: ProjectSource;
4307
4316
  }
4308
- interface BenchmarkRegistry {
4309
- benchmarks: BenchmarkEntry[];
4317
+ interface ProjectRegistry {
4318
+ projects: ProjectEntry[];
4310
4319
  }
4311
- declare function getBenchmarksRegistryPath(): string;
4312
- declare function loadBenchmarkRegistry(): BenchmarkRegistry;
4313
- declare function saveBenchmarkRegistry(registry: BenchmarkRegistry): void;
4320
+ declare function getProjectsRegistryPath(): string;
4321
+ declare function loadProjectRegistry(): ProjectRegistry;
4322
+ declare function saveProjectRegistry(registry: ProjectRegistry): void;
4314
4323
  /**
4315
- * Derive a URL-safe benchmark ID from a directory path.
4324
+ * Derive a URL-safe project ID from a directory path.
4316
4325
  * Uses the directory basename, lowercased, with non-alphanumeric chars replaced by hyphens.
4317
4326
  * Appends a numeric suffix if the ID already exists in the registry.
4318
4327
  */
4319
- declare function deriveBenchmarkId(dirPath: string, existingIds: string[]): string;
4328
+ declare function deriveProjectId(dirPath: string, existingIds: string[]): string;
4320
4329
  /**
4321
- * Register a benchmark by path. Returns the new entry, or the existing one if already registered.
4330
+ * Register a project by path. Returns the new entry, or the existing one if already registered.
4322
4331
  * Validates that the path exists and contains a `.agentv/` directory.
4323
4332
  */
4324
- declare function addBenchmark(benchmarkPath: string): BenchmarkEntry;
4333
+ declare function addProject(projectPath: string): ProjectEntry;
4325
4334
  /**
4326
- * Remove a benchmark by ID. Returns true if removed, false if not found.
4335
+ * Remove a project by ID. Returns true if removed, false if not found.
4327
4336
  */
4328
- declare function removeBenchmark(benchmarkId: string): boolean;
4337
+ declare function removeProject(projectId: string): boolean;
4329
4338
  /**
4330
- * Look up a benchmark by ID. Returns undefined if not found.
4339
+ * Look up a project by ID. Returns undefined if not found.
4331
4340
  */
4332
- declare function getBenchmark(benchmarkId: string): BenchmarkEntry | undefined;
4341
+ declare function getProject(projectId: string): ProjectEntry | undefined;
4333
4342
  /**
4334
- * Update lastOpenedAt for a benchmark.
4343
+ * Update lastOpenedAt for a project.
4335
4344
  */
4336
- declare function touchBenchmark(benchmarkId: string): void;
4345
+ declare function touchProject(projectId: string): void;
4337
4346
  /**
4338
4347
  * Scan a directory tree (up to maxDepth levels) for directories containing `.agentv/`.
4339
- * Returns absolute paths of discovered benchmark directories, sorted for
4348
+ * Returns absolute paths of discovered project directories, sorted for
4340
4349
  * deterministic iteration. This is a one-shot helper for bulk registration;
4341
4350
  * Studio does not scan at request time.
4342
4351
  */
4343
- declare function discoverBenchmarks(rootDir: string, maxDepth?: number): string[];
4352
+ declare function discoverProjects(rootDir: string, maxDepth?: number): string[];
4344
4353
 
4345
4354
  /**
4346
- * Benchmark sync — pulls remote git repos to the local path declared in the
4347
- * benchmark registry before Studio/eval startup.
4355
+ * Project sync — pulls remote git repos to the local path declared in the
4356
+ * project registry before Studio/eval startup.
4348
4357
  *
4349
4358
  * Sync is one-shot only, triggered by the Studio UI "Sync" button or the
4350
- * `agentv benchmark sync` CLI command. There is no daemon or continuous mode.
4359
+ * `agentv project sync` CLI command. There is no daemon or continuous mode.
4351
4360
  *
4352
4361
  * First run — git clone --depth 1 --filter=blob:none --branch <ref> <url> <path>
4353
4362
  * Subsequent — git pull --ff-only (when <path>/.git already exists)
4354
4363
  *
4355
4364
  * Usage:
4356
- * import { syncBenchmarks } from './benchmark-sync.js';
4357
- * await syncBenchmarks(registry.benchmarks);
4365
+ * import { syncProjects } from './project-sync.js';
4366
+ * await syncProjects(registry.projects);
4358
4367
  */
4359
4368
 
4360
4369
  /**
4361
- * Clone or pull a single benchmark entry from its declared source.
4370
+ * Clone or pull a single project entry from its declared source.
4362
4371
  * - No .git present: shallow clone into entry.path.
4363
4372
  * - .git present: git pull --ff-only to update in place.
4364
4373
  * Throws on git error or missing source.
4365
4374
  */
4366
- declare function syncBenchmark(entry: BenchmarkEntry): Promise<void>;
4375
+ declare function syncProject(entry: ProjectEntry): Promise<void>;
4367
4376
  /**
4368
- * Iterate benchmark entries and sync any that have a source declared.
4377
+ * Iterate project entries and sync any that have a source declared.
4369
4378
  * Entries without source are skipped silently.
4370
4379
  */
4371
- declare function syncBenchmarks(entries: BenchmarkEntry[]): Promise<void>;
4380
+ declare function syncProjects(entries: ProjectEntry[]): Promise<void>;
4372
4381
 
4373
4382
  /**
4374
4383
  * Trims an EvaluationResult for baseline storage.
@@ -4942,4 +4951,4 @@ type AgentKernel = {
4942
4951
  };
4943
4952
  declare function createAgentKernel(): AgentKernel;
4944
4953
 
4945
- export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type BenchmarkEntry, type BenchmarkRegistry, type BenchmarkSource, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildGraderResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeGrader, type CodeGraderConfig, type CodeGraderOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeGrader, type CompositeGraderConfig, type CompositeGraderOptions, type ConfidenceIntervalAggregation, type ContainsAllGraderConfig, type ContainsAnyGraderConfig, type ContainsGraderConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type ConversationAggregation, type ConversationMode, type ConversationTurn, type ConversationTurnInput, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostGrader, type CostGraderConfig, type CostGraderOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_GRADER_TEMPLATE, DEFAULT_THRESHOLD, type DependencyFailurePolicy, type DependencyResult, type DepsScanResult, DeterministicAssertionGrader, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithGraderConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsGraderConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type 
EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTargetRef, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsGrader, type ExecutionMetricsGraderConfig, type ExecutionMetricsGraderOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyGrader, type FieldAccuracyGraderConfig, type FieldAccuracyGraderOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type Grader, type GraderConfig, type GraderDispatchContext, type GraderFactory, type GraderFactoryFn, type GraderKind, GraderRegistry, type GraderResult, type IcontainsAllGraderConfig, type IcontainsAnyGraderConfig, type IcontainsGraderConfig, type InlineAssertEvaluatorConfig, type IsJsonGraderConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyGrader, type LatencyGraderConfig, type LatencyGraderOptions, LlmGrader, type LlmGraderConfig, type LlmGraderOptions, type LlmGraderPromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type PreparedResultsRepoBranch, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type 
RegexGraderConfig, type RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, RunBudgetTracker, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerGrader, type SkillTriggerGraderConfig, type StartsWithGraderConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TargetHooksConfig, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageGrader, type TokenUsageGraderConfig, type TokenUsageGraderOptions, type ToolCall, type ToolTestMessage, type ToolTrajectoryExpectedItem, ToolTrajectoryGrader, type ToolTrajectoryGraderConfig, type ToolTrajectoryGraderOptions, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptReplayEntry, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type TsEvalResult, type TurnFailurePolicy, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceEnvConfig, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addBenchmark, assembleLlmGraderPrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, 
consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveBenchmarkId, deriveCategory, detectFormat, directPushResults, directorySizeBytes, discoverAssertions, discoverBenchmarks, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetRefsFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, formatToolCalls, freeformEvaluationSchema, generateRubrics, getAgentvConfigDir, getAgentvHome, getBenchmark, getBenchmarksRegistryPath, getOutputFilenames, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, groupTranscriptJsonLines, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isGraderKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, killAllTrackedChildren, listTargetNames, loadBenchmarkRegistry, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, loadTsEvalFile, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseEnvOutput, parseJsonFromText, parseJsonSafe, parseYamlValue, prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeBenchmark, resolveAndCreateProvider, 
resolveDelegatedTargetDefinition, resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runBeforeSessionHook, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveBenchmarkRegistry, scanRepoDeps, scoreRangeEvaluationSchema, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncBenchmark, syncBenchmarks, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLines, tokensPerTool, touchBenchmark, trackChild, trackedChildCount, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
4954
+ export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildGraderResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeGrader, type CodeGraderConfig, type CodeGraderOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeGrader, type CompositeGraderConfig, type CompositeGraderOptions, type ConfidenceIntervalAggregation, type ContainsAllGraderConfig, type ContainsAnyGraderConfig, type ContainsGraderConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type ConversationAggregation, type ConversationMode, type ConversationTurn, type ConversationTurnInput, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostGrader, type CostGraderConfig, type CostGraderOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_GRADER_TEMPLATE, DEFAULT_THRESHOLD, type DependencyFailurePolicy, type DependencyResult, type DepsScanResult, DeterministicAssertionGrader, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithGraderConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsGraderConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, 
type EvalTargetRef, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsGrader, type ExecutionMetricsGraderConfig, type ExecutionMetricsGraderOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyGrader, type FieldAccuracyGraderConfig, type FieldAccuracyGraderOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type Grader, type GraderConfig, type GraderDispatchContext, type GraderFactory, type GraderFactoryFn, type GraderKind, GraderRegistry, type GraderResult, type IcontainsAllGraderConfig, type IcontainsAnyGraderConfig, type IcontainsGraderConfig, type InlineAssertEvaluatorConfig, type IsJsonGraderConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyGrader, type LatencyGraderConfig, type LatencyGraderOptions, LlmGrader, type LlmGraderConfig, type LlmGraderOptions, type LlmGraderPromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type PreparedResultsRepoBranch, type ProgressEvent, type ProjectEntry, type ProjectRegistry, type ProjectSource, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexGraderConfig, 
type RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, RunBudgetTracker, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerGrader, type SkillTriggerGraderConfig, type StartsWithGraderConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TargetHooksConfig, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageGrader, type TokenUsageGraderConfig, type TokenUsageGraderOptions, type ToolCall, type ToolTestMessage, type ToolTrajectoryExpectedItem, ToolTrajectoryGrader, type ToolTrajectoryGraderConfig, type ToolTrajectoryGraderOptions, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptReplayEntry, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type TsEvalResult, type TurnFailurePolicy, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceEnvConfig, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addProject, assembleLlmGraderPrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, 
consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveCategory, deriveProjectId, detectFormat, directPushResults, directorySizeBytes, discoverAssertions, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverProjects, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetRefsFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, formatToolCalls, freeformEvaluationSchema, generateRubrics, getAgentvConfigDir, getAgentvHome, getOutputFilenames, getProject, getProjectsRegistryPath, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, groupTranscriptJsonLines, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isGraderKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, killAllTrackedChildren, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadProjectRegistry, loadTestById, loadTestSuite, loadTests, loadTsConfig, loadTsEvalFile, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseEnvOutput, parseJsonFromText, parseJsonSafe, parseYamlValue, prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeProject, resolveAndCreateProvider, resolveDelegatedTargetDefinition, 
resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runBeforeSessionHook, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveProjectRegistry, scanRepoDeps, scoreRangeEvaluationSchema, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncProject, syncProjects, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLines, tokensPerTool, touchProject, trackChild, trackedChildCount, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
package/dist/index.d.ts CHANGED
@@ -4257,118 +4257,127 @@ declare function getTraceStateRoot(): string;
4257
4257
  declare function getWorkspacePoolRoot(): string;
4258
4258
 
4259
4259
  /**
4260
- * Benchmark registry for AgentV Studio multi-benchmark support.
4260
+ * Project registry for AgentV Studio multi-project support.
4261
4261
  *
4262
- * A Benchmark = any directory containing a `.agentv/` folder.
4263
- * The registry lives at `~/.agentv/benchmarks.yaml` and is the single source of
4264
- * truth for which benchmarks Studio shows. Studio re-reads the file on every
4265
- * `/api/benchmarks` request, so edits (direct, via POST /api/benchmarks, via
4262
+ * A Project = any directory containing a `.agentv/` folder. Projects hold
4263
+ * eval runs, and (incrementally) traces, spans, and other telemetry,
4264
+ * matching the "project" terminology used by Arize Phoenix, Langfuse,
4265
+ * Braintrust, W&B Weave, and LangSmith.
4266
+ *
4267
+ * The registry lives at `~/.agentv/projects.yaml` and is the single source
4268
+ * of truth for which projects Studio shows. Studio re-reads the file on every
4269
+ * `/api/projects` request, so edits (direct, via POST /api/projects, via
4266
4270
  * the CLI's --add/--remove, or via a Kubernetes ConfigMap mount) are reflected
4267
4271
  * without restarting `agentv serve`.
4268
4272
  *
4269
4273
  * YAML format (all keys snake_case per AGENTS.md §"Wire Format Convention"):
4270
- * benchmarks:
4274
+ * projects:
4271
4275
  * - id: my-app
4272
4276
  * name: My App
4273
4277
  * path: /home/user/projects/my-app
4274
4278
  * source:
4275
- * url: ${{ BENCHMARK_REPO_URL }}
4276
- * ref: ${{ BENCHMARK_REPO_REF:-main }}
4279
+ * url: ${{ PROJECT_REPO_URL }}
4280
+ * ref: ${{ PROJECT_REPO_REF:-main }}
4277
4281
  * added_at: "2026-03-20T10:00:00Z"
4278
4282
  * last_opened_at: "2026-03-30T14:00:00Z"
4279
4283
  *
4280
- * The optional `source` field enables remote sync via syncBenchmarks():
4284
+ * The optional `source` field enables remote sync via syncProjects():
4281
4285
  * first run — git clone --depth 1 --filter=blob:none
4282
4286
  * subsequent runs — git pull --ff-only
4283
4287
  *
4284
4288
  * Concurrency: the registry assumes a single writer. All mutating calls
4285
- * (add/remove/touchBenchmark) do read-modify-write on benchmarks.yaml
4289
+ * (add/remove/touchProject) do read-modify-write on projects.yaml
4286
4290
  * without a lock. Studio's HTTP handlers are serialized by Node's
4287
4291
  * single-threaded event loop, which satisfies the 24/7 deployment case.
4288
4292
  * Run only one `agentv` process against a given home at a time.
4289
4293
  *
4294
+ * Legacy registry filename: the registry used to be called `benchmarks.yaml`
4295
+ * with a top-level `benchmarks:` key. On first load, a one-time migration
4296
+ * detects the old file, rewrites the top-level key to `projects:`, and
4297
+ * atomically renames the file. See migrateLegacyBenchmarksFile() below.
4298
+ *
4290
4299
  * To extend:
4291
- * - CRUD: loadBenchmarkRegistry() / saveBenchmarkRegistry() + the
4300
+ * - CRUD: loadProjectRegistry() / saveProjectRegistry() + the
4292
4301
  * add/remove/touch helpers.
4293
- * - discoverBenchmarks() is a one-shot filesystem utility for bulk
4302
+ * - discoverProjects() is a one-shot filesystem utility for bulk
4294
4303
  * registration; it does not run in the request path.
4295
4304
  */
4296
- interface BenchmarkSource {
4305
+ interface ProjectSource {
4297
4306
  url: string;
4298
4307
  ref: string;
4299
4308
  }
4300
- interface BenchmarkEntry {
4309
+ interface ProjectEntry {
4301
4310
  id: string;
4302
4311
  name: string;
4303
4312
  path: string;
4304
4313
  addedAt: string;
4305
4314
  lastOpenedAt: string;
4306
- source?: BenchmarkSource;
4315
+ source?: ProjectSource;
4307
4316
  }
4308
- interface BenchmarkRegistry {
4309
- benchmarks: BenchmarkEntry[];
4317
+ interface ProjectRegistry {
4318
+ projects: ProjectEntry[];
4310
4319
  }
4311
- declare function getBenchmarksRegistryPath(): string;
4312
- declare function loadBenchmarkRegistry(): BenchmarkRegistry;
4313
- declare function saveBenchmarkRegistry(registry: BenchmarkRegistry): void;
4320
+ declare function getProjectsRegistryPath(): string;
4321
+ declare function loadProjectRegistry(): ProjectRegistry;
4322
+ declare function saveProjectRegistry(registry: ProjectRegistry): void;
4314
4323
  /**
4315
- * Derive a URL-safe benchmark ID from a directory path.
4324
+ * Derive a URL-safe project ID from a directory path.
4316
4325
  * Uses the directory basename, lowercased, with non-alphanumeric chars replaced by hyphens.
4317
4326
  * Appends a numeric suffix if the ID already exists in the registry.
4318
4327
  */
4319
- declare function deriveBenchmarkId(dirPath: string, existingIds: string[]): string;
4328
+ declare function deriveProjectId(dirPath: string, existingIds: string[]): string;
4320
4329
  /**
4321
- * Register a benchmark by path. Returns the new entry, or the existing one if already registered.
4330
+ * Register a project by path. Returns the new entry, or the existing one if already registered.
4322
4331
  * Validates that the path exists and contains a `.agentv/` directory.
4323
4332
  */
4324
- declare function addBenchmark(benchmarkPath: string): BenchmarkEntry;
4333
+ declare function addProject(projectPath: string): ProjectEntry;
4325
4334
  /**
4326
- * Remove a benchmark by ID. Returns true if removed, false if not found.
4335
+ * Remove a project by ID. Returns true if removed, false if not found.
4327
4336
  */
4328
- declare function removeBenchmark(benchmarkId: string): boolean;
4337
+ declare function removeProject(projectId: string): boolean;
4329
4338
  /**
4330
- * Look up a benchmark by ID. Returns undefined if not found.
4339
+ * Look up a project by ID. Returns undefined if not found.
4331
4340
  */
4332
- declare function getBenchmark(benchmarkId: string): BenchmarkEntry | undefined;
4341
+ declare function getProject(projectId: string): ProjectEntry | undefined;
4333
4342
  /**
4334
- * Update lastOpenedAt for a benchmark.
4343
+ * Update lastOpenedAt for a project.
4335
4344
  */
4336
- declare function touchBenchmark(benchmarkId: string): void;
4345
+ declare function touchProject(projectId: string): void;
4337
4346
  /**
4338
4347
  * Scan a directory tree (up to maxDepth levels) for directories containing `.agentv/`.
4339
- * Returns absolute paths of discovered benchmark directories, sorted for
4348
+ * Returns absolute paths of discovered project directories, sorted for
4340
4349
  * deterministic iteration. This is a one-shot helper for bulk registration;
4341
4350
  * Studio does not scan at request time.
4342
4351
  */
4343
- declare function discoverBenchmarks(rootDir: string, maxDepth?: number): string[];
4352
+ declare function discoverProjects(rootDir: string, maxDepth?: number): string[];
4344
4353
 
4345
4354
  /**
4346
- * Benchmark sync — pulls remote git repos to the local path declared in the
4347
- * benchmark registry before Studio/eval startup.
4355
+ * Project sync — pulls remote git repos to the local path declared in the
4356
+ * project registry before Studio/eval startup.
4348
4357
  *
4349
4358
  * Sync is oneshot only, triggered by the Studio UI "Sync" button or the
4350
- * `agentv benchmark sync` CLI command. There is no daemon or continuous mode.
4359
+ * `agentv project sync` CLI command. There is no daemon or continuous mode.
4351
4360
  *
4352
4361
  * First run — git clone --depth 1 --filter=blob:none --branch <ref> <url> <path>
4353
4362
  * Subsequent — git pull --ff-only (when <path>/.git already exists)
4354
4363
  *
4355
4364
  * Usage:
4356
- * import { syncBenchmarks } from './benchmark-sync.js';
4357
- * await syncBenchmarks(registry.benchmarks);
4365
+ * import { syncProjects } from './project-sync.js';
4366
+ * await syncProjects(registry.projects);
4358
4367
  */
4359
4368
 
4360
4369
  /**
4361
- * Clone or pull a single benchmark entry from its declared source.
4370
+ * Clone or pull a single project entry from its declared source.
4362
4371
  * - No .git present: shallow clone into entry.path.
4363
4372
  * - .git present: git pull --ff-only to update in place.
4364
4373
  * Throws on git error or missing source.
4365
4374
  */
4366
- declare function syncBenchmark(entry: BenchmarkEntry): Promise<void>;
4375
+ declare function syncProject(entry: ProjectEntry): Promise<void>;
4367
4376
  /**
4368
- * Iterate benchmark entries and sync any that have a source declared.
4377
+ * Iterate project entries and sync any that have a source declared.
4369
4378
  * Entries without source are skipped silently.
4370
4379
  */
4371
- declare function syncBenchmarks(entries: BenchmarkEntry[]): Promise<void>;
4380
+ declare function syncProjects(entries: ProjectEntry[]): Promise<void>;
4372
4381
 
4373
4382
  /**
4374
4383
  * Trims an EvaluationResult for baseline storage.
@@ -4942,4 +4951,4 @@ type AgentKernel = {
4942
4951
  };
4943
4952
  declare function createAgentKernel(): AgentKernel;
4944
4953
 
4945
- export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type BenchmarkEntry, type BenchmarkRegistry, type BenchmarkSource, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildGraderResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeGrader, type CodeGraderConfig, type CodeGraderOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeGrader, type CompositeGraderConfig, type CompositeGraderOptions, type ConfidenceIntervalAggregation, type ContainsAllGraderConfig, type ContainsAnyGraderConfig, type ContainsGraderConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type ConversationAggregation, type ConversationMode, type ConversationTurn, type ConversationTurnInput, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostGrader, type CostGraderConfig, type CostGraderOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_GRADER_TEMPLATE, DEFAULT_THRESHOLD, type DependencyFailurePolicy, type DependencyResult, type DepsScanResult, DeterministicAssertionGrader, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithGraderConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsGraderConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type 
EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTargetRef, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsGrader, type ExecutionMetricsGraderConfig, type ExecutionMetricsGraderOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyGrader, type FieldAccuracyGraderConfig, type FieldAccuracyGraderOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type Grader, type GraderConfig, type GraderDispatchContext, type GraderFactory, type GraderFactoryFn, type GraderKind, GraderRegistry, type GraderResult, type IcontainsAllGraderConfig, type IcontainsAnyGraderConfig, type IcontainsGraderConfig, type InlineAssertEvaluatorConfig, type IsJsonGraderConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyGrader, type LatencyGraderConfig, type LatencyGraderOptions, LlmGrader, type LlmGraderConfig, type LlmGraderOptions, type LlmGraderPromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type PreparedResultsRepoBranch, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type 
RegexGraderConfig, type RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, RunBudgetTracker, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerGrader, type SkillTriggerGraderConfig, type StartsWithGraderConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TargetHooksConfig, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageGrader, type TokenUsageGraderConfig, type TokenUsageGraderOptions, type ToolCall, type ToolTestMessage, type ToolTrajectoryExpectedItem, ToolTrajectoryGrader, type ToolTrajectoryGraderConfig, type ToolTrajectoryGraderOptions, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptReplayEntry, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type TsEvalResult, type TurnFailurePolicy, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceEnvConfig, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addBenchmark, assembleLlmGraderPrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, 
consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveBenchmarkId, deriveCategory, detectFormat, directPushResults, directorySizeBytes, discoverAssertions, discoverBenchmarks, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetRefsFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, formatToolCalls, freeformEvaluationSchema, generateRubrics, getAgentvConfigDir, getAgentvHome, getBenchmark, getBenchmarksRegistryPath, getOutputFilenames, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, groupTranscriptJsonLines, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isGraderKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, killAllTrackedChildren, listTargetNames, loadBenchmarkRegistry, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, loadTsEvalFile, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseEnvOutput, parseJsonFromText, parseJsonSafe, parseYamlValue, prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeBenchmark, resolveAndCreateProvider, 
resolveDelegatedTargetDefinition, resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runBeforeSessionHook, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveBenchmarkRegistry, scanRepoDeps, scoreRangeEvaluationSchema, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncBenchmark, syncBenchmarks, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLines, tokensPerTool, touchBenchmark, trackChild, trackedChildCount, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
4954
+ export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildGraderResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeGrader, type CodeGraderConfig, type CodeGraderOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeGrader, type CompositeGraderConfig, type CompositeGraderOptions, type ConfidenceIntervalAggregation, type ContainsAllGraderConfig, type ContainsAnyGraderConfig, type ContainsGraderConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type ConversationAggregation, type ConversationMode, type ConversationTurn, type ConversationTurnInput, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostGrader, type CostGraderConfig, type CostGraderOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_GRADER_TEMPLATE, DEFAULT_THRESHOLD, type DependencyFailurePolicy, type DependencyResult, type DepsScanResult, DeterministicAssertionGrader, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithGraderConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsGraderConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, 
type EvalTargetRef, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsGrader, type ExecutionMetricsGraderConfig, type ExecutionMetricsGraderOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyGrader, type FieldAccuracyGraderConfig, type FieldAccuracyGraderOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type Grader, type GraderConfig, type GraderDispatchContext, type GraderFactory, type GraderFactoryFn, type GraderKind, GraderRegistry, type GraderResult, type IcontainsAllGraderConfig, type IcontainsAnyGraderConfig, type IcontainsGraderConfig, type InlineAssertEvaluatorConfig, type IsJsonGraderConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyGrader, type LatencyGraderConfig, type LatencyGraderOptions, LlmGrader, type LlmGraderConfig, type LlmGraderOptions, type LlmGraderPromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type PreparedResultsRepoBranch, type ProgressEvent, type ProjectEntry, type ProjectRegistry, type ProjectSource, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexGraderConfig, 
type RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, RunBudgetTracker, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerGrader, type SkillTriggerGraderConfig, type StartsWithGraderConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TargetHooksConfig, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageGrader, type TokenUsageGraderConfig, type TokenUsageGraderOptions, type ToolCall, type ToolTestMessage, type ToolTrajectoryExpectedItem, ToolTrajectoryGrader, type ToolTrajectoryGraderConfig, type ToolTrajectoryGraderOptions, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptReplayEntry, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type TsEvalResult, type TurnFailurePolicy, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceEnvConfig, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addProject, assembleLlmGraderPrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, 
consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveCategory, deriveProjectId, detectFormat, directPushResults, directorySizeBytes, discoverAssertions, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverProjects, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetRefsFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, formatToolCalls, freeformEvaluationSchema, generateRubrics, getAgentvConfigDir, getAgentvHome, getOutputFilenames, getProject, getProjectsRegistryPath, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, groupTranscriptJsonLines, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isGraderKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, killAllTrackedChildren, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadProjectRegistry, loadTestById, loadTestSuite, loadTests, loadTsConfig, loadTsEvalFile, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseEnvOutput, parseJsonFromText, parseJsonSafe, parseYamlValue, prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeProject, resolveAndCreateProvider, resolveDelegatedTargetDefinition, 
resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runBeforeSessionHook, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveProjectRegistry, scanRepoDeps, scoreRangeEvaluationSchema, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncProject, syncProjects, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLines, tokensPerTool, touchProject, trackChild, trackedChildCount, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };