@agentv/core 4.11.2 → 4.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +52 -52
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +25 -25
- package/dist/index.d.ts +25 -25
- package/dist/index.js +43 -43
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.d.cts
CHANGED
|
@@ -3855,63 +3855,63 @@ declare function getTraceStateRoot(): string;
|
|
|
3855
3855
|
declare function getWorkspacePoolRoot(): string;
|
|
3856
3856
|
|
|
3857
3857
|
/**
|
|
3858
|
-
*
|
|
3858
|
+
* Benchmark registry for AgentV Studio multi-benchmark support.
|
|
3859
3859
|
*
|
|
3860
|
-
* A
|
|
3861
|
-
* The registry lives at `~/.agentv/projects.yaml` and tracks registered
|
|
3860
|
+
* A Benchmark = any directory containing a `.agentv/` folder.
|
|
3861
|
+
* The registry lives at `~/.agentv/projects.yaml` and tracks registered benchmarks.
|
|
3862
3862
|
*
|
|
3863
3863
|
* YAML format:
|
|
3864
|
-
*
|
|
3864
|
+
* benchmarks:
|
|
3865
3865
|
* - id: my-app
|
|
3866
3866
|
* name: My App
|
|
3867
3867
|
* path: /home/user/projects/my-app
|
|
3868
3868
|
* addedAt: "2026-03-20T10:00:00Z"
|
|
3869
3869
|
* lastOpenedAt: "2026-03-30T14:00:00Z"
|
|
3870
3870
|
*
|
|
3871
|
-
* To extend: use
|
|
3872
|
-
*
|
|
3871
|
+
* To extend: use loadBenchmarkRegistry() / saveBenchmarkRegistry() for CRUD,
|
|
3872
|
+
* discoverBenchmarks() to scan a directory tree for `.agentv/` directories.
|
|
3873
3873
|
*/
|
|
3874
|
-
interface
|
|
3874
|
+
interface BenchmarkEntry {
|
|
3875
3875
|
id: string;
|
|
3876
3876
|
name: string;
|
|
3877
3877
|
path: string;
|
|
3878
3878
|
addedAt: string;
|
|
3879
3879
|
lastOpenedAt: string;
|
|
3880
3880
|
}
|
|
3881
|
-
interface
|
|
3882
|
-
|
|
3881
|
+
interface BenchmarkRegistry {
|
|
3882
|
+
benchmarks: BenchmarkEntry[];
|
|
3883
3883
|
}
|
|
3884
|
-
declare function
|
|
3885
|
-
declare function
|
|
3886
|
-
declare function
|
|
3884
|
+
declare function getBenchmarksRegistryPath(): string;
|
|
3885
|
+
declare function loadBenchmarkRegistry(): BenchmarkRegistry;
|
|
3886
|
+
declare function saveBenchmarkRegistry(registry: BenchmarkRegistry): void;
|
|
3887
3887
|
/**
|
|
3888
|
-
* Derive a URL-safe
|
|
3888
|
+
* Derive a URL-safe benchmark ID from a directory path.
|
|
3889
3889
|
* Uses the directory basename, lowercased, with non-alphanumeric chars replaced by hyphens.
|
|
3890
3890
|
* Appends a numeric suffix if the ID already exists in the registry.
|
|
3891
3891
|
*/
|
|
3892
|
-
declare function
|
|
3892
|
+
declare function deriveBenchmarkId(dirPath: string, existingIds: string[]): string;
|
|
3893
3893
|
/**
|
|
3894
|
-
* Register a
|
|
3894
|
+
* Register a benchmark by path. Returns the new entry, or the existing one if already registered.
|
|
3895
3895
|
* Validates that the path exists and contains a `.agentv/` directory.
|
|
3896
3896
|
*/
|
|
3897
|
-
declare function
|
|
3897
|
+
declare function addBenchmark(benchmarkPath: string): BenchmarkEntry;
|
|
3898
3898
|
/**
|
|
3899
|
-
* Remove a
|
|
3899
|
+
* Remove a benchmark by ID. Returns true if removed, false if not found.
|
|
3900
3900
|
*/
|
|
3901
|
-
declare function
|
|
3901
|
+
declare function removeBenchmark(benchmarkId: string): boolean;
|
|
3902
3902
|
/**
|
|
3903
|
-
* Look up a
|
|
3903
|
+
* Look up a benchmark by ID. Returns undefined if not found.
|
|
3904
3904
|
*/
|
|
3905
|
-
declare function
|
|
3905
|
+
declare function getBenchmark(benchmarkId: string): BenchmarkEntry | undefined;
|
|
3906
3906
|
/**
|
|
3907
|
-
* Update lastOpenedAt for a
|
|
3907
|
+
* Update lastOpenedAt for a benchmark.
|
|
3908
3908
|
*/
|
|
3909
|
-
declare function
|
|
3909
|
+
declare function touchBenchmark(benchmarkId: string): void;
|
|
3910
3910
|
/**
|
|
3911
3911
|
* Scan a directory tree (up to maxDepth levels) for directories containing `.agentv/`.
|
|
3912
|
-
* Returns absolute paths of discovered
|
|
3912
|
+
* Returns absolute paths of discovered benchmark directories.
|
|
3913
3913
|
*/
|
|
3914
|
-
declare function
|
|
3914
|
+
declare function discoverBenchmarks(rootDir: string, maxDepth?: number): string[];
|
|
3915
3915
|
|
|
3916
3916
|
/**
|
|
3917
3917
|
* Trims an EvaluationResult for baseline storage.
|
|
@@ -4372,4 +4372,4 @@ type AgentKernel = {
|
|
|
4372
4372
|
};
|
|
4373
4373
|
declare function createAgentKernel(): AgentKernel;
|
|
4374
4374
|
|
|
4375
|
-
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildEvaluatorResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_THRESHOLD, type DepsScanResult, DeterministicAssertionEvaluator, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type PreparedResultsRepoBranch, type ProgressEvent, type
|
|
4375
|
+
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type BenchmarkEntry, type BenchmarkRegistry, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildEvaluatorResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_THRESHOLD, type DepsScanResult, DeterministicAssertionEvaluator, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type PreparedResultsRepoBranch, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addBenchmark, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveBenchmarkId, deriveCategory, detectFormat, directorySizeBytes, discoverAssertions, discoverBenchmarks, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getBenchmark, getBenchmarksRegistryPath, getOutputFilenames, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadBenchmarkRegistry, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseJsonFromText, parseJsonSafe, prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeBenchmark, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveBenchmarkRegistry, scanRepoDeps, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLine, tokensPerTool, touchBenchmark, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
|
package/dist/index.d.ts
CHANGED
|
@@ -3855,63 +3855,63 @@ declare function getTraceStateRoot(): string;
|
|
|
3855
3855
|
declare function getWorkspacePoolRoot(): string;
|
|
3856
3856
|
|
|
3857
3857
|
/**
|
|
3858
|
-
*
|
|
3858
|
+
* Benchmark registry for AgentV Studio multi-benchmark support.
|
|
3859
3859
|
*
|
|
3860
|
-
* A
|
|
3861
|
-
* The registry lives at `~/.agentv/projects.yaml` and tracks registered
|
|
3860
|
+
* A Benchmark = any directory containing a `.agentv/` folder.
|
|
3861
|
+
* The registry lives at `~/.agentv/projects.yaml` and tracks registered benchmarks.
|
|
3862
3862
|
*
|
|
3863
3863
|
* YAML format:
|
|
3864
|
-
*
|
|
3864
|
+
* benchmarks:
|
|
3865
3865
|
* - id: my-app
|
|
3866
3866
|
* name: My App
|
|
3867
3867
|
* path: /home/user/projects/my-app
|
|
3868
3868
|
* addedAt: "2026-03-20T10:00:00Z"
|
|
3869
3869
|
* lastOpenedAt: "2026-03-30T14:00:00Z"
|
|
3870
3870
|
*
|
|
3871
|
-
* To extend: use
|
|
3872
|
-
*
|
|
3871
|
+
* To extend: use loadBenchmarkRegistry() / saveBenchmarkRegistry() for CRUD,
|
|
3872
|
+
* discoverBenchmarks() to scan a directory tree for `.agentv/` directories.
|
|
3873
3873
|
*/
|
|
3874
|
-
interface
|
|
3874
|
+
interface BenchmarkEntry {
|
|
3875
3875
|
id: string;
|
|
3876
3876
|
name: string;
|
|
3877
3877
|
path: string;
|
|
3878
3878
|
addedAt: string;
|
|
3879
3879
|
lastOpenedAt: string;
|
|
3880
3880
|
}
|
|
3881
|
-
interface
|
|
3882
|
-
|
|
3881
|
+
interface BenchmarkRegistry {
|
|
3882
|
+
benchmarks: BenchmarkEntry[];
|
|
3883
3883
|
}
|
|
3884
|
-
declare function
|
|
3885
|
-
declare function
|
|
3886
|
-
declare function
|
|
3884
|
+
declare function getBenchmarksRegistryPath(): string;
|
|
3885
|
+
declare function loadBenchmarkRegistry(): BenchmarkRegistry;
|
|
3886
|
+
declare function saveBenchmarkRegistry(registry: BenchmarkRegistry): void;
|
|
3887
3887
|
/**
|
|
3888
|
-
* Derive a URL-safe
|
|
3888
|
+
* Derive a URL-safe benchmark ID from a directory path.
|
|
3889
3889
|
* Uses the directory basename, lowercased, with non-alphanumeric chars replaced by hyphens.
|
|
3890
3890
|
* Appends a numeric suffix if the ID already exists in the registry.
|
|
3891
3891
|
*/
|
|
3892
|
-
declare function
|
|
3892
|
+
declare function deriveBenchmarkId(dirPath: string, existingIds: string[]): string;
|
|
3893
3893
|
/**
|
|
3894
|
-
* Register a
|
|
3894
|
+
* Register a benchmark by path. Returns the new entry, or the existing one if already registered.
|
|
3895
3895
|
* Validates that the path exists and contains a `.agentv/` directory.
|
|
3896
3896
|
*/
|
|
3897
|
-
declare function
|
|
3897
|
+
declare function addBenchmark(benchmarkPath: string): BenchmarkEntry;
|
|
3898
3898
|
/**
|
|
3899
|
-
* Remove a
|
|
3899
|
+
* Remove a benchmark by ID. Returns true if removed, false if not found.
|
|
3900
3900
|
*/
|
|
3901
|
-
declare function
|
|
3901
|
+
declare function removeBenchmark(benchmarkId: string): boolean;
|
|
3902
3902
|
/**
|
|
3903
|
-
* Look up a
|
|
3903
|
+
* Look up a benchmark by ID. Returns undefined if not found.
|
|
3904
3904
|
*/
|
|
3905
|
-
declare function
|
|
3905
|
+
declare function getBenchmark(benchmarkId: string): BenchmarkEntry | undefined;
|
|
3906
3906
|
/**
|
|
3907
|
-
* Update lastOpenedAt for a
|
|
3907
|
+
* Update lastOpenedAt for a benchmark.
|
|
3908
3908
|
*/
|
|
3909
|
-
declare function
|
|
3909
|
+
declare function touchBenchmark(benchmarkId: string): void;
|
|
3910
3910
|
/**
|
|
3911
3911
|
* Scan a directory tree (up to maxDepth levels) for directories containing `.agentv/`.
|
|
3912
|
-
* Returns absolute paths of discovered
|
|
3912
|
+
* Returns absolute paths of discovered benchmark directories.
|
|
3913
3913
|
*/
|
|
3914
|
-
declare function
|
|
3914
|
+
declare function discoverBenchmarks(rootDir: string, maxDepth?: number): string[];
|
|
3915
3915
|
|
|
3916
3916
|
/**
|
|
3917
3917
|
* Trims an EvaluationResult for baseline storage.
|
|
@@ -4372,4 +4372,4 @@ type AgentKernel = {
|
|
|
4372
4372
|
};
|
|
4373
4373
|
declare function createAgentKernel(): AgentKernel;
|
|
4374
4374
|
|
|
4375
|
-
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildEvaluatorResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_THRESHOLD, type DepsScanResult, DeterministicAssertionEvaluator, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type PreparedResultsRepoBranch, type ProgressEvent, type
|
|
4375
|
+
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type BenchmarkEntry, type BenchmarkRegistry, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildEvaluatorResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_THRESHOLD, type DepsScanResult, DeterministicAssertionEvaluator, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type PreparedResultsRepoBranch, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addBenchmark, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveBenchmarkId, deriveCategory, detectFormat, directorySizeBytes, discoverAssertions, discoverBenchmarks, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getBenchmark, getBenchmarksRegistryPath, getOutputFilenames, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadBenchmarkRegistry, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseJsonFromText, parseJsonSafe, prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeBenchmark, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveBenchmarkRegistry, scanRepoDeps, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLine, tokensPerTool, touchBenchmark, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
|
package/dist/index.js
CHANGED
|
@@ -19179,40 +19179,40 @@ async function createDraftResultsPr(params) {
|
|
|
19179
19179
|
return stdout.trim();
|
|
19180
19180
|
}
|
|
19181
19181
|
|
|
19182
|
-
// src/
|
|
19182
|
+
// src/benchmarks.ts
|
|
19183
19183
|
import { existsSync as existsSync8, mkdirSync as mkdirSync3, readFileSync as readFileSync4, readdirSync as readdirSync3, statSync as statSync2, writeFileSync as writeFileSync2 } from "node:fs";
|
|
19184
19184
|
import path50 from "node:path";
|
|
19185
19185
|
import { parse as parseYaml3, stringify as stringifyYaml } from "yaml";
|
|
19186
|
-
function
|
|
19186
|
+
function getBenchmarksRegistryPath() {
|
|
19187
19187
|
return path50.join(getAgentvHome(), "projects.yaml");
|
|
19188
19188
|
}
|
|
19189
|
-
function
|
|
19190
|
-
const registryPath =
|
|
19189
|
+
function loadBenchmarkRegistry() {
|
|
19190
|
+
const registryPath = getBenchmarksRegistryPath();
|
|
19191
19191
|
if (!existsSync8(registryPath)) {
|
|
19192
|
-
return {
|
|
19192
|
+
return { benchmarks: [] };
|
|
19193
19193
|
}
|
|
19194
19194
|
try {
|
|
19195
19195
|
const raw = readFileSync4(registryPath, "utf-8");
|
|
19196
19196
|
const parsed = parseYaml3(raw);
|
|
19197
|
-
if (!parsed || !Array.isArray(parsed.
|
|
19198
|
-
return {
|
|
19197
|
+
if (!parsed || !Array.isArray(parsed.benchmarks)) {
|
|
19198
|
+
return { benchmarks: [] };
|
|
19199
19199
|
}
|
|
19200
|
-
return {
|
|
19200
|
+
return { benchmarks: parsed.benchmarks };
|
|
19201
19201
|
} catch {
|
|
19202
|
-
return {
|
|
19202
|
+
return { benchmarks: [] };
|
|
19203
19203
|
}
|
|
19204
19204
|
}
|
|
19205
|
-
function
|
|
19206
|
-
const registryPath =
|
|
19205
|
+
function saveBenchmarkRegistry(registry) {
|
|
19206
|
+
const registryPath = getBenchmarksRegistryPath();
|
|
19207
19207
|
const dir = path50.dirname(registryPath);
|
|
19208
19208
|
if (!existsSync8(dir)) {
|
|
19209
19209
|
mkdirSync3(dir, { recursive: true });
|
|
19210
19210
|
}
|
|
19211
|
-
writeFileSync2(registryPath, stringifyYaml(registry), "utf-8");
|
|
19211
|
+
writeFileSync2(registryPath, stringifyYaml({ benchmarks: registry.benchmarks }), "utf-8");
|
|
19212
19212
|
}
|
|
19213
|
-
function
|
|
19213
|
+
function deriveBenchmarkId(dirPath, existingIds) {
|
|
19214
19214
|
const base = path50.basename(dirPath).toLowerCase().replace(/[^a-z0-9-]/g, "-").replace(/-+/g, "-").replace(/^-|-$/g, "");
|
|
19215
|
-
let candidate = base || "
|
|
19215
|
+
let candidate = base || "benchmark";
|
|
19216
19216
|
let suffix = 2;
|
|
19217
19217
|
while (existingIds.includes(candidate)) {
|
|
19218
19218
|
candidate = `${base}-${suffix}`;
|
|
@@ -19220,54 +19220,54 @@ function deriveProjectId(dirPath, existingIds) {
|
|
|
19220
19220
|
}
|
|
19221
19221
|
return candidate;
|
|
19222
19222
|
}
|
|
19223
|
-
function
|
|
19224
|
-
const absPath = path50.resolve(
|
|
19223
|
+
function addBenchmark(benchmarkPath) {
|
|
19224
|
+
const absPath = path50.resolve(benchmarkPath);
|
|
19225
19225
|
if (!existsSync8(absPath)) {
|
|
19226
19226
|
throw new Error(`Directory not found: ${absPath}`);
|
|
19227
19227
|
}
|
|
19228
19228
|
if (!existsSync8(path50.join(absPath, ".agentv"))) {
|
|
19229
19229
|
throw new Error(`No .agentv/ directory found in ${absPath}. Run an evaluation first.`);
|
|
19230
19230
|
}
|
|
19231
|
-
const registry =
|
|
19232
|
-
const existing = registry.
|
|
19231
|
+
const registry = loadBenchmarkRegistry();
|
|
19232
|
+
const existing = registry.benchmarks.find((p) => p.path === absPath);
|
|
19233
19233
|
if (existing) {
|
|
19234
19234
|
return existing;
|
|
19235
19235
|
}
|
|
19236
19236
|
const now = (/* @__PURE__ */ new Date()).toISOString();
|
|
19237
19237
|
const entry = {
|
|
19238
|
-
id:
|
|
19238
|
+
id: deriveBenchmarkId(
|
|
19239
19239
|
absPath,
|
|
19240
|
-
registry.
|
|
19240
|
+
registry.benchmarks.map((p) => p.id)
|
|
19241
19241
|
),
|
|
19242
19242
|
name: path50.basename(absPath),
|
|
19243
19243
|
path: absPath,
|
|
19244
19244
|
addedAt: now,
|
|
19245
19245
|
lastOpenedAt: now
|
|
19246
19246
|
};
|
|
19247
|
-
registry.
|
|
19248
|
-
|
|
19247
|
+
registry.benchmarks.push(entry);
|
|
19248
|
+
saveBenchmarkRegistry(registry);
|
|
19249
19249
|
return entry;
|
|
19250
19250
|
}
|
|
19251
|
-
function
|
|
19252
|
-
const registry =
|
|
19253
|
-
const idx = registry.
|
|
19251
|
+
function removeBenchmark(benchmarkId) {
|
|
19252
|
+
const registry = loadBenchmarkRegistry();
|
|
19253
|
+
const idx = registry.benchmarks.findIndex((p) => p.id === benchmarkId);
|
|
19254
19254
|
if (idx < 0) return false;
|
|
19255
|
-
registry.
|
|
19256
|
-
|
|
19255
|
+
registry.benchmarks.splice(idx, 1);
|
|
19256
|
+
saveBenchmarkRegistry(registry);
|
|
19257
19257
|
return true;
|
|
19258
19258
|
}
|
|
19259
|
-
function
|
|
19260
|
-
return
|
|
19259
|
+
function getBenchmark(benchmarkId) {
|
|
19260
|
+
return loadBenchmarkRegistry().benchmarks.find((p) => p.id === benchmarkId);
|
|
19261
19261
|
}
|
|
19262
|
-
function
|
|
19263
|
-
const registry =
|
|
19264
|
-
const entry = registry.
|
|
19262
|
+
function touchBenchmark(benchmarkId) {
|
|
19263
|
+
const registry = loadBenchmarkRegistry();
|
|
19264
|
+
const entry = registry.benchmarks.find((p) => p.id === benchmarkId);
|
|
19265
19265
|
if (entry) {
|
|
19266
19266
|
entry.lastOpenedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
19267
|
-
|
|
19267
|
+
saveBenchmarkRegistry(registry);
|
|
19268
19268
|
}
|
|
19269
19269
|
}
|
|
19270
|
-
function
|
|
19270
|
+
function discoverBenchmarks(rootDir, maxDepth = 2) {
|
|
19271
19271
|
const absRoot = path50.resolve(rootDir);
|
|
19272
19272
|
if (!existsSync8(absRoot) || !statSync2(absRoot).isDirectory()) {
|
|
19273
19273
|
return [];
|
|
@@ -20434,7 +20434,7 @@ export {
|
|
|
20434
20434
|
TranscriptProvider,
|
|
20435
20435
|
WorkspaceCreationError,
|
|
20436
20436
|
WorkspacePoolManager,
|
|
20437
|
-
|
|
20437
|
+
addBenchmark,
|
|
20438
20438
|
assembleLlmGraderPrompt,
|
|
20439
20439
|
assembleLlmGraderPrompt as assembleLlmJudgePrompt,
|
|
20440
20440
|
avgToolDurationMs,
|
|
@@ -20466,17 +20466,17 @@ export {
|
|
|
20466
20466
|
createTempWorkspace,
|
|
20467
20467
|
deepEqual,
|
|
20468
20468
|
defineConfig,
|
|
20469
|
+
deriveBenchmarkId,
|
|
20469
20470
|
deriveCategory,
|
|
20470
|
-
deriveProjectId,
|
|
20471
20471
|
detectFormat,
|
|
20472
20472
|
directorySizeBytes,
|
|
20473
20473
|
discoverAssertions,
|
|
20474
|
+
discoverBenchmarks,
|
|
20474
20475
|
discoverClaudeSessions,
|
|
20475
20476
|
discoverCodexSessions,
|
|
20476
20477
|
discoverCopilotSessions,
|
|
20477
20478
|
discoverGraders,
|
|
20478
20479
|
discoverGraders as discoverJudges,
|
|
20479
|
-
discoverProjects,
|
|
20480
20480
|
discoverProviders,
|
|
20481
20481
|
ensureResultsRepoClone,
|
|
20482
20482
|
ensureVSCodeSubagents,
|
|
@@ -20500,9 +20500,9 @@ export {
|
|
|
20500
20500
|
freeformEvaluationSchema,
|
|
20501
20501
|
generateRubrics,
|
|
20502
20502
|
getAgentvHome,
|
|
20503
|
+
getBenchmark,
|
|
20504
|
+
getBenchmarksRegistryPath,
|
|
20503
20505
|
getOutputFilenames,
|
|
20504
|
-
getProject,
|
|
20505
|
-
getProjectsRegistryPath,
|
|
20506
20506
|
getResultsRepoCachePaths,
|
|
20507
20507
|
getResultsRepoStatus,
|
|
20508
20508
|
getSubagentsRoot,
|
|
@@ -20522,11 +20522,11 @@ export {
|
|
|
20522
20522
|
isTestMessage,
|
|
20523
20523
|
isTestMessageRole,
|
|
20524
20524
|
listTargetNames,
|
|
20525
|
+
loadBenchmarkRegistry,
|
|
20525
20526
|
loadConfig,
|
|
20526
20527
|
loadEvalCaseById,
|
|
20527
20528
|
loadEvalCases,
|
|
20528
20529
|
loadEvalSuite,
|
|
20529
|
-
loadProjectRegistry,
|
|
20530
20530
|
loadTestById,
|
|
20531
20531
|
loadTestSuite,
|
|
20532
20532
|
loadTests,
|
|
@@ -20549,7 +20549,7 @@ export {
|
|
|
20549
20549
|
readTextFile,
|
|
20550
20550
|
readTranscriptFile,
|
|
20551
20551
|
readTranscriptJsonl,
|
|
20552
|
-
|
|
20552
|
+
removeBenchmark,
|
|
20553
20553
|
resolveAndCreateProvider,
|
|
20554
20554
|
resolveDelegatedTargetDefinition,
|
|
20555
20555
|
resolveFileReference,
|
|
@@ -20571,7 +20571,7 @@ export {
|
|
|
20571
20571
|
runIsJsonAssertion,
|
|
20572
20572
|
runRegexAssertion,
|
|
20573
20573
|
runStartsWithAssertion,
|
|
20574
|
-
|
|
20574
|
+
saveBenchmarkRegistry,
|
|
20575
20575
|
scanRepoDeps,
|
|
20576
20576
|
scoreToVerdict,
|
|
20577
20577
|
shouldEnableCache,
|
|
@@ -20588,7 +20588,7 @@ export {
|
|
|
20588
20588
|
toSnakeCaseDeep,
|
|
20589
20589
|
toTranscriptJsonLine,
|
|
20590
20590
|
tokensPerTool,
|
|
20591
|
-
|
|
20591
|
+
touchBenchmark,
|
|
20592
20592
|
transpileEvalYaml,
|
|
20593
20593
|
transpileEvalYamlFile,
|
|
20594
20594
|
trimBaselineResult
|