@agentv/core 4.27.0-next.1 → 4.28.0-next.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agentv-provider-AYXH7WLW.js +7 -0
- package/dist/{chunk-JNBHD34F.js → chunk-SCC35F3L.js} +36 -21
- package/dist/chunk-SCC35F3L.js.map +1 -0
- package/dist/{chunk-M5X2KMEA.js → chunk-YDFZ7XN3.js} +2 -2
- package/dist/chunk-YDFZ7XN3.js.map +1 -0
- package/dist/{chunk-7LQI7772.js → chunk-YFXMMBUG.js} +4 -2
- package/dist/chunk-YFXMMBUG.js.map +1 -0
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +85 -22
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +43 -1
- package/dist/index.d.ts +43 -1
- package/dist/index.js +50 -8
- package/dist/index.js.map +1 -1
- package/dist/ts-eval-loader-EMSGL2BQ.js +12 -0
- package/package.json +4 -4
- package/dist/agentv-provider-7AMUD2GX.js +0 -7
- package/dist/chunk-7LQI7772.js.map +0 -1
- package/dist/chunk-JNBHD34F.js.map +0 -1
- package/dist/chunk-M5X2KMEA.js.map +0 -1
- package/dist/ts-eval-loader-BZ54W52K.js +0 -12
- /package/dist/{agentv-provider-7AMUD2GX.js.map → agentv-provider-AYXH7WLW.js.map} +0 -0
- /package/dist/{ts-eval-loader-BZ54W52K.js.map → ts-eval-loader-EMSGL2BQ.js.map} +0 -0
package/dist/index.d.cts
CHANGED
|
@@ -1796,6 +1796,8 @@ interface ClaudeResolvedConfig {
|
|
|
1796
1796
|
readonly logFormat?: 'summary' | 'json';
|
|
1797
1797
|
/** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
|
|
1798
1798
|
readonly streamLog?: false | 'raw' | 'summary';
|
|
1799
|
+
/** When true (default), passes --dangerously-skip-permissions to the Claude CLI. Matches ClaudeSdkProvider behavior. */
|
|
1800
|
+
readonly bypassPermissions?: boolean;
|
|
1799
1801
|
}
|
|
1800
1802
|
interface MockResolvedConfig {
|
|
1801
1803
|
readonly response?: string;
|
|
@@ -4269,9 +4271,16 @@ declare function getWorkspacePoolRoot(): string;
|
|
|
4269
4271
|
* - id: my-app
|
|
4270
4272
|
* name: My App
|
|
4271
4273
|
* path: /home/user/projects/my-app
|
|
4274
|
+
* source:
|
|
4275
|
+
* url: ${{ BENCHMARK_REPO_URL }}
|
|
4276
|
+
* ref: ${{ BENCHMARK_REPO_REF:-main }}
|
|
4272
4277
|
* added_at: "2026-03-20T10:00:00Z"
|
|
4273
4278
|
* last_opened_at: "2026-03-30T14:00:00Z"
|
|
4274
4279
|
*
|
|
4280
|
+
* The optional `source` field enables remote sync via syncBenchmarks():
|
|
4281
|
+
* first run — git clone --depth 1 --filter=blob:none
|
|
4282
|
+
* subsequent runs — git pull --ff-only
|
|
4283
|
+
*
|
|
4275
4284
|
* Concurrency: the registry assumes a single writer. All mutating calls
|
|
4276
4285
|
* (add/remove/touchBenchmark) do read-modify-write on benchmarks.yaml
|
|
4277
4286
|
* without a lock. Studio's HTTP handlers are serialized by Node's
|
|
@@ -4284,12 +4293,17 @@ declare function getWorkspacePoolRoot(): string;
|
|
|
4284
4293
|
* - discoverBenchmarks() is a one-shot filesystem utility for bulk
|
|
4285
4294
|
* registration; it does not run in the request path.
|
|
4286
4295
|
*/
|
|
4296
|
+
interface BenchmarkSource {
|
|
4297
|
+
url: string;
|
|
4298
|
+
ref: string;
|
|
4299
|
+
}
|
|
4287
4300
|
interface BenchmarkEntry {
|
|
4288
4301
|
id: string;
|
|
4289
4302
|
name: string;
|
|
4290
4303
|
path: string;
|
|
4291
4304
|
addedAt: string;
|
|
4292
4305
|
lastOpenedAt: string;
|
|
4306
|
+
source?: BenchmarkSource;
|
|
4293
4307
|
}
|
|
4294
4308
|
interface BenchmarkRegistry {
|
|
4295
4309
|
benchmarks: BenchmarkEntry[];
|
|
@@ -4328,6 +4342,34 @@ declare function touchBenchmark(benchmarkId: string): void;
|
|
|
4328
4342
|
*/
|
|
4329
4343
|
declare function discoverBenchmarks(rootDir: string, maxDepth?: number): string[];
|
|
4330
4344
|
|
|
4345
|
+
/**
|
|
4346
|
+
* Benchmark sync — pulls remote git repos to the local path declared in the
|
|
4347
|
+
* benchmark registry before Studio/eval startup.
|
|
4348
|
+
*
|
|
4349
|
+
* Sync is one-shot only, triggered by the Studio UI "Sync" button or the
|
|
4350
|
+
* `agentv benchmark sync` CLI command. There is no daemon or continuous mode.
|
|
4351
|
+
*
|
|
4352
|
+
* First run — git clone --depth 1 --filter=blob:none --branch <ref> <url> <path>
|
|
4353
|
+
* Subsequent — git pull --ff-only (when <path>/.git already exists)
|
|
4354
|
+
*
|
|
4355
|
+
* Usage:
|
|
4356
|
+
* import { syncBenchmarks } from './benchmark-sync.js';
|
|
4357
|
+
* await syncBenchmarks(registry.benchmarks);
|
|
4358
|
+
*/
|
|
4359
|
+
|
|
4360
|
+
/**
|
|
4361
|
+
* Clone or pull a single benchmark entry from its declared source.
|
|
4362
|
+
* - No .git present: shallow clone into entry.path.
|
|
4363
|
+
* - .git present: git pull --ff-only to update in place.
|
|
4364
|
+
* Throws on git error or missing source.
|
|
4365
|
+
*/
|
|
4366
|
+
declare function syncBenchmark(entry: BenchmarkEntry): Promise<void>;
|
|
4367
|
+
/**
|
|
4368
|
+
* Iterate benchmark entries and sync any that have a source declared.
|
|
4369
|
+
* Entries without source are skipped silently.
|
|
4370
|
+
*/
|
|
4371
|
+
declare function syncBenchmarks(entries: BenchmarkEntry[]): Promise<void>;
|
|
4372
|
+
|
|
4331
4373
|
/**
|
|
4332
4374
|
* Trims an EvaluationResult for baseline storage.
|
|
4333
4375
|
* Strips large debug/audit fields (denylist approach) while preserving
|
|
@@ -4900,4 +4942,4 @@ type AgentKernel = {
|
|
|
4900
4942
|
};
|
|
4901
4943
|
declare function createAgentKernel(): AgentKernel;
|
|
4902
4944
|
|
|
4903
|
-
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type BenchmarkEntry, type BenchmarkRegistry, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildGraderResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeGrader, type CodeGraderConfig, type CodeGraderOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeGrader, type CompositeGraderConfig, type CompositeGraderOptions, type ConfidenceIntervalAggregation, type ContainsAllGraderConfig, type ContainsAnyGraderConfig, type ContainsGraderConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type ConversationAggregation, type ConversationMode, type ConversationTurn, type ConversationTurnInput, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostGrader, type CostGraderConfig, type CostGraderOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_GRADER_TEMPLATE, DEFAULT_THRESHOLD, type DependencyFailurePolicy, type DependencyResult, type DepsScanResult, DeterministicAssertionGrader, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithGraderConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsGraderConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type 
EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTargetRef, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsGrader, type ExecutionMetricsGraderConfig, type ExecutionMetricsGraderOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyGrader, type FieldAccuracyGraderConfig, type FieldAccuracyGraderOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type Grader, type GraderConfig, type GraderDispatchContext, type GraderFactory, type GraderFactoryFn, type GraderKind, GraderRegistry, type GraderResult, type IcontainsAllGraderConfig, type IcontainsAnyGraderConfig, type IcontainsGraderConfig, type InlineAssertEvaluatorConfig, type IsJsonGraderConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyGrader, type LatencyGraderConfig, type LatencyGraderOptions, LlmGrader, type LlmGraderConfig, type LlmGraderOptions, type LlmGraderPromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type PreparedResultsRepoBranch, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexGraderConfig, type 
RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, RunBudgetTracker, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerGrader, type SkillTriggerGraderConfig, type StartsWithGraderConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TargetHooksConfig, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageGrader, type TokenUsageGraderConfig, type TokenUsageGraderOptions, type ToolCall, type ToolTestMessage, type ToolTrajectoryExpectedItem, ToolTrajectoryGrader, type ToolTrajectoryGraderConfig, type ToolTrajectoryGraderOptions, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptReplayEntry, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type TsEvalResult, type TurnFailurePolicy, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceEnvConfig, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addBenchmark, assembleLlmGraderPrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, 
consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveBenchmarkId, deriveCategory, detectFormat, directPushResults, directorySizeBytes, discoverAssertions, discoverBenchmarks, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetRefsFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, formatToolCalls, freeformEvaluationSchema, generateRubrics, getAgentvConfigDir, getAgentvHome, getBenchmark, getBenchmarksRegistryPath, getOutputFilenames, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, groupTranscriptJsonLines, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isGraderKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, killAllTrackedChildren, listTargetNames, loadBenchmarkRegistry, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, loadTsEvalFile, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseEnvOutput, parseJsonFromText, parseJsonSafe, parseYamlValue, prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeBenchmark, resolveAndCreateProvider, 
resolveDelegatedTargetDefinition, resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runBeforeSessionHook, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveBenchmarkRegistry, scanRepoDeps, scoreRangeEvaluationSchema, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLines, tokensPerTool, touchBenchmark, trackChild, trackedChildCount, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
|
|
4945
|
+
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type BenchmarkEntry, type BenchmarkRegistry, type BenchmarkSource, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildGraderResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeGrader, type CodeGraderConfig, type CodeGraderOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeGrader, type CompositeGraderConfig, type CompositeGraderOptions, type ConfidenceIntervalAggregation, type ContainsAllGraderConfig, type ContainsAnyGraderConfig, type ContainsGraderConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type ConversationAggregation, type ConversationMode, type ConversationTurn, type ConversationTurnInput, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostGrader, type CostGraderConfig, type CostGraderOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_GRADER_TEMPLATE, DEFAULT_THRESHOLD, type DependencyFailurePolicy, type DependencyResult, type DepsScanResult, DeterministicAssertionGrader, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithGraderConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsGraderConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type 
EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTargetRef, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsGrader, type ExecutionMetricsGraderConfig, type ExecutionMetricsGraderOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyGrader, type FieldAccuracyGraderConfig, type FieldAccuracyGraderOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type Grader, type GraderConfig, type GraderDispatchContext, type GraderFactory, type GraderFactoryFn, type GraderKind, GraderRegistry, type GraderResult, type IcontainsAllGraderConfig, type IcontainsAnyGraderConfig, type IcontainsGraderConfig, type InlineAssertEvaluatorConfig, type IsJsonGraderConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyGrader, type LatencyGraderConfig, type LatencyGraderOptions, LlmGrader, type LlmGraderConfig, type LlmGraderOptions, type LlmGraderPromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type PreparedResultsRepoBranch, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type 
RegexGraderConfig, type RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, RunBudgetTracker, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerGrader, type SkillTriggerGraderConfig, type StartsWithGraderConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TargetHooksConfig, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageGrader, type TokenUsageGraderConfig, type TokenUsageGraderOptions, type ToolCall, type ToolTestMessage, type ToolTrajectoryExpectedItem, ToolTrajectoryGrader, type ToolTrajectoryGraderConfig, type ToolTrajectoryGraderOptions, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptReplayEntry, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type TsEvalResult, type TurnFailurePolicy, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceEnvConfig, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addBenchmark, assembleLlmGraderPrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, 
consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveBenchmarkId, deriveCategory, detectFormat, directPushResults, directorySizeBytes, discoverAssertions, discoverBenchmarks, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetRefsFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, formatToolCalls, freeformEvaluationSchema, generateRubrics, getAgentvConfigDir, getAgentvHome, getBenchmark, getBenchmarksRegistryPath, getOutputFilenames, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, groupTranscriptJsonLines, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isGraderKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, killAllTrackedChildren, listTargetNames, loadBenchmarkRegistry, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, loadTsEvalFile, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseEnvOutput, parseJsonFromText, parseJsonSafe, parseYamlValue, prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeBenchmark, resolveAndCreateProvider, 
resolveDelegatedTargetDefinition, resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runBeforeSessionHook, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveBenchmarkRegistry, scanRepoDeps, scoreRangeEvaluationSchema, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncBenchmark, syncBenchmarks, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLines, tokensPerTool, touchBenchmark, trackChild, trackedChildCount, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
|
package/dist/index.d.ts
CHANGED
|
@@ -1796,6 +1796,8 @@ interface ClaudeResolvedConfig {
|
|
|
1796
1796
|
readonly logFormat?: 'summary' | 'json';
|
|
1797
1797
|
/** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
|
|
1798
1798
|
readonly streamLog?: false | 'raw' | 'summary';
|
|
1799
|
+
/** When true (default), passes --dangerously-skip-permissions to the Claude CLI. Matches ClaudeSdkProvider behavior. */
|
|
1800
|
+
readonly bypassPermissions?: boolean;
|
|
1799
1801
|
}
|
|
1800
1802
|
interface MockResolvedConfig {
|
|
1801
1803
|
readonly response?: string;
|
|
@@ -4269,9 +4271,16 @@ declare function getWorkspacePoolRoot(): string;
|
|
|
4269
4271
|
* - id: my-app
|
|
4270
4272
|
* name: My App
|
|
4271
4273
|
* path: /home/user/projects/my-app
|
|
4274
|
+
* source:
|
|
4275
|
+
* url: ${{ BENCHMARK_REPO_URL }}
|
|
4276
|
+
* ref: ${{ BENCHMARK_REPO_REF:-main }}
|
|
4272
4277
|
* added_at: "2026-03-20T10:00:00Z"
|
|
4273
4278
|
* last_opened_at: "2026-03-30T14:00:00Z"
|
|
4274
4279
|
*
|
|
4280
|
+
* The optional `source` field enables remote sync via syncBenchmarks():
|
|
4281
|
+
* first run — git clone --depth 1 --filter=blob:none
|
|
4282
|
+
* subsequent runs — git pull --ff-only
|
|
4283
|
+
*
|
|
4275
4284
|
* Concurrency: the registry assumes a single writer. All mutating calls
|
|
4276
4285
|
* (add/remove/touchBenchmark) do read-modify-write on benchmarks.yaml
|
|
4277
4286
|
* without a lock. Studio's HTTP handlers are serialized by Node's
|
|
@@ -4284,12 +4293,17 @@ declare function getWorkspacePoolRoot(): string;
|
|
|
4284
4293
|
* - discoverBenchmarks() is a one-shot filesystem utility for bulk
|
|
4285
4294
|
* registration; it does not run in the request path.
|
|
4286
4295
|
*/
|
|
4296
|
+
interface BenchmarkSource {
|
|
4297
|
+
url: string;
|
|
4298
|
+
ref: string;
|
|
4299
|
+
}
|
|
4287
4300
|
interface BenchmarkEntry {
|
|
4288
4301
|
id: string;
|
|
4289
4302
|
name: string;
|
|
4290
4303
|
path: string;
|
|
4291
4304
|
addedAt: string;
|
|
4292
4305
|
lastOpenedAt: string;
|
|
4306
|
+
source?: BenchmarkSource;
|
|
4293
4307
|
}
|
|
4294
4308
|
interface BenchmarkRegistry {
|
|
4295
4309
|
benchmarks: BenchmarkEntry[];
|
|
@@ -4328,6 +4342,34 @@ declare function touchBenchmark(benchmarkId: string): void;
|
|
|
4328
4342
|
*/
|
|
4329
4343
|
declare function discoverBenchmarks(rootDir: string, maxDepth?: number): string[];
|
|
4330
4344
|
|
|
4345
|
+
/**
|
|
4346
|
+
* Benchmark sync — pulls remote git repos to the local path declared in the
|
|
4347
|
+
* benchmark registry before Studio/eval startup.
|
|
4348
|
+
*
|
|
4349
|
+
* Sync is one-shot only, triggered by the Studio UI "Sync" button or the
|
|
4350
|
+
* `agentv benchmark sync` CLI command. There is no daemon or continuous mode.
|
|
4351
|
+
*
|
|
4352
|
+
* First run — git clone --depth 1 --filter=blob:none --branch <ref> <url> <path>
|
|
4353
|
+
* Subsequent — git pull --ff-only (when <path>/.git already exists)
|
|
4354
|
+
*
|
|
4355
|
+
* Usage:
|
|
4356
|
+
* import { syncBenchmarks } from './benchmark-sync.js';
|
|
4357
|
+
* await syncBenchmarks(registry.benchmarks);
|
|
4358
|
+
*/
|
|
4359
|
+
|
|
4360
|
+
/**
|
|
4361
|
+
* Clone or pull a single benchmark entry from its declared source.
|
|
4362
|
+
* - No .git present: shallow clone into entry.path.
|
|
4363
|
+
* - .git present: git pull --ff-only to update in place.
|
|
4364
|
+
* Throws on git error or missing source.
|
|
4365
|
+
*/
|
|
4366
|
+
declare function syncBenchmark(entry: BenchmarkEntry): Promise<void>;
|
|
4367
|
+
/**
|
|
4368
|
+
* Iterate benchmark entries and sync any that have a source declared.
|
|
4369
|
+
* Entries without source are skipped silently.
|
|
4370
|
+
*/
|
|
4371
|
+
declare function syncBenchmarks(entries: BenchmarkEntry[]): Promise<void>;
|
|
4372
|
+
|
|
4331
4373
|
/**
|
|
4332
4374
|
* Trims an EvaluationResult for baseline storage.
|
|
4333
4375
|
* Strips large debug/audit fields (denylist approach) while preserving
|
|
@@ -4900,4 +4942,4 @@ type AgentKernel = {
|
|
|
4900
4942
|
};
|
|
4901
4943
|
declare function createAgentKernel(): AgentKernel;
|
|
4902
4944
|
|
|
4903
|
-
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type BenchmarkEntry, type BenchmarkRegistry, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildGraderResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeGrader, type CodeGraderConfig, type CodeGraderOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeGrader, type CompositeGraderConfig, type CompositeGraderOptions, type ConfidenceIntervalAggregation, type ContainsAllGraderConfig, type ContainsAnyGraderConfig, type ContainsGraderConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type ConversationAggregation, type ConversationMode, type ConversationTurn, type ConversationTurnInput, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostGrader, type CostGraderConfig, type CostGraderOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_GRADER_TEMPLATE, DEFAULT_THRESHOLD, type DependencyFailurePolicy, type DependencyResult, type DepsScanResult, DeterministicAssertionGrader, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithGraderConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsGraderConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type 
EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTargetRef, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsGrader, type ExecutionMetricsGraderConfig, type ExecutionMetricsGraderOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyGrader, type FieldAccuracyGraderConfig, type FieldAccuracyGraderOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type Grader, type GraderConfig, type GraderDispatchContext, type GraderFactory, type GraderFactoryFn, type GraderKind, GraderRegistry, type GraderResult, type IcontainsAllGraderConfig, type IcontainsAnyGraderConfig, type IcontainsGraderConfig, type InlineAssertEvaluatorConfig, type IsJsonGraderConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyGrader, type LatencyGraderConfig, type LatencyGraderOptions, LlmGrader, type LlmGraderConfig, type LlmGraderOptions, type LlmGraderPromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type PreparedResultsRepoBranch, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexGraderConfig, type 
RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, RunBudgetTracker, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerGrader, type SkillTriggerGraderConfig, type StartsWithGraderConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TargetHooksConfig, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageGrader, type TokenUsageGraderConfig, type TokenUsageGraderOptions, type ToolCall, type ToolTestMessage, type ToolTrajectoryExpectedItem, ToolTrajectoryGrader, type ToolTrajectoryGraderConfig, type ToolTrajectoryGraderOptions, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptReplayEntry, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type TsEvalResult, type TurnFailurePolicy, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceEnvConfig, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addBenchmark, assembleLlmGraderPrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, 
consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveBenchmarkId, deriveCategory, detectFormat, directPushResults, directorySizeBytes, discoverAssertions, discoverBenchmarks, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetRefsFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, formatToolCalls, freeformEvaluationSchema, generateRubrics, getAgentvConfigDir, getAgentvHome, getBenchmark, getBenchmarksRegistryPath, getOutputFilenames, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, groupTranscriptJsonLines, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isGraderKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, killAllTrackedChildren, listTargetNames, loadBenchmarkRegistry, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, loadTsEvalFile, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseEnvOutput, parseJsonFromText, parseJsonSafe, parseYamlValue, prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeBenchmark, resolveAndCreateProvider, 
resolveDelegatedTargetDefinition, resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runBeforeSessionHook, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveBenchmarkRegistry, scanRepoDeps, scoreRangeEvaluationSchema, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLines, tokensPerTool, touchBenchmark, trackChild, trackedChildCount, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
|
|
4945
|
+
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type BenchmarkEntry, type BenchmarkRegistry, type BenchmarkSource, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildGraderResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeGrader, type CodeGraderConfig, type CodeGraderOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeGrader, type CompositeGraderConfig, type CompositeGraderOptions, type ConfidenceIntervalAggregation, type ContainsAllGraderConfig, type ContainsAnyGraderConfig, type ContainsGraderConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type ConversationAggregation, type ConversationMode, type ConversationTurn, type ConversationTurnInput, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostGrader, type CostGraderConfig, type CostGraderOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_GRADER_TEMPLATE, DEFAULT_THRESHOLD, type DependencyFailurePolicy, type DependencyResult, type DepsScanResult, DeterministicAssertionGrader, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithGraderConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsGraderConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type 
EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTargetRef, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsGrader, type ExecutionMetricsGraderConfig, type ExecutionMetricsGraderOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyGrader, type FieldAccuracyGraderConfig, type FieldAccuracyGraderOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type Grader, type GraderConfig, type GraderDispatchContext, type GraderFactory, type GraderFactoryFn, type GraderKind, GraderRegistry, type GraderResult, type IcontainsAllGraderConfig, type IcontainsAnyGraderConfig, type IcontainsGraderConfig, type InlineAssertEvaluatorConfig, type IsJsonGraderConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyGrader, type LatencyGraderConfig, type LatencyGraderOptions, LlmGrader, type LlmGraderConfig, type LlmGraderOptions, type LlmGraderPromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type PreparedResultsRepoBranch, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type 
RegexGraderConfig, type RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, RunBudgetTracker, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerGrader, type SkillTriggerGraderConfig, type StartsWithGraderConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TargetHooksConfig, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageGrader, type TokenUsageGraderConfig, type TokenUsageGraderOptions, type ToolCall, type ToolTestMessage, type ToolTrajectoryExpectedItem, ToolTrajectoryGrader, type ToolTrajectoryGraderConfig, type ToolTrajectoryGraderOptions, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptReplayEntry, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type TsEvalResult, type TurnFailurePolicy, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceEnvConfig, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addBenchmark, assembleLlmGraderPrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, 
consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveBenchmarkId, deriveCategory, detectFormat, directPushResults, directorySizeBytes, discoverAssertions, discoverBenchmarks, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetRefsFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, formatToolCalls, freeformEvaluationSchema, generateRubrics, getAgentvConfigDir, getAgentvHome, getBenchmark, getBenchmarksRegistryPath, getOutputFilenames, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, groupTranscriptJsonLines, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isGraderKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, killAllTrackedChildren, listTargetNames, loadBenchmarkRegistry, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, loadTsEvalFile, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseEnvOutput, parseJsonFromText, parseJsonSafe, parseYamlValue, prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeBenchmark, resolveAndCreateProvider, 
resolveDelegatedTargetDefinition, resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runBeforeSessionHook, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveBenchmarkRegistry, scanRepoDeps, scoreRangeEvaluationSchema, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncBenchmark, syncBenchmarks, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLines, tokensPerTool, touchBenchmark, trackChild, trackedChildCount, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
|
package/dist/index.js
CHANGED
|
@@ -132,7 +132,7 @@ import {
|
|
|
132
132
|
tokensPerTool,
|
|
133
133
|
trackChild,
|
|
134
134
|
trackedChildCount
|
|
135
|
-
} from "./chunk-JNBHD34F.js";
|
|
135
|
+
} from "./chunk-SCC35F3L.js";
|
|
136
136
|
import {
|
|
137
137
|
COMMON_TARGET_SETTINGS,
|
|
138
138
|
TEST_MESSAGE_ROLES,
|
|
@@ -157,9 +157,9 @@ import {
|
|
|
157
157
|
resolveDelegatedTargetDefinition,
|
|
158
158
|
resolveFileReference,
|
|
159
159
|
resolveTargetDefinition
|
|
160
|
-
} from "./chunk-7LQI7772.js";
|
|
160
|
+
} from "./chunk-YFXMMBUG.js";
|
|
161
161
|
import "./chunk-3WGHC7LC.js";
|
|
162
|
-
import "./chunk-M5X2KMEA.js";
|
|
162
|
+
import "./chunk-YDFZ7XN3.js";
|
|
163
163
|
import {
|
|
164
164
|
OtlpJsonFileExporter
|
|
165
165
|
} from "./chunk-KPSI5CSL.js";
|
|
@@ -486,12 +486,12 @@ var CONFIG_FILE_NAMES = [
|
|
|
486
486
|
".agentv/config.js"
|
|
487
487
|
];
|
|
488
488
|
async function loadTsConfig(projectRoot) {
|
|
489
|
-
const { existsSync:
|
|
489
|
+
const { existsSync: existsSync4 } = await import("node:fs");
|
|
490
490
|
const { pathToFileURL } = await import("node:url");
|
|
491
491
|
const { join } = await import("node:path");
|
|
492
492
|
for (const fileName of CONFIG_FILE_NAMES) {
|
|
493
493
|
const filePath = join(projectRoot, fileName);
|
|
494
|
-
if (!
|
|
494
|
+
if (!existsSync4(filePath)) {
|
|
495
495
|
continue;
|
|
496
496
|
}
|
|
497
497
|
try {
|
|
@@ -1053,22 +1053,33 @@ function fromYaml(raw) {
|
|
|
1053
1053
|
if (typeof e.id !== "string" || typeof e.name !== "string" || typeof e.path !== "string") {
|
|
1054
1054
|
return null;
|
|
1055
1055
|
}
|
|
1056
|
-
|
|
1056
|
+
const entry = {
|
|
1057
1057
|
id: e.id,
|
|
1058
1058
|
name: e.name,
|
|
1059
1059
|
path: e.path,
|
|
1060
1060
|
addedAt: typeof e.added_at === "string" ? e.added_at : "",
|
|
1061
1061
|
lastOpenedAt: typeof e.last_opened_at === "string" ? e.last_opened_at : ""
|
|
1062
1062
|
};
|
|
1063
|
+
if (e.source && typeof e.source === "object") {
|
|
1064
|
+
const s = e.source;
|
|
1065
|
+
if (typeof s.url === "string" && typeof s.ref === "string") {
|
|
1066
|
+
entry.source = { url: s.url, ref: s.ref };
|
|
1067
|
+
}
|
|
1068
|
+
}
|
|
1069
|
+
return entry;
|
|
1063
1070
|
}
|
|
1064
1071
|
function toYaml(entry) {
|
|
1065
|
-
|
|
1072
|
+
const yaml = {
|
|
1066
1073
|
id: entry.id,
|
|
1067
1074
|
name: entry.name,
|
|
1068
1075
|
path: entry.path,
|
|
1069
1076
|
added_at: entry.addedAt,
|
|
1070
1077
|
last_opened_at: entry.lastOpenedAt
|
|
1071
1078
|
};
|
|
1079
|
+
if (entry.source) {
|
|
1080
|
+
yaml.source = { url: entry.source.url, ref: entry.source.ref };
|
|
1081
|
+
}
|
|
1082
|
+
return yaml;
|
|
1072
1083
|
}
|
|
1073
1084
|
function loadBenchmarkRegistry() {
|
|
1074
1085
|
const registryPath = getBenchmarksRegistryPath();
|
|
@@ -1081,7 +1092,8 @@ function loadBenchmarkRegistry() {
|
|
|
1081
1092
|
if (!parsed || typeof parsed !== "object") {
|
|
1082
1093
|
return { benchmarks: [] };
|
|
1083
1094
|
}
|
|
1084
|
-
const
|
|
1095
|
+
const env = process.env;
|
|
1096
|
+
const benchmarks = Array.isArray(parsed.benchmarks) ? parsed.benchmarks.map((e) => fromYaml(interpolateEnv(e, env))).filter((e) => e !== null) : [];
|
|
1085
1097
|
return { benchmarks };
|
|
1086
1098
|
} catch {
|
|
1087
1099
|
return { benchmarks: [] };
|
|
@@ -1180,6 +1192,34 @@ function discoverBenchmarks(rootDir, maxDepth = 2) {
|
|
|
1180
1192
|
return results.sort();
|
|
1181
1193
|
}
|
|
1182
1194
|
|
|
1195
|
+
// src/benchmark-sync.ts
|
|
1196
|
+
import * as childProcess from "node:child_process";
|
|
1197
|
+
import { existsSync as existsSync3 } from "node:fs";
|
|
1198
|
+
async function syncBenchmark(entry) {
|
|
1199
|
+
if (!entry.source) {
|
|
1200
|
+
throw new Error(`Benchmark '${entry.id}' has no source defined`);
|
|
1201
|
+
}
|
|
1202
|
+
const { url, ref } = entry.source;
|
|
1203
|
+
const dest = entry.path;
|
|
1204
|
+
if (existsSync3(`${dest}/.git`)) {
|
|
1205
|
+
childProcess.execFileSync("git", ["-C", dest, "pull", "--ff-only"], { stdio: "inherit" });
|
|
1206
|
+
} else {
|
|
1207
|
+
childProcess.execFileSync(
|
|
1208
|
+
"git",
|
|
1209
|
+
["clone", "--depth", "1", "--filter=blob:none", "--branch", ref, url, dest],
|
|
1210
|
+
{ stdio: "inherit" }
|
|
1211
|
+
);
|
|
1212
|
+
}
|
|
1213
|
+
}
|
|
1214
|
+
async function syncBenchmarks(entries) {
|
|
1215
|
+
for (const entry of entries) {
|
|
1216
|
+
if (!entry.source) continue;
|
|
1217
|
+
console.log(`Syncing benchmark '${entry.id}' from ${entry.source.url}...`);
|
|
1218
|
+
await syncBenchmark(entry);
|
|
1219
|
+
console.log(`Benchmark '${entry.id}' synced.`);
|
|
1220
|
+
}
|
|
1221
|
+
}
|
|
1222
|
+
|
|
1183
1223
|
// src/evaluation/baseline.ts
|
|
1184
1224
|
var STRIPPED_TOP_LEVEL_FIELDS = /* @__PURE__ */ new Set([
|
|
1185
1225
|
"requests",
|
|
@@ -2629,6 +2669,8 @@ export {
|
|
|
2629
2669
|
subscribeToCopilotSdkLogEntries,
|
|
2630
2670
|
subscribeToPiLogEntries,
|
|
2631
2671
|
substituteVariables,
|
|
2672
|
+
syncBenchmark,
|
|
2673
|
+
syncBenchmarks,
|
|
2632
2674
|
syncResultsRepo,
|
|
2633
2675
|
toCamelCaseDeep,
|
|
2634
2676
|
toSnakeCaseDeep,
|