@agentv/core 4.21.0-next.1 → 4.22.0-next.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-WCW3V6QJ.js → chunk-B3BLJRYI.js} +26 -4
- package/dist/chunk-B3BLJRYI.js.map +1 -0
- package/dist/index.cjs +96 -1
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +107 -1
- package/dist/index.d.ts +107 -1
- package/dist/index.js +72 -1
- package/dist/index.js.map +1 -1
- package/dist/{ts-eval-loader-HPIPE72C.js → ts-eval-loader-PA4YFM5D.js} +2 -2
- package/package.json +1 -1
- package/dist/chunk-WCW3V6QJ.js.map +0 -1
- /package/dist/{ts-eval-loader-HPIPE72C.js.map → ts-eval-loader-PA4YFM5D.js.map} +0 -0
package/dist/index.d.cts
CHANGED
|
@@ -1912,6 +1912,10 @@ type ResultsExportConfig = {
|
|
|
1912
1912
|
readonly auto_push?: boolean;
|
|
1913
1913
|
readonly branch_prefix?: string;
|
|
1914
1914
|
};
|
|
1915
|
+
type HooksConfig = {
|
|
1916
|
+
/** Shell command to run once at agentv startup. stdout is parsed for env var exports. */
|
|
1917
|
+
readonly before_session?: string;
|
|
1918
|
+
};
|
|
1915
1919
|
type AgentVConfig$1 = {
|
|
1916
1920
|
readonly required_version?: string;
|
|
1917
1921
|
readonly eval_patterns?: readonly string[];
|
|
@@ -1919,6 +1923,7 @@ type AgentVConfig$1 = {
|
|
|
1919
1923
|
readonly results?: {
|
|
1920
1924
|
readonly export?: ResultsExportConfig;
|
|
1921
1925
|
};
|
|
1926
|
+
readonly hooks?: HooksConfig;
|
|
1922
1927
|
};
|
|
1923
1928
|
/**
|
|
1924
1929
|
* Load optional .agentv/config.yaml configuration file.
|
|
@@ -2937,6 +2942,36 @@ declare const rubricEvaluationSchema: z.ZodObject<{
|
|
|
2937
2942
|
}[];
|
|
2938
2943
|
overall_reasoning: string;
|
|
2939
2944
|
}>;
|
|
2945
|
+
declare const scoreRangeEvaluationSchema: z.ZodObject<{
|
|
2946
|
+
checks: z.ZodArray<z.ZodObject<{
|
|
2947
|
+
id: z.ZodString;
|
|
2948
|
+
score: z.ZodNumber;
|
|
2949
|
+
reasoning: z.ZodOptional<z.ZodString>;
|
|
2950
|
+
}, "strip", z.ZodTypeAny, {
|
|
2951
|
+
id: string;
|
|
2952
|
+
score: number;
|
|
2953
|
+
reasoning?: string | undefined;
|
|
2954
|
+
}, {
|
|
2955
|
+
id: string;
|
|
2956
|
+
score: number;
|
|
2957
|
+
reasoning?: string | undefined;
|
|
2958
|
+
}>, "many">;
|
|
2959
|
+
overall_reasoning: z.ZodOptional<z.ZodString>;
|
|
2960
|
+
}, "strip", z.ZodTypeAny, {
|
|
2961
|
+
checks: {
|
|
2962
|
+
id: string;
|
|
2963
|
+
score: number;
|
|
2964
|
+
reasoning?: string | undefined;
|
|
2965
|
+
}[];
|
|
2966
|
+
overall_reasoning?: string | undefined;
|
|
2967
|
+
}, {
|
|
2968
|
+
checks: {
|
|
2969
|
+
id: string;
|
|
2970
|
+
score: number;
|
|
2971
|
+
reasoning?: string | undefined;
|
|
2972
|
+
}[];
|
|
2973
|
+
overall_reasoning?: string | undefined;
|
|
2974
|
+
}>;
|
|
2940
2975
|
|
|
2941
2976
|
declare class LlmGrader implements Grader {
|
|
2942
2977
|
readonly kind = "llm-grader";
|
|
@@ -3537,6 +3572,21 @@ declare const AgentVConfigSchema: z.ZodObject<{
|
|
|
3537
3572
|
maxDurationMs?: number | undefined;
|
|
3538
3573
|
maxCostUsd?: number | undefined;
|
|
3539
3574
|
}>>;
|
|
3575
|
+
/** Lifecycle hooks */
|
|
3576
|
+
hooks: z.ZodOptional<z.ZodObject<{
|
|
3577
|
+
/**
|
|
3578
|
+
* Shell command to run once at agentv startup, before any command executes.
|
|
3579
|
+
* stdout is parsed for env var exports (`KEY=value` or `export KEY="value"`)
|
|
3580
|
+
* and injected into process.env. Keys already set in the environment are
|
|
3581
|
+
* not overwritten — existing env always takes priority.
|
|
3582
|
+
* stderr is forwarded to the user. Non-zero exit aborts with an error.
|
|
3583
|
+
*/
|
|
3584
|
+
beforeSession: z.ZodOptional<z.ZodString>;
|
|
3585
|
+
}, "strip", z.ZodTypeAny, {
|
|
3586
|
+
beforeSession?: string | undefined;
|
|
3587
|
+
}, {
|
|
3588
|
+
beforeSession?: string | undefined;
|
|
3589
|
+
}>>;
|
|
3540
3590
|
}, "strip", z.ZodTypeAny, {
|
|
3541
3591
|
execution?: {
|
|
3542
3592
|
workers?: number | undefined;
|
|
@@ -3546,6 +3596,9 @@ declare const AgentVConfigSchema: z.ZodObject<{
|
|
|
3546
3596
|
keepWorkspaces?: boolean | undefined;
|
|
3547
3597
|
otelFile?: string | undefined;
|
|
3548
3598
|
} | undefined;
|
|
3599
|
+
hooks?: {
|
|
3600
|
+
beforeSession?: string | undefined;
|
|
3601
|
+
} | undefined;
|
|
3549
3602
|
cache?: {
|
|
3550
3603
|
enabled?: boolean | undefined;
|
|
3551
3604
|
path?: string | undefined;
|
|
@@ -3567,6 +3620,9 @@ declare const AgentVConfigSchema: z.ZodObject<{
|
|
|
3567
3620
|
keepWorkspaces?: boolean | undefined;
|
|
3568
3621
|
otelFile?: string | undefined;
|
|
3569
3622
|
} | undefined;
|
|
3623
|
+
hooks?: {
|
|
3624
|
+
beforeSession?: string | undefined;
|
|
3625
|
+
} | undefined;
|
|
3570
3626
|
cache?: {
|
|
3571
3627
|
enabled?: boolean | undefined;
|
|
3572
3628
|
path?: string | undefined;
|
|
@@ -4346,6 +4402,56 @@ declare function discoverAssertions(registry: GraderRegistry, baseDir: string):
|
|
|
4346
4402
|
*/
|
|
4347
4403
|
declare function discoverGraders(registry: GraderRegistry, baseDir: string): Promise<string[]>;
|
|
4348
4404
|
|
|
4405
|
+
/**
|
|
4406
|
+
* Session hook execution for AgentV.
|
|
4407
|
+
*
|
|
4408
|
+
* Runs a shell command once at agentv startup and injects exported environment
|
|
4409
|
+
* variables into the current process. This lets projects fetch secrets at
|
|
4410
|
+
* runtime (e.g. from a vault) without needing a wrapper script.
|
|
4411
|
+
*
|
|
4412
|
+
* ## How it works
|
|
4413
|
+
*
|
|
4414
|
+
* 1. The command is run via `sh -c` (or `cmd /c` on Windows).
|
|
4415
|
+
* 2. stdout is captured and parsed for env var exports.
|
|
4416
|
+
* 3. stderr is forwarded to the process stderr so the user sees output.
|
|
4417
|
+
* 4. Non-zero exit aborts with a clear error.
|
|
4418
|
+
* 5. Parsed keys are injected into `process.env` — only for keys not already
|
|
4419
|
+
* set, so existing env always wins.
|
|
4420
|
+
*
|
|
4421
|
+
* ## Supported output formats
|
|
4422
|
+
*
|
|
4423
|
+
* Both shell-export and dotenv formats are accepted:
|
|
4424
|
+
* export KEY="value" (shell export — quotes optional)
|
|
4425
|
+
* KEY=value (dotenv — no export prefix)
|
|
4426
|
+
*
|
|
4427
|
+
* Lines that don't match either pattern are silently ignored.
|
|
4428
|
+
*
|
|
4429
|
+
* @module
|
|
4430
|
+
*/
|
|
4431
|
+
/**
|
|
4432
|
+
* Parse env var lines from hook stdout.
|
|
4433
|
+
*
|
|
4434
|
+
* Accepts:
|
|
4435
|
+
* export KEY="value" → { KEY: "value" }
|
|
4436
|
+
* export KEY=value → { KEY: "value" }
|
|
4437
|
+
* KEY=value → { KEY: "value" }
|
|
4438
|
+
*
|
|
4439
|
+
* Strips surrounding single or double quotes from values.
|
|
4440
|
+
* Skips lines with empty keys or values that look like shell syntax.
|
|
4441
|
+
*/
|
|
4442
|
+
declare function parseEnvOutput(stdout: string): Record<string, string>;
|
|
4443
|
+
/**
|
|
4444
|
+
* Run the before_session hook command and inject exported env vars into process.env.
|
|
4445
|
+
*
|
|
4446
|
+
* - Runs via shell (`sh -c` on POSIX, `cmd /c` on Windows)
|
|
4447
|
+
* - Captured stdout is parsed for env vars; stderr is forwarded to process.stderr
|
|
4448
|
+
* - Non-zero exit throws an Error with the command and exit code
|
|
4449
|
+
* - Keys already set in process.env are NOT overwritten
|
|
4450
|
+
*
|
|
4451
|
+
* @param command Shell command string to execute
|
|
4452
|
+
*/
|
|
4453
|
+
declare function runBeforeSessionHook(command: string): void;
|
|
4454
|
+
|
|
4349
4455
|
/**
|
|
4350
4456
|
* Core types for the transcript import pipeline.
|
|
4351
4457
|
*
|
|
@@ -4663,4 +4769,4 @@ type AgentKernel = {
|
|
|
4663
4769
|
};
|
|
4664
4770
|
declare function createAgentKernel(): AgentKernel;
|
|
4665
4771
|
|
|
4666
|
-
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type BenchmarkEntry, type BenchmarkRegistry, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildGraderResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeGrader, type CodeGraderConfig, type CodeGraderOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeGrader, type CompositeGraderConfig, type CompositeGraderOptions, type ConfidenceIntervalAggregation, type ContainsAllGraderConfig, type ContainsAnyGraderConfig, type ContainsGraderConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type ConversationAggregation, type ConversationMode, type ConversationTurn, type ConversationTurnInput, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostGrader, type CostGraderConfig, type CostGraderOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_GRADER_TEMPLATE, DEFAULT_THRESHOLD, type DependencyFailurePolicy, type DependencyResult, type DepsScanResult, DeterministicAssertionGrader, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithGraderConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsGraderConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type 
EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTargetRef, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsGrader, type ExecutionMetricsGraderConfig, type ExecutionMetricsGraderOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyGrader, type FieldAccuracyGraderConfig, type FieldAccuracyGraderOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type Grader, type GraderConfig, type GraderDispatchContext, type GraderFactory, type GraderFactoryFn, type GraderKind, GraderRegistry, type GraderResult, type IcontainsAllGraderConfig, type IcontainsAnyGraderConfig, type IcontainsGraderConfig, type InlineAssertEvaluatorConfig, type IsJsonGraderConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyGrader, type LatencyGraderConfig, type LatencyGraderOptions, LlmGrader, type LlmGraderConfig, type LlmGraderOptions, type LlmGraderPromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type PreparedResultsRepoBranch, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexGraderConfig, type 
RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, RunBudgetTracker, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerGrader, type SkillTriggerGraderConfig, type StartsWithGraderConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TargetHooksConfig, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageGrader, type TokenUsageGraderConfig, type TokenUsageGraderOptions, type ToolCall, type ToolTestMessage, type ToolTrajectoryExpectedItem, ToolTrajectoryGrader, type ToolTrajectoryGraderConfig, type ToolTrajectoryGraderOptions, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptReplayEntry, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type TsEvalResult, type TurnFailurePolicy, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addBenchmark, assembleLlmGraderPrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, 
createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveBenchmarkId, deriveCategory, detectFormat, directorySizeBytes, discoverAssertions, discoverBenchmarks, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetRefsFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, formatToolCalls, freeformEvaluationSchema, generateRubrics, getAgentvConfigDir, getAgentvHome, getBenchmark, getBenchmarksRegistryPath, getOutputFilenames, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, groupTranscriptJsonLines, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isGraderKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadBenchmarkRegistry, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, loadTsEvalFile, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseJsonFromText, parseJsonSafe, prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeBenchmark, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, 
resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveBenchmarkRegistry, scanRepoDeps, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLines, tokensPerTool, touchBenchmark, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
|
|
4772
|
+
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type BenchmarkEntry, type BenchmarkRegistry, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildGraderResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeGrader, type CodeGraderConfig, type CodeGraderOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeGrader, type CompositeGraderConfig, type CompositeGraderOptions, type ConfidenceIntervalAggregation, type ContainsAllGraderConfig, type ContainsAnyGraderConfig, type ContainsGraderConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type ConversationAggregation, type ConversationMode, type ConversationTurn, type ConversationTurnInput, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostGrader, type CostGraderConfig, type CostGraderOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_GRADER_TEMPLATE, DEFAULT_THRESHOLD, type DependencyFailurePolicy, type DependencyResult, type DepsScanResult, DeterministicAssertionGrader, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithGraderConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsGraderConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type 
EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTargetRef, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsGrader, type ExecutionMetricsGraderConfig, type ExecutionMetricsGraderOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyGrader, type FieldAccuracyGraderConfig, type FieldAccuracyGraderOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type Grader, type GraderConfig, type GraderDispatchContext, type GraderFactory, type GraderFactoryFn, type GraderKind, GraderRegistry, type GraderResult, type IcontainsAllGraderConfig, type IcontainsAnyGraderConfig, type IcontainsGraderConfig, type InlineAssertEvaluatorConfig, type IsJsonGraderConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyGrader, type LatencyGraderConfig, type LatencyGraderOptions, LlmGrader, type LlmGraderConfig, type LlmGraderOptions, type LlmGraderPromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type PreparedResultsRepoBranch, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexGraderConfig, type 
RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, RunBudgetTracker, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerGrader, type SkillTriggerGraderConfig, type StartsWithGraderConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TargetHooksConfig, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageGrader, type TokenUsageGraderConfig, type TokenUsageGraderOptions, type ToolCall, type ToolTestMessage, type ToolTrajectoryExpectedItem, ToolTrajectoryGrader, type ToolTrajectoryGraderConfig, type ToolTrajectoryGraderOptions, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptReplayEntry, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type TsEvalResult, type TurnFailurePolicy, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addBenchmark, assembleLlmGraderPrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, 
createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveBenchmarkId, deriveCategory, detectFormat, directorySizeBytes, discoverAssertions, discoverBenchmarks, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetRefsFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, formatToolCalls, freeformEvaluationSchema, generateRubrics, getAgentvConfigDir, getAgentvHome, getBenchmark, getBenchmarksRegistryPath, getOutputFilenames, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, groupTranscriptJsonLines, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isGraderKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadBenchmarkRegistry, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, loadTsEvalFile, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseEnvOutput, parseJsonFromText, parseJsonSafe, prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeBenchmark, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, 
resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runBeforeSessionHook, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveBenchmarkRegistry, scanRepoDeps, scoreRangeEvaluationSchema, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLines, tokensPerTool, touchBenchmark, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
|
package/dist/index.d.ts
CHANGED
|
@@ -1912,6 +1912,10 @@ type ResultsExportConfig = {
|
|
|
1912
1912
|
readonly auto_push?: boolean;
|
|
1913
1913
|
readonly branch_prefix?: string;
|
|
1914
1914
|
};
|
|
1915
|
+
type HooksConfig = {
|
|
1916
|
+
/** Shell command to run once at agentv startup. stdout is parsed for env var exports. */
|
|
1917
|
+
readonly before_session?: string;
|
|
1918
|
+
};
|
|
1915
1919
|
type AgentVConfig$1 = {
|
|
1916
1920
|
readonly required_version?: string;
|
|
1917
1921
|
readonly eval_patterns?: readonly string[];
|
|
@@ -1919,6 +1923,7 @@ type AgentVConfig$1 = {
|
|
|
1919
1923
|
readonly results?: {
|
|
1920
1924
|
readonly export?: ResultsExportConfig;
|
|
1921
1925
|
};
|
|
1926
|
+
readonly hooks?: HooksConfig;
|
|
1922
1927
|
};
|
|
1923
1928
|
/**
|
|
1924
1929
|
* Load optional .agentv/config.yaml configuration file.
|
|
@@ -2937,6 +2942,36 @@ declare const rubricEvaluationSchema: z.ZodObject<{
|
|
|
2937
2942
|
}[];
|
|
2938
2943
|
overall_reasoning: string;
|
|
2939
2944
|
}>;
|
|
2945
|
+
declare const scoreRangeEvaluationSchema: z.ZodObject<{
|
|
2946
|
+
checks: z.ZodArray<z.ZodObject<{
|
|
2947
|
+
id: z.ZodString;
|
|
2948
|
+
score: z.ZodNumber;
|
|
2949
|
+
reasoning: z.ZodOptional<z.ZodString>;
|
|
2950
|
+
}, "strip", z.ZodTypeAny, {
|
|
2951
|
+
id: string;
|
|
2952
|
+
score: number;
|
|
2953
|
+
reasoning?: string | undefined;
|
|
2954
|
+
}, {
|
|
2955
|
+
id: string;
|
|
2956
|
+
score: number;
|
|
2957
|
+
reasoning?: string | undefined;
|
|
2958
|
+
}>, "many">;
|
|
2959
|
+
overall_reasoning: z.ZodOptional<z.ZodString>;
|
|
2960
|
+
}, "strip", z.ZodTypeAny, {
|
|
2961
|
+
checks: {
|
|
2962
|
+
id: string;
|
|
2963
|
+
score: number;
|
|
2964
|
+
reasoning?: string | undefined;
|
|
2965
|
+
}[];
|
|
2966
|
+
overall_reasoning?: string | undefined;
|
|
2967
|
+
}, {
|
|
2968
|
+
checks: {
|
|
2969
|
+
id: string;
|
|
2970
|
+
score: number;
|
|
2971
|
+
reasoning?: string | undefined;
|
|
2972
|
+
}[];
|
|
2973
|
+
overall_reasoning?: string | undefined;
|
|
2974
|
+
}>;
|
|
2940
2975
|
|
|
2941
2976
|
declare class LlmGrader implements Grader {
|
|
2942
2977
|
readonly kind = "llm-grader";
|
|
@@ -3537,6 +3572,21 @@ declare const AgentVConfigSchema: z.ZodObject<{
|
|
|
3537
3572
|
maxDurationMs?: number | undefined;
|
|
3538
3573
|
maxCostUsd?: number | undefined;
|
|
3539
3574
|
}>>;
|
|
3575
|
+
/** Lifecycle hooks */
|
|
3576
|
+
hooks: z.ZodOptional<z.ZodObject<{
|
|
3577
|
+
/**
|
|
3578
|
+
* Shell command to run once at agentv startup, before any command executes.
|
|
3579
|
+
* stdout is parsed for env var exports (`KEY=value` or `export KEY="value"`)
|
|
3580
|
+
* and injected into process.env. Keys already set in the environment are
|
|
3581
|
+
* not overwritten — existing env always takes priority.
|
|
3582
|
+
* stderr is forwarded to the user. Non-zero exit aborts with an error.
|
|
3583
|
+
*/
|
|
3584
|
+
beforeSession: z.ZodOptional<z.ZodString>;
|
|
3585
|
+
}, "strip", z.ZodTypeAny, {
|
|
3586
|
+
beforeSession?: string | undefined;
|
|
3587
|
+
}, {
|
|
3588
|
+
beforeSession?: string | undefined;
|
|
3589
|
+
}>>;
|
|
3540
3590
|
}, "strip", z.ZodTypeAny, {
|
|
3541
3591
|
execution?: {
|
|
3542
3592
|
workers?: number | undefined;
|
|
@@ -3546,6 +3596,9 @@ declare const AgentVConfigSchema: z.ZodObject<{
|
|
|
3546
3596
|
keepWorkspaces?: boolean | undefined;
|
|
3547
3597
|
otelFile?: string | undefined;
|
|
3548
3598
|
} | undefined;
|
|
3599
|
+
hooks?: {
|
|
3600
|
+
beforeSession?: string | undefined;
|
|
3601
|
+
} | undefined;
|
|
3549
3602
|
cache?: {
|
|
3550
3603
|
enabled?: boolean | undefined;
|
|
3551
3604
|
path?: string | undefined;
|
|
@@ -3567,6 +3620,9 @@ declare const AgentVConfigSchema: z.ZodObject<{
|
|
|
3567
3620
|
keepWorkspaces?: boolean | undefined;
|
|
3568
3621
|
otelFile?: string | undefined;
|
|
3569
3622
|
} | undefined;
|
|
3623
|
+
hooks?: {
|
|
3624
|
+
beforeSession?: string | undefined;
|
|
3625
|
+
} | undefined;
|
|
3570
3626
|
cache?: {
|
|
3571
3627
|
enabled?: boolean | undefined;
|
|
3572
3628
|
path?: string | undefined;
|
|
@@ -4346,6 +4402,56 @@ declare function discoverAssertions(registry: GraderRegistry, baseDir: string):
|
|
|
4346
4402
|
*/
|
|
4347
4403
|
declare function discoverGraders(registry: GraderRegistry, baseDir: string): Promise<string[]>;
|
|
4348
4404
|
|
|
4405
|
+
/**
|
|
4406
|
+
* Session hook execution for AgentV.
|
|
4407
|
+
*
|
|
4408
|
+
* Runs a shell command once at agentv startup and injects exported environment
|
|
4409
|
+
* variables into the current process. This lets projects fetch secrets at
|
|
4410
|
+
* runtime (e.g. from a vault) without needing a wrapper script.
|
|
4411
|
+
*
|
|
4412
|
+
* ## How it works
|
|
4413
|
+
*
|
|
4414
|
+
* 1. The command is run via `sh -c` (or `cmd /c` on Windows).
|
|
4415
|
+
* 2. stdout is captured and parsed for env var exports.
|
|
4416
|
+
* 3. stderr is forwarded to the process stderr so the user sees output.
|
|
4417
|
+
* 4. Non-zero exit aborts with a clear error.
|
|
4418
|
+
* 5. Parsed keys are injected into `process.env` — only for keys not already
|
|
4419
|
+
* set, so existing env always wins.
|
|
4420
|
+
*
|
|
4421
|
+
* ## Supported output formats
|
|
4422
|
+
*
|
|
4423
|
+
* Both shell-export and dotenv formats are accepted:
|
|
4424
|
+
* export KEY="value" (shell export — quotes optional)
|
|
4425
|
+
* KEY=value (dotenv — no export prefix)
|
|
4426
|
+
*
|
|
4427
|
+
* Lines that don't match either pattern are silently ignored.
|
|
4428
|
+
*
|
|
4429
|
+
* @module
|
|
4430
|
+
*/
|
|
4431
|
+
/**
|
|
4432
|
+
* Parse env var lines from hook stdout.
|
|
4433
|
+
*
|
|
4434
|
+
* Accepts:
|
|
4435
|
+
* export KEY="value" → { KEY: "value" }
|
|
4436
|
+
* export KEY=value → { KEY: "value" }
|
|
4437
|
+
* KEY=value → { KEY: "value" }
|
|
4438
|
+
*
|
|
4439
|
+
* Strips surrounding single or double quotes from values.
|
|
4440
|
+
* Skips lines with empty keys or values that look like shell syntax.
|
|
4441
|
+
*/
|
|
4442
|
+
declare function parseEnvOutput(stdout: string): Record<string, string>;
|
|
4443
|
+
/**
|
|
4444
|
+
* Run the before_session hook command and inject exported env vars into process.env.
|
|
4445
|
+
*
|
|
4446
|
+
* - Runs via shell (`sh -c` on POSIX, `cmd /c` on Windows)
|
|
4447
|
+
* - Captured stdout is parsed for env vars; stderr is forwarded to process.stderr
|
|
4448
|
+
* - Non-zero exit throws an Error with the command and exit code
|
|
4449
|
+
* - Keys already set in process.env are NOT overwritten
|
|
4450
|
+
*
|
|
4451
|
+
* @param command Shell command string to execute
|
|
4452
|
+
*/
|
|
4453
|
+
declare function runBeforeSessionHook(command: string): void;
|
|
4454
|
+
|
|
4349
4455
|
/**
|
|
4350
4456
|
* Core types for the transcript import pipeline.
|
|
4351
4457
|
*
|
|
@@ -4663,4 +4769,4 @@ type AgentKernel = {
|
|
|
4663
4769
|
};
|
|
4664
4770
|
declare function createAgentKernel(): AgentKernel;
|
|
4665
4771
|
|
|
4666
|
-
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type BenchmarkEntry, type BenchmarkRegistry, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildGraderResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeGrader, type CodeGraderConfig, type CodeGraderOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeGrader, type CompositeGraderConfig, type CompositeGraderOptions, type ConfidenceIntervalAggregation, type ContainsAllGraderConfig, type ContainsAnyGraderConfig, type ContainsGraderConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type ConversationAggregation, type ConversationMode, type ConversationTurn, type ConversationTurnInput, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostGrader, type CostGraderConfig, type CostGraderOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_GRADER_TEMPLATE, DEFAULT_THRESHOLD, type DependencyFailurePolicy, type DependencyResult, type DepsScanResult, DeterministicAssertionGrader, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithGraderConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsGraderConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type 
EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTargetRef, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsGrader, type ExecutionMetricsGraderConfig, type ExecutionMetricsGraderOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyGrader, type FieldAccuracyGraderConfig, type FieldAccuracyGraderOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type Grader, type GraderConfig, type GraderDispatchContext, type GraderFactory, type GraderFactoryFn, type GraderKind, GraderRegistry, type GraderResult, type IcontainsAllGraderConfig, type IcontainsAnyGraderConfig, type IcontainsGraderConfig, type InlineAssertEvaluatorConfig, type IsJsonGraderConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyGrader, type LatencyGraderConfig, type LatencyGraderOptions, LlmGrader, type LlmGraderConfig, type LlmGraderOptions, type LlmGraderPromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type PreparedResultsRepoBranch, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexGraderConfig, type 
RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, RunBudgetTracker, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerGrader, type SkillTriggerGraderConfig, type StartsWithGraderConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TargetHooksConfig, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageGrader, type TokenUsageGraderConfig, type TokenUsageGraderOptions, type ToolCall, type ToolTestMessage, type ToolTrajectoryExpectedItem, ToolTrajectoryGrader, type ToolTrajectoryGraderConfig, type ToolTrajectoryGraderOptions, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptReplayEntry, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type TsEvalResult, type TurnFailurePolicy, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addBenchmark, assembleLlmGraderPrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, 
createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveBenchmarkId, deriveCategory, detectFormat, directorySizeBytes, discoverAssertions, discoverBenchmarks, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetRefsFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, formatToolCalls, freeformEvaluationSchema, generateRubrics, getAgentvConfigDir, getAgentvHome, getBenchmark, getBenchmarksRegistryPath, getOutputFilenames, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, groupTranscriptJsonLines, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isGraderKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadBenchmarkRegistry, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, loadTsEvalFile, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseJsonFromText, parseJsonSafe, prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeBenchmark, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, 
resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveBenchmarkRegistry, scanRepoDeps, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLines, tokensPerTool, touchBenchmark, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
|
|
4772
|
+
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type BenchmarkEntry, type BenchmarkRegistry, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildGraderResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeGrader, type CodeGraderConfig, type CodeGraderOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeGrader, type CompositeGraderConfig, type CompositeGraderOptions, type ConfidenceIntervalAggregation, type ContainsAllGraderConfig, type ContainsAnyGraderConfig, type ContainsGraderConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type ConversationAggregation, type ConversationMode, type ConversationTurn, type ConversationTurnInput, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostGrader, type CostGraderConfig, type CostGraderOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_GRADER_TEMPLATE, DEFAULT_THRESHOLD, type DependencyFailurePolicy, type DependencyResult, type DepsScanResult, DeterministicAssertionGrader, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithGraderConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsGraderConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type 
EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTargetRef, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsGrader, type ExecutionMetricsGraderConfig, type ExecutionMetricsGraderOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyGrader, type FieldAccuracyGraderConfig, type FieldAccuracyGraderOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type Grader, type GraderConfig, type GraderDispatchContext, type GraderFactory, type GraderFactoryFn, type GraderKind, GraderRegistry, type GraderResult, type IcontainsAllGraderConfig, type IcontainsAnyGraderConfig, type IcontainsGraderConfig, type InlineAssertEvaluatorConfig, type IsJsonGraderConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyGrader, type LatencyGraderConfig, type LatencyGraderOptions, LlmGrader, type LlmGraderConfig, type LlmGraderOptions, type LlmGraderPromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type PreparedResultsRepoBranch, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexGraderConfig, type 
RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, RunBudgetTracker, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerGrader, type SkillTriggerGraderConfig, type StartsWithGraderConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TargetHooksConfig, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageGrader, type TokenUsageGraderConfig, type TokenUsageGraderOptions, type ToolCall, type ToolTestMessage, type ToolTrajectoryExpectedItem, ToolTrajectoryGrader, type ToolTrajectoryGraderConfig, type ToolTrajectoryGraderOptions, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptReplayEntry, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type TsEvalResult, type TurnFailurePolicy, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addBenchmark, assembleLlmGraderPrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, 
createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveBenchmarkId, deriveCategory, detectFormat, directorySizeBytes, discoverAssertions, discoverBenchmarks, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetRefsFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, formatToolCalls, freeformEvaluationSchema, generateRubrics, getAgentvConfigDir, getAgentvHome, getBenchmark, getBenchmarksRegistryPath, getOutputFilenames, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, groupTranscriptJsonLines, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isGraderKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadBenchmarkRegistry, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, loadTsEvalFile, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseEnvOutput, parseJsonFromText, parseJsonSafe, prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeBenchmark, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, 
resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runBeforeSessionHook, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveBenchmarkRegistry, scanRepoDeps, scoreRangeEvaluationSchema, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLines, tokensPerTool, touchBenchmark, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
|
package/dist/index.js
CHANGED
|
@@ -118,6 +118,7 @@ import {
|
|
|
118
118
|
runIsJsonAssertion,
|
|
119
119
|
runRegexAssertion,
|
|
120
120
|
runStartsWithAssertion,
|
|
121
|
+
scoreRangeEvaluationSchema,
|
|
121
122
|
scoreToVerdict,
|
|
122
123
|
subscribeToClaudeLogEntries,
|
|
123
124
|
subscribeToCodexLogEntries,
|
|
@@ -128,7 +129,7 @@ import {
|
|
|
128
129
|
toCamelCaseDeep,
|
|
129
130
|
toSnakeCaseDeep,
|
|
130
131
|
tokensPerTool
|
|
131
|
-
} from "./chunk-
|
|
132
|
+
} from "./chunk-B3BLJRYI.js";
|
|
132
133
|
import {
|
|
133
134
|
COMMON_TARGET_SETTINGS,
|
|
134
135
|
TEST_MESSAGE_ROLES,
|
|
@@ -457,6 +458,17 @@ var AgentVConfigSchema = z.object({
|
|
|
457
458
|
maxCostUsd: z.number().min(0).optional(),
|
|
458
459
|
/** Maximum duration per run in milliseconds */
|
|
459
460
|
maxDurationMs: z.number().int().min(0).optional()
|
|
461
|
+
}).optional(),
|
|
462
|
+
/** Lifecycle hooks */
|
|
463
|
+
hooks: z.object({
|
|
464
|
+
/**
|
|
465
|
+
* Shell command to run once at agentv startup, before any command executes.
|
|
466
|
+
* stdout is parsed for env var exports (`KEY=value` or `export KEY="value"`)
|
|
467
|
+
* and injected into process.env. Keys already set in the environment are
|
|
468
|
+
* not overwritten — existing env always takes priority.
|
|
469
|
+
* stderr is forwarded to the user. Non-zero exit aborts with an error.
|
|
470
|
+
*/
|
|
471
|
+
beforeSession: z.string().optional()
|
|
460
472
|
}).optional()
|
|
461
473
|
});
|
|
462
474
|
function defineConfig(config) {
|
|
@@ -1709,6 +1721,62 @@ var RunBudgetTracker = class {
|
|
|
1709
1721
|
}
|
|
1710
1722
|
};
|
|
1711
1723
|
|
|
1724
|
+
// src/evaluation/hooks.ts
|
|
1725
|
+
import { spawnSync } from "node:child_process";
|
|
1726
|
+
var ANSI_YELLOW = "\x1B[33m";
|
|
1727
|
+
var ANSI_RESET = "\x1B[0m";
|
|
1728
|
+
/**
 * Parse environment variable assignments from hook stdout.
 *
 * Accepted line shapes (leading/trailing whitespace is ignored):
 *   export KEY="value"  -> { KEY: "value" }
 *   export KEY=value    -> { KEY: "value" }
 *   KEY=value           -> { KEY: "value" }
 *
 * Blank lines, lines starting with `#`, and lines that do not match the
 * assignment pattern are ignored. A value is unquoted only when it is wrapped
 * in a matching pair of single or double quotes. When a key appears more than
 * once, the last assignment wins.
 *
 * @param stdout Raw text captured from the hook's standard output.
 * @returns A plain object mapping each parsed key to its string value.
 */
function parseEnvOutput(stdout) {
  const assignment = /^(?:export\s+)?([A-Za-z_][A-Za-z0-9_]*)=(.*)$/;
  const pairs = [];
  for (const line of stdout.split("\n")) {
    // trim() also removes the trailing "\r" of CRLF output.
    const trimmed = line.trim();
    if (!trimmed || trimmed.startsWith("#")) continue;
    const match = assignment.exec(trimmed);
    if (!match) continue;
    const key = match[1];
    let value = match[2];
    const quote = value.charAt(0);
    // Require at least two characters so that a lone quote character is kept
    // verbatim instead of being treated as an (empty) quoted string.
    if (value.length >= 2 && (quote === '"' || quote === "'") && value.endsWith(quote)) {
      value = value.slice(1, -1);
    }
    pairs.push([key, value]);
  }
  // Object.fromEntries defines own data properties, so a key such as
  // `__proto__` is stored like any other key rather than hitting the
  // prototype setter and being silently discarded.
  return Object.fromEntries(pairs);
}
|
|
1746
|
+
/**
 * Run the before_session hook command and inject its exported env vars into
 * `process.env`.
 *
 * The command runs synchronously through the platform shell with stdin
 * ignored. stdout is captured and parsed with `parseEnvOutput`; stderr is
 * captured and written to `process.stderr` after the command has finished.
 * Parsed keys are only assigned when `process.env[key]` is currently
 * undefined, so values already in the environment take priority.
 *
 * @param command Shell command string to execute.
 * @throws {Error} If the command cannot be started, is terminated by a
 *   signal, or exits with a non-zero code.
 */
function runBeforeSessionHook(command) {
  console.log(`${ANSI_YELLOW}Running before_session hook: ${command}${ANSI_RESET}`);
  // `shell: true` lets Node choose the shell (`/bin/sh -c` on POSIX, ComSpec /
  // cmd.exe on Windows) and pass the command line verbatim. Passing the command
  // as an argv element to `cmd /c` makes Node re-quote it with C-runtime
  // escaping, which cmd.exe does not understand when the command has quotes.
  const result = spawnSync(command, {
    encoding: "utf8",
    shell: true,
    // Do not inherit stdio — capture stdout for parsing, forward stderr manually
    stdio: ["ignore", "pipe", "pipe"]
  });
  if (result.stderr) {
    process.stderr.write(result.stderr);
  }
  if (result.error) {
    throw new Error(`before_session hook failed to start: ${result.error.message}`);
  }
  if (result.status === null && result.signal) {
    // A null status means the process did not exit on its own; name the signal
    // rather than reporting an "unknown" exit code.
    throw new Error(`before_session hook was terminated by signal ${result.signal}: ${command}`);
  }
  if (result.status !== 0) {
    throw new Error(
      `before_session hook exited with code ${result.status ?? "unknown"}: ${command}`
    );
  }
  const vars = parseEnvOutput(result.stdout ?? "");
  let injected = 0;
  for (const [key, value] of Object.entries(vars)) {
    // Existing environment always wins over hook output.
    if (process.env[key] === void 0) {
      process.env[key] = value;
      injected++;
    }
  }
  if (injected > 0) {
    console.log(`before_session hook injected ${injected} environment variable(s).`);
  }
}
|
|
1779
|
+
|
|
1712
1780
|
// src/import/claude-parser.ts
|
|
1713
1781
|
var SKIPPED_TYPES = /* @__PURE__ */ new Set(["progress", "system", "file-history-snapshot"]);
|
|
1714
1782
|
function parseClaudeSession(jsonl) {
|
|
@@ -2476,6 +2544,7 @@ export {
|
|
|
2476
2544
|
parseClaudeSession,
|
|
2477
2545
|
parseCodexSession,
|
|
2478
2546
|
parseCopilotEvents,
|
|
2547
|
+
parseEnvOutput,
|
|
2479
2548
|
parseJsonFromText,
|
|
2480
2549
|
parseJsonSafe,
|
|
2481
2550
|
prepareResultsRepoBranch,
|
|
@@ -2495,6 +2564,7 @@ export {
|
|
|
2495
2564
|
resolveTargetDefinition,
|
|
2496
2565
|
resolveWorkspaceTemplate,
|
|
2497
2566
|
rubricEvaluationSchema,
|
|
2567
|
+
runBeforeSessionHook,
|
|
2498
2568
|
runContainsAllAssertion,
|
|
2499
2569
|
runContainsAnyAssertion,
|
|
2500
2570
|
runContainsAssertion,
|
|
@@ -2510,6 +2580,7 @@ export {
|
|
|
2510
2580
|
runStartsWithAssertion,
|
|
2511
2581
|
saveBenchmarkRegistry,
|
|
2512
2582
|
scanRepoDeps,
|
|
2583
|
+
scoreRangeEvaluationSchema,
|
|
2513
2584
|
scoreToVerdict,
|
|
2514
2585
|
shouldEnableCache,
|
|
2515
2586
|
shouldSkipCacheForTemperature,
|