@agentv/core 4.21.0 → 4.22.0-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -1912,6 +1912,10 @@ type ResultsExportConfig = {
1912
1912
  readonly auto_push?: boolean;
1913
1913
  readonly branch_prefix?: string;
1914
1914
  };
1915
+ type HooksConfig = {
1916
+ /** Shell command to run once at agentv startup. stdout is parsed for env var exports. */
1917
+ readonly before_session?: string;
1918
+ };
1915
1919
  type AgentVConfig$1 = {
1916
1920
  readonly required_version?: string;
1917
1921
  readonly eval_patterns?: readonly string[];
@@ -1919,6 +1923,7 @@ type AgentVConfig$1 = {
1919
1923
  readonly results?: {
1920
1924
  readonly export?: ResultsExportConfig;
1921
1925
  };
1926
+ readonly hooks?: HooksConfig;
1922
1927
  };
1923
1928
  /**
1924
1929
  * Load optional .agentv/config.yaml configuration file.
@@ -2937,6 +2942,36 @@ declare const rubricEvaluationSchema: z.ZodObject<{
2937
2942
  }[];
2938
2943
  overall_reasoning: string;
2939
2944
  }>;
2945
+ declare const scoreRangeEvaluationSchema: z.ZodObject<{
2946
+ checks: z.ZodArray<z.ZodObject<{
2947
+ id: z.ZodString;
2948
+ score: z.ZodNumber;
2949
+ reasoning: z.ZodOptional<z.ZodString>;
2950
+ }, "strip", z.ZodTypeAny, {
2951
+ id: string;
2952
+ score: number;
2953
+ reasoning?: string | undefined;
2954
+ }, {
2955
+ id: string;
2956
+ score: number;
2957
+ reasoning?: string | undefined;
2958
+ }>, "many">;
2959
+ overall_reasoning: z.ZodOptional<z.ZodString>;
2960
+ }, "strip", z.ZodTypeAny, {
2961
+ checks: {
2962
+ id: string;
2963
+ score: number;
2964
+ reasoning?: string | undefined;
2965
+ }[];
2966
+ overall_reasoning?: string | undefined;
2967
+ }, {
2968
+ checks: {
2969
+ id: string;
2970
+ score: number;
2971
+ reasoning?: string | undefined;
2972
+ }[];
2973
+ overall_reasoning?: string | undefined;
2974
+ }>;
2940
2975
 
2941
2976
  declare class LlmGrader implements Grader {
2942
2977
  readonly kind = "llm-grader";
@@ -3537,6 +3572,21 @@ declare const AgentVConfigSchema: z.ZodObject<{
3537
3572
  maxDurationMs?: number | undefined;
3538
3573
  maxCostUsd?: number | undefined;
3539
3574
  }>>;
3575
+ /** Lifecycle hooks */
3576
+ hooks: z.ZodOptional<z.ZodObject<{
3577
+ /**
3578
+ * Shell command to run once at agentv startup, before any command executes.
3579
+ * stdout is parsed for env var exports (`KEY=value` or `export KEY="value"`)
3580
+ * and injected into process.env. Keys already set in the environment are
3581
+ * not overwritten — existing env always takes priority.
3582
+ * stderr is forwarded to the user. Non-zero exit aborts with an error.
3583
+ */
3584
+ beforeSession: z.ZodOptional<z.ZodString>;
3585
+ }, "strip", z.ZodTypeAny, {
3586
+ beforeSession?: string | undefined;
3587
+ }, {
3588
+ beforeSession?: string | undefined;
3589
+ }>>;
3540
3590
  }, "strip", z.ZodTypeAny, {
3541
3591
  execution?: {
3542
3592
  workers?: number | undefined;
@@ -3546,6 +3596,9 @@ declare const AgentVConfigSchema: z.ZodObject<{
3546
3596
  keepWorkspaces?: boolean | undefined;
3547
3597
  otelFile?: string | undefined;
3548
3598
  } | undefined;
3599
+ hooks?: {
3600
+ beforeSession?: string | undefined;
3601
+ } | undefined;
3549
3602
  cache?: {
3550
3603
  enabled?: boolean | undefined;
3551
3604
  path?: string | undefined;
@@ -3567,6 +3620,9 @@ declare const AgentVConfigSchema: z.ZodObject<{
3567
3620
  keepWorkspaces?: boolean | undefined;
3568
3621
  otelFile?: string | undefined;
3569
3622
  } | undefined;
3623
+ hooks?: {
3624
+ beforeSession?: string | undefined;
3625
+ } | undefined;
3570
3626
  cache?: {
3571
3627
  enabled?: boolean | undefined;
3572
3628
  path?: string | undefined;
@@ -4346,6 +4402,56 @@ declare function discoverAssertions(registry: GraderRegistry, baseDir: string):
4346
4402
  */
4347
4403
  declare function discoverGraders(registry: GraderRegistry, baseDir: string): Promise<string[]>;
4348
4404
 
4405
+ /**
4406
+ * Session hook execution for AgentV.
4407
+ *
4408
+ * Runs a shell command once at agentv startup and injects exported environment
4409
+ * variables into the current process. This lets projects fetch secrets at
4410
+ * runtime (e.g. from a vault) without needing a wrapper script.
4411
+ *
4412
+ * ## How it works
4413
+ *
4414
+ * 1. The command is run via `sh -c` (or `cmd /c` on Windows).
4415
+ * 2. stdout is captured and parsed for env var exports.
4416
+ * 3. stderr is forwarded to the process stderr so the user sees output.
4417
+ * 4. Non-zero exit aborts with a clear error.
4418
+ * 5. Parsed keys are injected into `process.env` — only for keys not already
4419
+ * set, so existing env always wins.
4420
+ *
4421
+ * ## Supported output formats
4422
+ *
4423
+ * Both shell-export and dotenv formats are accepted:
4424
+ * export KEY="value" (shell export — quotes optional)
4425
+ * KEY=value (dotenv — no export prefix)
4426
+ *
4427
+ * Lines that don't match either pattern are silently ignored.
4428
+ *
4429
+ * @module
4430
+ */
4431
+ /**
4432
+ * Parse env var lines from hook stdout.
4433
+ *
4434
+ * Accepts:
4435
+ * export KEY="value" → { KEY: "value" }
4436
+ * export KEY=value → { KEY: "value" }
4437
+ * KEY=value → { KEY: "value" }
4438
+ *
4439
+ * Strips surrounding single or double quotes from values.
4440
+ * Skips lines with empty keys or values that look like shell syntax.
4441
+ */
4442
+ declare function parseEnvOutput(stdout: string): Record<string, string>;
4443
+ /**
4444
+ * Run the before_session hook command and inject exported env vars into process.env.
4445
+ *
4446
+ * - Runs via shell (`sh -c` on POSIX, `cmd /c` on Windows)
4447
+ * - Captured stdout is parsed for env vars; stderr is forwarded to process.stderr
4448
+ * - Non-zero exit throws an Error with the command and exit code
4449
+ * - Keys already set in process.env are NOT overwritten
4450
+ *
4451
+ * @param command Shell command string to execute
4452
+ */
4453
+ declare function runBeforeSessionHook(command: string): void;
4454
+
4349
4455
  /**
4350
4456
  * Core types for the transcript import pipeline.
4351
4457
  *
@@ -4663,4 +4769,4 @@ type AgentKernel = {
4663
4769
  };
4664
4770
  declare function createAgentKernel(): AgentKernel;
4665
4771
 
4666
- export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type BenchmarkEntry, type BenchmarkRegistry, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildGraderResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeGrader, type CodeGraderConfig, type CodeGraderOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeGrader, type CompositeGraderConfig, type CompositeGraderOptions, type ConfidenceIntervalAggregation, type ContainsAllGraderConfig, type ContainsAnyGraderConfig, type ContainsGraderConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type ConversationAggregation, type ConversationMode, type ConversationTurn, type ConversationTurnInput, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostGrader, type CostGraderConfig, type CostGraderOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_GRADER_TEMPLATE, DEFAULT_THRESHOLD, type DependencyFailurePolicy, type DependencyResult, type DepsScanResult, DeterministicAssertionGrader, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithGraderConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsGraderConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTargetRef, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsGrader, type ExecutionMetricsGraderConfig, type ExecutionMetricsGraderOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyGrader, type FieldAccuracyGraderConfig, type FieldAccuracyGraderOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type Grader, type GraderConfig, type GraderDispatchContext, type GraderFactory, type GraderFactoryFn, type GraderKind, GraderRegistry, type GraderResult, type IcontainsAllGraderConfig, type IcontainsAnyGraderConfig, type IcontainsGraderConfig, type InlineAssertEvaluatorConfig, type IsJsonGraderConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyGrader, type LatencyGraderConfig, type LatencyGraderOptions, LlmGrader, type LlmGraderConfig, type LlmGraderOptions, type LlmGraderPromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type PreparedResultsRepoBranch, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexGraderConfig, type RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, RunBudgetTracker, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerGrader, type SkillTriggerGraderConfig, type StartsWithGraderConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TargetHooksConfig, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageGrader, type TokenUsageGraderConfig, type TokenUsageGraderOptions, type ToolCall, type ToolTestMessage, type ToolTrajectoryExpectedItem, ToolTrajectoryGrader, type ToolTrajectoryGraderConfig, type ToolTrajectoryGraderOptions, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptReplayEntry, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type TsEvalResult, type TurnFailurePolicy, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addBenchmark, assembleLlmGraderPrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveBenchmarkId, deriveCategory, detectFormat, directorySizeBytes, discoverAssertions, discoverBenchmarks, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetRefsFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, formatToolCalls, freeformEvaluationSchema, generateRubrics, getAgentvConfigDir, getAgentvHome, getBenchmark, getBenchmarksRegistryPath, getOutputFilenames, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, groupTranscriptJsonLines, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isGraderKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadBenchmarkRegistry, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, loadTsEvalFile, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseJsonFromText, parseJsonSafe, prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeBenchmark, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveBenchmarkRegistry, scanRepoDeps, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLines, tokensPerTool, touchBenchmark, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
4772
+ export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type BenchmarkEntry, type BenchmarkRegistry, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildGraderResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeGrader, type CodeGraderConfig, type CodeGraderOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeGrader, type CompositeGraderConfig, type CompositeGraderOptions, type ConfidenceIntervalAggregation, type ContainsAllGraderConfig, type ContainsAnyGraderConfig, type ContainsGraderConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type ConversationAggregation, type ConversationMode, type ConversationTurn, type ConversationTurnInput, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostGrader, type CostGraderConfig, type CostGraderOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_GRADER_TEMPLATE, DEFAULT_THRESHOLD, type DependencyFailurePolicy, type DependencyResult, type DepsScanResult, DeterministicAssertionGrader, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithGraderConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsGraderConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTargetRef, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsGrader, type ExecutionMetricsGraderConfig, type ExecutionMetricsGraderOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyGrader, type FieldAccuracyGraderConfig, type FieldAccuracyGraderOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type Grader, type GraderConfig, type GraderDispatchContext, type GraderFactory, type GraderFactoryFn, type GraderKind, GraderRegistry, type GraderResult, type IcontainsAllGraderConfig, type IcontainsAnyGraderConfig, type IcontainsGraderConfig, type InlineAssertEvaluatorConfig, type IsJsonGraderConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyGrader, type LatencyGraderConfig, type LatencyGraderOptions, LlmGrader, type LlmGraderConfig, type LlmGraderOptions, type LlmGraderPromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type PreparedResultsRepoBranch, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexGraderConfig, type RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, RunBudgetTracker, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerGrader, type SkillTriggerGraderConfig, type StartsWithGraderConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TargetHooksConfig, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageGrader, type TokenUsageGraderConfig, type TokenUsageGraderOptions, type ToolCall, type ToolTestMessage, type ToolTrajectoryExpectedItem, ToolTrajectoryGrader, type ToolTrajectoryGraderConfig, type ToolTrajectoryGraderOptions, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptReplayEntry, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type TsEvalResult, type TurnFailurePolicy, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addBenchmark, assembleLlmGraderPrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveBenchmarkId, deriveCategory, detectFormat, directorySizeBytes, discoverAssertions, discoverBenchmarks, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetRefsFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, formatToolCalls, freeformEvaluationSchema, generateRubrics, getAgentvConfigDir, getAgentvHome, getBenchmark, getBenchmarksRegistryPath, getOutputFilenames, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, groupTranscriptJsonLines, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isGraderKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadBenchmarkRegistry, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, loadTsEvalFile, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseEnvOutput, parseJsonFromText, parseJsonSafe, prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeBenchmark, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runBeforeSessionHook, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveBenchmarkRegistry, scanRepoDeps, scoreRangeEvaluationSchema, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLines, tokensPerTool, touchBenchmark, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
package/dist/index.d.ts CHANGED
@@ -1912,6 +1912,10 @@ type ResultsExportConfig = {
1912
1912
  readonly auto_push?: boolean;
1913
1913
  readonly branch_prefix?: string;
1914
1914
  };
1915
+ type HooksConfig = {
1916
+ /** Shell command to run once at agentv startup. stdout is parsed for env var exports. */
1917
+ readonly before_session?: string;
1918
+ };
1915
1919
  type AgentVConfig$1 = {
1916
1920
  readonly required_version?: string;
1917
1921
  readonly eval_patterns?: readonly string[];
@@ -1919,6 +1923,7 @@ type AgentVConfig$1 = {
1919
1923
  readonly results?: {
1920
1924
  readonly export?: ResultsExportConfig;
1921
1925
  };
1926
+ readonly hooks?: HooksConfig;
1922
1927
  };
1923
1928
  /**
1924
1929
  * Load optional .agentv/config.yaml configuration file.
@@ -2937,6 +2942,36 @@ declare const rubricEvaluationSchema: z.ZodObject<{
2937
2942
  }[];
2938
2943
  overall_reasoning: string;
2939
2944
  }>;
2945
+ declare const scoreRangeEvaluationSchema: z.ZodObject<{
2946
+ checks: z.ZodArray<z.ZodObject<{
2947
+ id: z.ZodString;
2948
+ score: z.ZodNumber;
2949
+ reasoning: z.ZodOptional<z.ZodString>;
2950
+ }, "strip", z.ZodTypeAny, {
2951
+ id: string;
2952
+ score: number;
2953
+ reasoning?: string | undefined;
2954
+ }, {
2955
+ id: string;
2956
+ score: number;
2957
+ reasoning?: string | undefined;
2958
+ }>, "many">;
2959
+ overall_reasoning: z.ZodOptional<z.ZodString>;
2960
+ }, "strip", z.ZodTypeAny, {
2961
+ checks: {
2962
+ id: string;
2963
+ score: number;
2964
+ reasoning?: string | undefined;
2965
+ }[];
2966
+ overall_reasoning?: string | undefined;
2967
+ }, {
2968
+ checks: {
2969
+ id: string;
2970
+ score: number;
2971
+ reasoning?: string | undefined;
2972
+ }[];
2973
+ overall_reasoning?: string | undefined;
2974
+ }>;
2940
2975
 
2941
2976
  declare class LlmGrader implements Grader {
2942
2977
  readonly kind = "llm-grader";
@@ -3537,6 +3572,21 @@ declare const AgentVConfigSchema: z.ZodObject<{
3537
3572
  maxDurationMs?: number | undefined;
3538
3573
  maxCostUsd?: number | undefined;
3539
3574
  }>>;
3575
+ /** Lifecycle hooks */
3576
+ hooks: z.ZodOptional<z.ZodObject<{
3577
+ /**
3578
+ * Shell command to run once at agentv startup, before any command executes.
3579
+ * stdout is parsed for env var exports (`KEY=value` or `export KEY="value"`)
3580
+ * and injected into process.env. Keys already set in the environment are
3581
+ * not overwritten — existing env always takes priority.
3582
+ * stderr is forwarded to the user. Non-zero exit aborts with an error.
3583
+ */
3584
+ beforeSession: z.ZodOptional<z.ZodString>;
3585
+ }, "strip", z.ZodTypeAny, {
3586
+ beforeSession?: string | undefined;
3587
+ }, {
3588
+ beforeSession?: string | undefined;
3589
+ }>>;
3540
3590
  }, "strip", z.ZodTypeAny, {
3541
3591
  execution?: {
3542
3592
  workers?: number | undefined;
@@ -3546,6 +3596,9 @@ declare const AgentVConfigSchema: z.ZodObject<{
3546
3596
  keepWorkspaces?: boolean | undefined;
3547
3597
  otelFile?: string | undefined;
3548
3598
  } | undefined;
3599
+ hooks?: {
3600
+ beforeSession?: string | undefined;
3601
+ } | undefined;
3549
3602
  cache?: {
3550
3603
  enabled?: boolean | undefined;
3551
3604
  path?: string | undefined;
@@ -3567,6 +3620,9 @@ declare const AgentVConfigSchema: z.ZodObject<{
3567
3620
  keepWorkspaces?: boolean | undefined;
3568
3621
  otelFile?: string | undefined;
3569
3622
  } | undefined;
3623
+ hooks?: {
3624
+ beforeSession?: string | undefined;
3625
+ } | undefined;
3570
3626
  cache?: {
3571
3627
  enabled?: boolean | undefined;
3572
3628
  path?: string | undefined;
@@ -4346,6 +4402,56 @@ declare function discoverAssertions(registry: GraderRegistry, baseDir: string):
4346
4402
  */
4347
4403
  declare function discoverGraders(registry: GraderRegistry, baseDir: string): Promise<string[]>;
4348
4404
 
4405
+ /**
4406
+ * Session hook execution for AgentV.
4407
+ *
4408
+ * Runs a shell command once at agentv startup and injects exported environment
4409
+ * variables into the current process. This lets projects fetch secrets at
4410
+ * runtime (e.g. from a vault) without needing a wrapper script.
4411
+ *
4412
+ * ## How it works
4413
+ *
4414
+ * 1. The command is run via `sh -c` (or `cmd /c` on Windows).
4415
+ * 2. stdout is captured and parsed for env var exports.
4416
+ * 3. stderr is forwarded to the process stderr so the user sees output.
4417
+ * 4. Non-zero exit aborts with a clear error.
4418
+ * 5. Parsed keys are injected into `process.env` — only for keys not already
4419
+ * set, so existing env always wins.
4420
+ *
4421
+ * ## Supported output formats
4422
+ *
4423
+ * Both shell-export and dotenv formats are accepted:
4424
+ * export KEY="value" (shell export — quotes optional)
4425
+ * KEY=value (dotenv — no export prefix)
4426
+ *
4427
+ * Lines that don't match either pattern are silently ignored.
4428
+ *
4429
+ * @module
4430
+ */
4431
+ /**
4432
+ * Parse env var lines from hook stdout.
4433
+ *
4434
+ * Accepts:
4435
+ * export KEY="value" → { KEY: "value" }
4436
+ * export KEY=value → { KEY: "value" }
4437
+ * KEY=value → { KEY: "value" }
4438
+ *
4439
+ * Strips surrounding single or double quotes from values.
4440
+ * Skips lines with empty keys or values that look like shell syntax.
4441
+ */
4442
+ declare function parseEnvOutput(stdout: string): Record<string, string>;
4443
+ /**
4444
+ * Run the before_session hook command and inject exported env vars into process.env.
4445
+ *
4446
+ * - Runs via shell (`sh -c` on POSIX, `cmd /c` on Windows)
4447
+ * - Captured stdout is parsed for env vars; stderr is forwarded to process.stderr
4448
+ * - Non-zero exit throws an Error with the command and exit code
4449
+ * - Keys already set in process.env are NOT overwritten
4450
+ *
4451
+ * @param command Shell command string to execute
4452
+ */
4453
+ declare function runBeforeSessionHook(command: string): void;
4454
+
4349
4455
  /**
4350
4456
  * Core types for the transcript import pipeline.
4351
4457
  *
@@ -4663,4 +4769,4 @@ type AgentKernel = {
4663
4769
  };
4664
4770
  declare function createAgentKernel(): AgentKernel;
4665
4771
 
4666
- export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type BenchmarkEntry, type BenchmarkRegistry, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildGraderResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeGrader, type CodeGraderConfig, type CodeGraderOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeGrader, type CompositeGraderConfig, type CompositeGraderOptions, type ConfidenceIntervalAggregation, type ContainsAllGraderConfig, type ContainsAnyGraderConfig, type ContainsGraderConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type ConversationAggregation, type ConversationMode, type ConversationTurn, type ConversationTurnInput, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostGrader, type CostGraderConfig, type CostGraderOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_GRADER_TEMPLATE, DEFAULT_THRESHOLD, type DependencyFailurePolicy, type DependencyResult, type DepsScanResult, DeterministicAssertionGrader, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithGraderConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsGraderConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTargetRef, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsGrader, type ExecutionMetricsGraderConfig, type ExecutionMetricsGraderOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyGrader, type FieldAccuracyGraderConfig, type FieldAccuracyGraderOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type Grader, type GraderConfig, type GraderDispatchContext, type GraderFactory, type GraderFactoryFn, type GraderKind, GraderRegistry, type GraderResult, type IcontainsAllGraderConfig, type IcontainsAnyGraderConfig, type IcontainsGraderConfig, type InlineAssertEvaluatorConfig, type IsJsonGraderConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyGrader, type LatencyGraderConfig, type LatencyGraderOptions, LlmGrader, type LlmGraderConfig, type LlmGraderOptions, type LlmGraderPromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type PreparedResultsRepoBranch, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexGraderConfig, type RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, RunBudgetTracker, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerGrader, type SkillTriggerGraderConfig, type StartsWithGraderConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TargetHooksConfig, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageGrader, type TokenUsageGraderConfig, type TokenUsageGraderOptions, type ToolCall, type ToolTestMessage, type ToolTrajectoryExpectedItem, ToolTrajectoryGrader, type ToolTrajectoryGraderConfig, type ToolTrajectoryGraderOptions, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptReplayEntry, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type TsEvalResult, type TurnFailurePolicy, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addBenchmark, assembleLlmGraderPrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveBenchmarkId, deriveCategory, detectFormat, directorySizeBytes, discoverAssertions, discoverBenchmarks, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetRefsFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, formatToolCalls, freeformEvaluationSchema, generateRubrics, getAgentvConfigDir, getAgentvHome, getBenchmark, getBenchmarksRegistryPath, getOutputFilenames, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, groupTranscriptJsonLines, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isGraderKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadBenchmarkRegistry, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, loadTsEvalFile, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseJsonFromText, parseJsonSafe, prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeBenchmark, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveBenchmarkRegistry, scanRepoDeps, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLines, tokensPerTool, touchBenchmark, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
4772
+ export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type BenchmarkEntry, type BenchmarkRegistry, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildGraderResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeGrader, type CodeGraderConfig, type CodeGraderOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeGrader, type CompositeGraderConfig, type CompositeGraderOptions, type ConfidenceIntervalAggregation, type ContainsAllGraderConfig, type ContainsAnyGraderConfig, type ContainsGraderConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type ConversationAggregation, type ConversationMode, type ConversationTurn, type ConversationTurnInput, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostGrader, type CostGraderConfig, type CostGraderOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_GRADER_TEMPLATE, DEFAULT_THRESHOLD, type DependencyFailurePolicy, type DependencyResult, type DepsScanResult, DeterministicAssertionGrader, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithGraderConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsGraderConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTargetRef, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsGrader, type ExecutionMetricsGraderConfig, type ExecutionMetricsGraderOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyGrader, type FieldAccuracyGraderConfig, type FieldAccuracyGraderOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type Grader, type GraderConfig, type GraderDispatchContext, type GraderFactory, type GraderFactoryFn, type GraderKind, GraderRegistry, type GraderResult, type IcontainsAllGraderConfig, type IcontainsAnyGraderConfig, type IcontainsGraderConfig, type InlineAssertEvaluatorConfig, type IsJsonGraderConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyGrader, type LatencyGraderConfig, type LatencyGraderOptions, LlmGrader, type LlmGraderConfig, type LlmGraderOptions, type LlmGraderPromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type PreparedResultsRepoBranch, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexGraderConfig, type RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, RunBudgetTracker, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerGrader, type SkillTriggerGraderConfig, type StartsWithGraderConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TargetHooksConfig, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageGrader, type TokenUsageGraderConfig, type TokenUsageGraderOptions, type ToolCall, type ToolTestMessage, type ToolTrajectoryExpectedItem, ToolTrajectoryGrader, type ToolTrajectoryGraderConfig, type ToolTrajectoryGraderOptions, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptReplayEntry, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type TsEvalResult, type TurnFailurePolicy, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addBenchmark, assembleLlmGraderPrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveBenchmarkId, deriveCategory, detectFormat, directorySizeBytes, discoverAssertions, discoverBenchmarks, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetRefsFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, formatToolCalls, freeformEvaluationSchema, generateRubrics, getAgentvConfigDir, getAgentvHome, getBenchmark, getBenchmarksRegistryPath, getOutputFilenames, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, groupTranscriptJsonLines, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isGraderKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadBenchmarkRegistry, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, loadTsEvalFile, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseEnvOutput, parseJsonFromText, parseJsonSafe, prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeBenchmark, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runBeforeSessionHook, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveBenchmarkRegistry, scanRepoDeps, scoreRangeEvaluationSchema, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLines, tokensPerTool, touchBenchmark, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
package/dist/index.js CHANGED
@@ -118,6 +118,7 @@ import {
118
118
  runIsJsonAssertion,
119
119
  runRegexAssertion,
120
120
  runStartsWithAssertion,
121
+ scoreRangeEvaluationSchema,
121
122
  scoreToVerdict,
122
123
  subscribeToClaudeLogEntries,
123
124
  subscribeToCodexLogEntries,
@@ -128,7 +129,7 @@ import {
128
129
  toCamelCaseDeep,
129
130
  toSnakeCaseDeep,
130
131
  tokensPerTool
131
- } from "./chunk-WCW3V6QJ.js";
132
+ } from "./chunk-B3BLJRYI.js";
132
133
  import {
133
134
  COMMON_TARGET_SETTINGS,
134
135
  TEST_MESSAGE_ROLES,
@@ -457,6 +458,17 @@ var AgentVConfigSchema = z.object({
457
458
  maxCostUsd: z.number().min(0).optional(),
458
459
  /** Maximum duration per run in milliseconds */
459
460
  maxDurationMs: z.number().int().min(0).optional()
461
+ }).optional(),
462
+ /** Lifecycle hooks */
463
+ hooks: z.object({
464
+ /**
465
+ * Shell command to run once at agentv startup, before any command executes.
466
+ * stdout is parsed for env var exports (`KEY=value` or `export KEY="value"`)
467
+ * and injected into process.env. Keys already set in the environment are
468
+ * not overwritten — existing env always takes priority.
469
+ * stderr is forwarded to the user. Non-zero exit aborts with an error.
470
+ */
471
+ beforeSession: z.string().optional()
460
472
  }).optional()
461
473
  });
462
474
  function defineConfig(config) {
@@ -1709,6 +1721,62 @@ var RunBudgetTracker = class {
1709
1721
  }
1710
1722
  };
1711
1723
 
1724
+ // src/evaluation/hooks.ts
1725
+ import { spawnSync } from "node:child_process";
1726
+ var ANSI_YELLOW = "\x1B[33m";
1727
+ var ANSI_RESET = "\x1B[0m";
1728
+ function parseEnvOutput(stdout) {
1729
+ const result = {};
1730
+ for (const line of stdout.split("\n")) {
1731
+ const trimmed = line.trim();
1732
+ if (!trimmed || trimmed.startsWith("#")) continue;
1733
+ const match = trimmed.match(/^(?:export\s+)?([A-Za-z_][A-Za-z0-9_]*)=(.*)$/);
1734
+ if (!match) continue;
1735
+ const key = match[1];
1736
+ let value = match[2];
1737
+ if (value.startsWith('"') && value.endsWith('"') || value.startsWith("'") && value.endsWith("'")) {
1738
+ value = value.slice(1, -1);
1739
+ }
1740
+ if (key) {
1741
+ result[key] = value;
1742
+ }
1743
+ }
1744
+ return result;
1745
+ }
1746
+ function runBeforeSessionHook(command) {
1747
+ const isWindows = process.platform === "win32";
1748
+ const shell = isWindows ? "cmd" : "sh";
1749
+ const shellFlag = isWindows ? "/c" : "-c";
1750
+ console.log(`${ANSI_YELLOW}Running before_session hook: ${command}${ANSI_RESET}`);
1751
+ const result = spawnSync(shell, [shellFlag, command], {
1752
+ encoding: "utf8",
1753
+ // Do not inherit stdio — capture stdout for parsing, forward stderr manually
1754
+ stdio: ["ignore", "pipe", "pipe"]
1755
+ });
1756
+ if (result.stderr) {
1757
+ process.stderr.write(result.stderr);
1758
+ }
1759
+ if (result.error) {
1760
+ throw new Error(`before_session hook failed to start: ${result.error.message}`);
1761
+ }
1762
+ if (result.status !== 0) {
1763
+ throw new Error(
1764
+ `before_session hook exited with code ${result.status ?? "unknown"}: ${command}`
1765
+ );
1766
+ }
1767
+ const vars = parseEnvOutput(result.stdout ?? "");
1768
+ let injected = 0;
1769
+ for (const [key, value] of Object.entries(vars)) {
1770
+ if (process.env[key] === void 0) {
1771
+ process.env[key] = value;
1772
+ injected++;
1773
+ }
1774
+ }
1775
+ if (injected > 0) {
1776
+ console.log(`before_session hook injected ${injected} environment variable(s).`);
1777
+ }
1778
+ }
1779
+
1712
1780
  // src/import/claude-parser.ts
1713
1781
  var SKIPPED_TYPES = /* @__PURE__ */ new Set(["progress", "system", "file-history-snapshot"]);
1714
1782
  function parseClaudeSession(jsonl) {
@@ -2476,6 +2544,7 @@ export {
2476
2544
  parseClaudeSession,
2477
2545
  parseCodexSession,
2478
2546
  parseCopilotEvents,
2547
+ parseEnvOutput,
2479
2548
  parseJsonFromText,
2480
2549
  parseJsonSafe,
2481
2550
  prepareResultsRepoBranch,
@@ -2495,6 +2564,7 @@ export {
2495
2564
  resolveTargetDefinition,
2496
2565
  resolveWorkspaceTemplate,
2497
2566
  rubricEvaluationSchema,
2567
+ runBeforeSessionHook,
2498
2568
  runContainsAllAssertion,
2499
2569
  runContainsAnyAssertion,
2500
2570
  runContainsAssertion,
@@ -2510,6 +2580,7 @@ export {
2510
2580
  runStartsWithAssertion,
2511
2581
  saveBenchmarkRegistry,
2512
2582
  scanRepoDeps,
2583
+ scoreRangeEvaluationSchema,
2513
2584
  scoreToVerdict,
2514
2585
  shouldEnableCache,
2515
2586
  shouldSkipCacheForTemperature,