@agentv/core 3.8.0 → 3.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -23,8 +23,6 @@ interface ProviderStreamCallbacks {
23
23
  interface ProviderRequest {
24
24
  readonly question: string;
25
25
  readonly systemPrompt?: string;
26
- readonly guidelines?: string;
27
- readonly guideline_patterns?: readonly string[];
28
26
  readonly chatPrompt?: ChatPrompt;
29
27
  readonly inputFiles?: readonly string[];
30
28
  readonly evalCaseId?: string;
@@ -1016,15 +1014,13 @@ type EvaluatorConfig = CodeEvaluatorConfig | LlmGraderEvaluatorConfig | Composit
1016
1014
  */
1017
1015
  interface EvalTest {
1018
1016
  readonly id: string;
1019
- readonly dataset?: string;
1017
+ readonly eval_set?: string;
1020
1018
  readonly conversation_id?: string;
1021
1019
  readonly question: string;
1022
1020
  readonly input: readonly TestMessage[];
1023
1021
  readonly input_segments: readonly JsonObject[];
1024
1022
  readonly expected_output: readonly JsonObject[];
1025
1023
  readonly reference_answer?: string;
1026
- readonly guideline_paths: readonly string[];
1027
- readonly guideline_patterns?: readonly string[];
1028
1024
  readonly file_paths: readonly string[];
1029
1025
  readonly criteria: string;
1030
1026
  readonly evaluator?: EvaluatorKind;
@@ -1128,7 +1124,7 @@ type FailOnError = boolean;
1128
1124
  interface EvaluationResult {
1129
1125
  readonly timestamp: string;
1130
1126
  readonly testId: string;
1131
- readonly dataset?: string;
1127
+ readonly eval_set?: string;
1132
1128
  readonly conversationId?: string;
1133
1129
  readonly score: number;
1134
1130
  readonly assertions: readonly AssertionEntry[];
@@ -1261,7 +1257,6 @@ type ExecutionDefaults = {
1261
1257
  };
1262
1258
  type AgentVConfig$1 = {
1263
1259
  readonly required_version?: string;
1264
- readonly guideline_patterns?: readonly string[];
1265
1260
  readonly eval_patterns?: readonly string[];
1266
1261
  readonly execution?: ExecutionDefaults;
1267
1262
  };
@@ -1270,10 +1265,6 @@ type AgentVConfig$1 = {
1270
1265
  * Searches from eval file directory up to repo root.
1271
1266
  */
1272
1267
  declare function loadConfig(evalFilePath: string, repoRoot: string): Promise<AgentVConfig$1 | null>;
1273
- /**
1274
- * Determine whether a path references guideline content (instructions or prompts).
1275
- */
1276
- declare function isGuidelineFile(filePath: string, patterns?: readonly string[]): boolean;
1277
1268
  /**
1278
1269
  * Extract target name from parsed eval suite (checks execution.target then falls back to root-level target).
1279
1270
  */
@@ -1284,6 +1275,10 @@ declare function extractTargetFromSuite(suite: JsonObject): string | undefined;
1284
1275
  * Returns undefined when no targets array is specified.
1285
1276
  */
1286
1277
  declare function extractTargetsFromSuite(suite: JsonObject): readonly string[] | undefined;
1278
+ /**
1279
+ * Extract workers count from suite-level execution block.
1280
+ */
1281
+ declare function extractWorkersFromSuite(suite: JsonObject): number | undefined;
1287
1282
  /**
1288
1283
  * Extract per-test targets array from a raw test case object.
1289
1284
  */
@@ -1320,16 +1315,15 @@ declare function extractFailOnError(suite: JsonObject): FailOnError | undefined;
1320
1315
  type FormattingMode = 'agent' | 'lm';
1321
1316
 
1322
1317
  /**
1323
- * Build prompt inputs by consolidating user request context and guideline content.
1318
+ * Build prompt inputs by consolidating user request context.
1324
1319
  */
1325
1320
  interface PromptInputs {
1326
1321
  readonly question: string;
1327
- readonly guidelines: string;
1328
1322
  readonly chatPrompt?: ChatPrompt;
1329
1323
  readonly systemMessage?: string;
1330
1324
  }
1331
1325
  /**
1332
- * Build prompt inputs by consolidating user request context and guideline content.
1326
+ * Build prompt inputs by consolidating user request context.
1333
1327
  *
1334
1328
  * @param testCase - The evaluation test case
1335
1329
  * @param mode - Formatting mode: 'agent' for file references, 'lm' for embedded content (default: 'lm')
@@ -1364,6 +1358,8 @@ type EvalSuiteResult = {
1364
1358
  readonly trials?: TrialsConfig;
1365
1359
  /** Suite-level targets from execution.targets (matrix evaluation) */
1366
1360
  readonly targets?: readonly string[];
1361
+ /** Suite-level workers from execution.workers */
1362
+ readonly workers?: number;
1367
1363
  /** Suite-level cache config from execution.cache */
1368
1364
  readonly cacheConfig?: CacheConfig;
1369
1365
  /** Suite-level metadata (name, description, version, etc.) */
@@ -2018,7 +2014,6 @@ interface EvaluationContext {
2018
2014
  readonly attempt: number;
2019
2015
  readonly promptInputs: {
2020
2016
  readonly question: string;
2021
- readonly guidelines: string;
2022
2017
  readonly systemMessage?: string;
2023
2018
  readonly chatPrompt?: ChatPrompt;
2024
2019
  };
@@ -3018,16 +3013,16 @@ declare const AgentVConfigSchema: z.ZodObject<{
3018
3013
  /** Write OTLP JSON trace to this path (supports {timestamp} placeholder) */
3019
3014
  otelFile: z.ZodOptional<z.ZodString>;
3020
3015
  }, "strip", z.ZodTypeAny, {
3021
- verbose?: boolean | undefined;
3022
3016
  workers?: number | undefined;
3017
+ verbose?: boolean | undefined;
3023
3018
  maxRetries?: number | undefined;
3024
3019
  agentTimeoutMs?: number | undefined;
3025
3020
  keepWorkspaces?: boolean | undefined;
3026
3021
  traceFile?: string | undefined;
3027
3022
  otelFile?: string | undefined;
3028
3023
  }, {
3029
- verbose?: boolean | undefined;
3030
3024
  workers?: number | undefined;
3025
+ verbose?: boolean | undefined;
3031
3026
  maxRetries?: number | undefined;
3032
3027
  agentTimeoutMs?: number | undefined;
3033
3028
  keepWorkspaces?: boolean | undefined;
@@ -3075,8 +3070,8 @@ declare const AgentVConfigSchema: z.ZodObject<{
3075
3070
  }>>;
3076
3071
  }, "strip", z.ZodTypeAny, {
3077
3072
  execution?: {
3078
- verbose?: boolean | undefined;
3079
3073
  workers?: number | undefined;
3074
+ verbose?: boolean | undefined;
3080
3075
  maxRetries?: number | undefined;
3081
3076
  agentTimeoutMs?: number | undefined;
3082
3077
  keepWorkspaces?: boolean | undefined;
@@ -3097,8 +3092,8 @@ declare const AgentVConfigSchema: z.ZodObject<{
3097
3092
  } | undefined;
3098
3093
  }, {
3099
3094
  execution?: {
3100
- verbose?: boolean | undefined;
3101
3095
  workers?: number | undefined;
3096
+ verbose?: boolean | undefined;
3102
3097
  maxRetries?: number | undefined;
3103
3098
  agentTimeoutMs?: number | undefined;
3104
3099
  keepWorkspaces?: boolean | undefined;
@@ -3499,7 +3494,7 @@ declare class OtelStreamingObserver {
3499
3494
  private rootCtx;
3500
3495
  constructor(tracer: Tracer, api: OtelApi, captureContent: boolean, parentCtx?: any | undefined);
3501
3496
  /** Create root eval span immediately (visible in backend right away) */
3502
- startEvalCase(testId: string, target: string, dataset?: string): void;
3497
+ startEvalCase(testId: string, target: string, evalSet?: string): void;
3503
3498
  /** Create and immediately export a tool span */
3504
3499
  onToolCall(name: string, input: unknown, output: unknown, _durationMs: number, toolCallId?: string): void;
3505
3500
  /** Create and immediately export an LLM span */
@@ -3612,4 +3607,4 @@ type AgentKernel = {
3612
3607
  };
3613
3608
  declare function createAgentKernel(): AgentKernel;
3614
3609
 
3615
- export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getSubagentsRoot, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseAgentSkillsEvals, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
3610
+ export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getSubagentsRoot, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseAgentSkillsEvals, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
package/dist/index.d.ts CHANGED
@@ -23,8 +23,6 @@ interface ProviderStreamCallbacks {
23
23
  interface ProviderRequest {
24
24
  readonly question: string;
25
25
  readonly systemPrompt?: string;
26
- readonly guidelines?: string;
27
- readonly guideline_patterns?: readonly string[];
28
26
  readonly chatPrompt?: ChatPrompt;
29
27
  readonly inputFiles?: readonly string[];
30
28
  readonly evalCaseId?: string;
@@ -1016,15 +1014,13 @@ type EvaluatorConfig = CodeEvaluatorConfig | LlmGraderEvaluatorConfig | Composit
1016
1014
  */
1017
1015
  interface EvalTest {
1018
1016
  readonly id: string;
1019
- readonly dataset?: string;
1017
+ readonly eval_set?: string;
1020
1018
  readonly conversation_id?: string;
1021
1019
  readonly question: string;
1022
1020
  readonly input: readonly TestMessage[];
1023
1021
  readonly input_segments: readonly JsonObject[];
1024
1022
  readonly expected_output: readonly JsonObject[];
1025
1023
  readonly reference_answer?: string;
1026
- readonly guideline_paths: readonly string[];
1027
- readonly guideline_patterns?: readonly string[];
1028
1024
  readonly file_paths: readonly string[];
1029
1025
  readonly criteria: string;
1030
1026
  readonly evaluator?: EvaluatorKind;
@@ -1128,7 +1124,7 @@ type FailOnError = boolean;
1128
1124
  interface EvaluationResult {
1129
1125
  readonly timestamp: string;
1130
1126
  readonly testId: string;
1131
- readonly dataset?: string;
1127
+ readonly eval_set?: string;
1132
1128
  readonly conversationId?: string;
1133
1129
  readonly score: number;
1134
1130
  readonly assertions: readonly AssertionEntry[];
@@ -1261,7 +1257,6 @@ type ExecutionDefaults = {
1261
1257
  };
1262
1258
  type AgentVConfig$1 = {
1263
1259
  readonly required_version?: string;
1264
- readonly guideline_patterns?: readonly string[];
1265
1260
  readonly eval_patterns?: readonly string[];
1266
1261
  readonly execution?: ExecutionDefaults;
1267
1262
  };
@@ -1270,10 +1265,6 @@ type AgentVConfig$1 = {
1270
1265
  * Searches from eval file directory up to repo root.
1271
1266
  */
1272
1267
  declare function loadConfig(evalFilePath: string, repoRoot: string): Promise<AgentVConfig$1 | null>;
1273
- /**
1274
- * Determine whether a path references guideline content (instructions or prompts).
1275
- */
1276
- declare function isGuidelineFile(filePath: string, patterns?: readonly string[]): boolean;
1277
1268
  /**
1278
1269
  * Extract target name from parsed eval suite (checks execution.target then falls back to root-level target).
1279
1270
  */
@@ -1284,6 +1275,10 @@ declare function extractTargetFromSuite(suite: JsonObject): string | undefined;
1284
1275
  * Returns undefined when no targets array is specified.
1285
1276
  */
1286
1277
  declare function extractTargetsFromSuite(suite: JsonObject): readonly string[] | undefined;
1278
+ /**
1279
+ * Extract workers count from suite-level execution block.
1280
+ */
1281
+ declare function extractWorkersFromSuite(suite: JsonObject): number | undefined;
1287
1282
  /**
1288
1283
  * Extract per-test targets array from a raw test case object.
1289
1284
  */
@@ -1320,16 +1315,15 @@ declare function extractFailOnError(suite: JsonObject): FailOnError | undefined;
1320
1315
  type FormattingMode = 'agent' | 'lm';
1321
1316
 
1322
1317
  /**
1323
- * Build prompt inputs by consolidating user request context and guideline content.
1318
+ * Build prompt inputs by consolidating user request context.
1324
1319
  */
1325
1320
  interface PromptInputs {
1326
1321
  readonly question: string;
1327
- readonly guidelines: string;
1328
1322
  readonly chatPrompt?: ChatPrompt;
1329
1323
  readonly systemMessage?: string;
1330
1324
  }
1331
1325
  /**
1332
- * Build prompt inputs by consolidating user request context and guideline content.
1326
+ * Build prompt inputs by consolidating user request context.
1333
1327
  *
1334
1328
  * @param testCase - The evaluation test case
1335
1329
  * @param mode - Formatting mode: 'agent' for file references, 'lm' for embedded content (default: 'lm')
@@ -1364,6 +1358,8 @@ type EvalSuiteResult = {
1364
1358
  readonly trials?: TrialsConfig;
1365
1359
  /** Suite-level targets from execution.targets (matrix evaluation) */
1366
1360
  readonly targets?: readonly string[];
1361
+ /** Suite-level workers from execution.workers */
1362
+ readonly workers?: number;
1367
1363
  /** Suite-level cache config from execution.cache */
1368
1364
  readonly cacheConfig?: CacheConfig;
1369
1365
  /** Suite-level metadata (name, description, version, etc.) */
@@ -2018,7 +2014,6 @@ interface EvaluationContext {
2018
2014
  readonly attempt: number;
2019
2015
  readonly promptInputs: {
2020
2016
  readonly question: string;
2021
- readonly guidelines: string;
2022
2017
  readonly systemMessage?: string;
2023
2018
  readonly chatPrompt?: ChatPrompt;
2024
2019
  };
@@ -3018,16 +3013,16 @@ declare const AgentVConfigSchema: z.ZodObject<{
3018
3013
  /** Write OTLP JSON trace to this path (supports {timestamp} placeholder) */
3019
3014
  otelFile: z.ZodOptional<z.ZodString>;
3020
3015
  }, "strip", z.ZodTypeAny, {
3021
- verbose?: boolean | undefined;
3022
3016
  workers?: number | undefined;
3017
+ verbose?: boolean | undefined;
3023
3018
  maxRetries?: number | undefined;
3024
3019
  agentTimeoutMs?: number | undefined;
3025
3020
  keepWorkspaces?: boolean | undefined;
3026
3021
  traceFile?: string | undefined;
3027
3022
  otelFile?: string | undefined;
3028
3023
  }, {
3029
- verbose?: boolean | undefined;
3030
3024
  workers?: number | undefined;
3025
+ verbose?: boolean | undefined;
3031
3026
  maxRetries?: number | undefined;
3032
3027
  agentTimeoutMs?: number | undefined;
3033
3028
  keepWorkspaces?: boolean | undefined;
@@ -3075,8 +3070,8 @@ declare const AgentVConfigSchema: z.ZodObject<{
3075
3070
  }>>;
3076
3071
  }, "strip", z.ZodTypeAny, {
3077
3072
  execution?: {
3078
- verbose?: boolean | undefined;
3079
3073
  workers?: number | undefined;
3074
+ verbose?: boolean | undefined;
3080
3075
  maxRetries?: number | undefined;
3081
3076
  agentTimeoutMs?: number | undefined;
3082
3077
  keepWorkspaces?: boolean | undefined;
@@ -3097,8 +3092,8 @@ declare const AgentVConfigSchema: z.ZodObject<{
3097
3092
  } | undefined;
3098
3093
  }, {
3099
3094
  execution?: {
3100
- verbose?: boolean | undefined;
3101
3095
  workers?: number | undefined;
3096
+ verbose?: boolean | undefined;
3102
3097
  maxRetries?: number | undefined;
3103
3098
  agentTimeoutMs?: number | undefined;
3104
3099
  keepWorkspaces?: boolean | undefined;
@@ -3499,7 +3494,7 @@ declare class OtelStreamingObserver {
3499
3494
  private rootCtx;
3500
3495
  constructor(tracer: Tracer, api: OtelApi, captureContent: boolean, parentCtx?: any | undefined);
3501
3496
  /** Create root eval span immediately (visible in backend right away) */
3502
- startEvalCase(testId: string, target: string, dataset?: string): void;
3497
+ startEvalCase(testId: string, target: string, evalSet?: string): void;
3503
3498
  /** Create and immediately export a tool span */
3504
3499
  onToolCall(name: string, input: unknown, output: unknown, _durationMs: number, toolCallId?: string): void;
3505
3500
  /** Create and immediately export an LLM span */
@@ -3612,4 +3607,4 @@ type AgentKernel = {
3612
3607
  };
3613
3608
  declare function createAgentKernel(): AgentKernel;
3614
3609
 
3615
- export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getSubagentsRoot, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseAgentSkillsEvals, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
3610
+ export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getSubagentsRoot, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseAgentSkillsEvals, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };