@agentv/core 4.15.1-next.1 → 4.15.2-next.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +120 -41
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +48 -14
- package/dist/index.d.ts +48 -14
- package/dist/index.js +118 -40
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.d.cts
CHANGED
|
@@ -4193,11 +4193,11 @@ declare function discoverGraders(registry: EvaluatorRegistry, baseDir: string):
|
|
|
4193
4193
|
*
|
|
4194
4194
|
* Flow:
|
|
4195
4195
|
* raw session JSONL → parser → TranscriptEntry (internal)
|
|
4196
|
-
* TranscriptEntry →
|
|
4196
|
+
* TranscriptEntry → toTranscriptJsonLines() → JSONL on disk
|
|
4197
4197
|
* JSONL on disk → readTranscriptJsonl() → TranscriptJsonLine[]
|
|
4198
4198
|
*
|
|
4199
4199
|
* To add a new importer: write a parser that returns TranscriptEntry,
|
|
4200
|
-
* then use
|
|
4200
|
+
* then use toTranscriptJsonLines() to serialize.
|
|
4201
4201
|
*/
|
|
4202
4202
|
|
|
4203
4203
|
/**
|
|
@@ -4226,20 +4226,35 @@ interface TranscriptSource {
|
|
|
4226
4226
|
/**
|
|
4227
4227
|
* One line in a transcript JSONL file (snake_case wire format).
|
|
4228
4228
|
*
|
|
4229
|
-
* Each line
|
|
4230
|
-
*
|
|
4231
|
-
* full conversation (Message[]).
|
|
4229
|
+
* Each line captures one message within an ordered per-test transcript.
|
|
4230
|
+
* Consumers group all rows with the same `test_id` into a replayable session.
|
|
4232
4231
|
*/
|
|
4233
4232
|
interface TranscriptJsonLine {
|
|
4234
|
-
readonly
|
|
4235
|
-
readonly
|
|
4233
|
+
readonly test_id: string;
|
|
4234
|
+
readonly target: string;
|
|
4235
|
+
readonly message_index: number;
|
|
4236
|
+
readonly role: string;
|
|
4237
|
+
readonly name?: string;
|
|
4238
|
+
readonly content?: unknown;
|
|
4239
|
+
readonly tool_calls?: readonly Record<string, unknown>[];
|
|
4240
|
+
readonly start_time?: string;
|
|
4241
|
+
readonly end_time?: string;
|
|
4242
|
+
readonly duration_ms?: number;
|
|
4243
|
+
readonly metadata?: Record<string, unknown>;
|
|
4236
4244
|
readonly token_usage?: {
|
|
4237
4245
|
readonly input: number;
|
|
4238
4246
|
readonly output: number;
|
|
4239
4247
|
readonly cached?: number;
|
|
4248
|
+
readonly reasoning?: number;
|
|
4240
4249
|
};
|
|
4241
|
-
readonly
|
|
4242
|
-
|
|
4250
|
+
readonly transcript_token_usage?: {
|
|
4251
|
+
readonly input: number;
|
|
4252
|
+
readonly output: number;
|
|
4253
|
+
readonly cached?: number;
|
|
4254
|
+
readonly reasoning?: number;
|
|
4255
|
+
};
|
|
4256
|
+
readonly transcript_duration_ms?: number;
|
|
4257
|
+
readonly transcript_cost_usd?: number | null;
|
|
4243
4258
|
readonly source: {
|
|
4244
4259
|
readonly provider: string;
|
|
4245
4260
|
readonly session_id: string;
|
|
@@ -4251,9 +4266,28 @@ interface TranscriptJsonLine {
|
|
|
4251
4266
|
};
|
|
4252
4267
|
}
|
|
4253
4268
|
/**
|
|
4254
|
-
*
|
|
4269
|
+
* Grouped replayable transcript reconstructed from per-message rows.
|
|
4270
|
+
*/
|
|
4271
|
+
interface TranscriptReplayEntry {
|
|
4272
|
+
readonly testId: string;
|
|
4273
|
+
readonly target: string;
|
|
4274
|
+
readonly messages: readonly Message[];
|
|
4275
|
+
readonly tokenUsage?: ProviderTokenUsage;
|
|
4276
|
+
readonly durationMs?: number;
|
|
4277
|
+
readonly costUsd?: number | null;
|
|
4278
|
+
readonly source: TranscriptSource;
|
|
4279
|
+
}
|
|
4280
|
+
/**
|
|
4281
|
+
* Convert a parsed TranscriptEntry to per-message JSONL rows.
|
|
4282
|
+
*/
|
|
4283
|
+
declare function toTranscriptJsonLines(entry: TranscriptEntry, options?: {
|
|
4284
|
+
testId?: string;
|
|
4285
|
+
target?: string;
|
|
4286
|
+
}): TranscriptJsonLine[];
|
|
4287
|
+
/**
|
|
4288
|
+
* Group per-message transcript rows back into replayable conversations.
|
|
4255
4289
|
*/
|
|
4256
|
-
declare function
|
|
4290
|
+
declare function groupTranscriptJsonLines(lines: readonly TranscriptJsonLine[]): TranscriptReplayEntry[];
|
|
4257
4291
|
/**
|
|
4258
4292
|
* Read a transcript JSONL file and parse each line into a TranscriptJsonLine.
|
|
4259
4293
|
*/
|
|
@@ -4411,9 +4445,9 @@ declare class TranscriptProvider implements Provider {
|
|
|
4411
4445
|
readonly id: string;
|
|
4412
4446
|
readonly kind: "transcript";
|
|
4413
4447
|
readonly targetName: string;
|
|
4414
|
-
private
|
|
4448
|
+
private entries;
|
|
4415
4449
|
private cursor;
|
|
4416
|
-
constructor(targetName: string,
|
|
4450
|
+
constructor(targetName: string, entries: TranscriptReplayEntry[]);
|
|
4417
4451
|
/**
|
|
4418
4452
|
* Create a TranscriptProvider from a JSONL file path.
|
|
4419
4453
|
*/
|
|
@@ -4467,4 +4501,4 @@ type AgentKernel = {
|
|
|
4467
4501
|
};
|
|
4468
4502
|
declare function createAgentKernel(): AgentKernel;
|
|
4469
4503
|
|
|
4470
|
-
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type BenchmarkEntry, type BenchmarkRegistry, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildEvaluatorResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type ConversationAggregation, type ConversationMode, type ConversationTurn, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_THRESHOLD, type DependencyFailurePolicy, type DependencyResult, type DepsScanResult, DeterministicAssertionEvaluator, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type PreparedResultsRepoBranch, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type TurnFailurePolicy, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addBenchmark, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveBenchmarkId, deriveCategory, detectFormat, directorySizeBytes, discoverAssertions, discoverBenchmarks, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getBenchmark, getBenchmarksRegistryPath, getOutputFilenames, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadBenchmarkRegistry, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseJsonFromText, parseJsonSafe, prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeBenchmark, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveBenchmarkRegistry, scanRepoDeps, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep,
|
|
4504
|
+
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type BenchmarkEntry, type BenchmarkRegistry, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildEvaluatorResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type ConversationAggregation, type ConversationMode, type ConversationTurn, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_THRESHOLD, type DependencyFailurePolicy, type DependencyResult, type DepsScanResult, DeterministicAssertionEvaluator, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type PreparedResultsRepoBranch, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptReplayEntry, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type TurnFailurePolicy, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addBenchmark, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveBenchmarkId, deriveCategory, detectFormat, directorySizeBytes, discoverAssertions, discoverBenchmarks, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getBenchmark, getBenchmarksRegistryPath, getOutputFilenames, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, groupTranscriptJsonLines, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadBenchmarkRegistry, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseJsonFromText, parseJsonSafe, prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeBenchmark, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveBenchmarkRegistry, scanRepoDeps, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLines, tokensPerTool, touchBenchmark, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
|
package/dist/index.d.ts
CHANGED
|
@@ -4193,11 +4193,11 @@ declare function discoverGraders(registry: EvaluatorRegistry, baseDir: string):
|
|
|
4193
4193
|
*
|
|
4194
4194
|
* Flow:
|
|
4195
4195
|
* raw session JSONL → parser → TranscriptEntry (internal)
|
|
4196
|
-
* TranscriptEntry →
|
|
4196
|
+
* TranscriptEntry → toTranscriptJsonLines() → JSONL on disk
|
|
4197
4197
|
* JSONL on disk → readTranscriptJsonl() → TranscriptJsonLine[]
|
|
4198
4198
|
*
|
|
4199
4199
|
* To add a new importer: write a parser that returns TranscriptEntry,
|
|
4200
|
-
* then use
|
|
4200
|
+
* then use toTranscriptJsonLines() to serialize.
|
|
4201
4201
|
*/
|
|
4202
4202
|
|
|
4203
4203
|
/**
|
|
@@ -4226,20 +4226,35 @@ interface TranscriptSource {
|
|
|
4226
4226
|
/**
|
|
4227
4227
|
* One line in a transcript JSONL file (snake_case wire format).
|
|
4228
4228
|
*
|
|
4229
|
-
* Each line
|
|
4230
|
-
*
|
|
4231
|
-
* full conversation (Message[]).
|
|
4229
|
+
* Each line captures one message within an ordered per-test transcript.
|
|
4230
|
+
* Consumers group all rows with the same `test_id` into a replayable session.
|
|
4232
4231
|
*/
|
|
4233
4232
|
interface TranscriptJsonLine {
|
|
4234
|
-
readonly
|
|
4235
|
-
readonly
|
|
4233
|
+
readonly test_id: string;
|
|
4234
|
+
readonly target: string;
|
|
4235
|
+
readonly message_index: number;
|
|
4236
|
+
readonly role: string;
|
|
4237
|
+
readonly name?: string;
|
|
4238
|
+
readonly content?: unknown;
|
|
4239
|
+
readonly tool_calls?: readonly Record<string, unknown>[];
|
|
4240
|
+
readonly start_time?: string;
|
|
4241
|
+
readonly end_time?: string;
|
|
4242
|
+
readonly duration_ms?: number;
|
|
4243
|
+
readonly metadata?: Record<string, unknown>;
|
|
4236
4244
|
readonly token_usage?: {
|
|
4237
4245
|
readonly input: number;
|
|
4238
4246
|
readonly output: number;
|
|
4239
4247
|
readonly cached?: number;
|
|
4248
|
+
readonly reasoning?: number;
|
|
4240
4249
|
};
|
|
4241
|
-
readonly
|
|
4242
|
-
|
|
4250
|
+
readonly transcript_token_usage?: {
|
|
4251
|
+
readonly input: number;
|
|
4252
|
+
readonly output: number;
|
|
4253
|
+
readonly cached?: number;
|
|
4254
|
+
readonly reasoning?: number;
|
|
4255
|
+
};
|
|
4256
|
+
readonly transcript_duration_ms?: number;
|
|
4257
|
+
readonly transcript_cost_usd?: number | null;
|
|
4243
4258
|
readonly source: {
|
|
4244
4259
|
readonly provider: string;
|
|
4245
4260
|
readonly session_id: string;
|
|
@@ -4251,9 +4266,28 @@ interface TranscriptJsonLine {
|
|
|
4251
4266
|
};
|
|
4252
4267
|
}
|
|
4253
4268
|
/**
|
|
4254
|
-
*
|
|
4269
|
+
* Grouped replayable transcript reconstructed from per-message rows.
|
|
4270
|
+
*/
|
|
4271
|
+
interface TranscriptReplayEntry {
|
|
4272
|
+
readonly testId: string;
|
|
4273
|
+
readonly target: string;
|
|
4274
|
+
readonly messages: readonly Message[];
|
|
4275
|
+
readonly tokenUsage?: ProviderTokenUsage;
|
|
4276
|
+
readonly durationMs?: number;
|
|
4277
|
+
readonly costUsd?: number | null;
|
|
4278
|
+
readonly source: TranscriptSource;
|
|
4279
|
+
}
|
|
4280
|
+
/**
|
|
4281
|
+
* Convert a parsed TranscriptEntry to per-message JSONL rows.
|
|
4282
|
+
*/
|
|
4283
|
+
declare function toTranscriptJsonLines(entry: TranscriptEntry, options?: {
|
|
4284
|
+
testId?: string;
|
|
4285
|
+
target?: string;
|
|
4286
|
+
}): TranscriptJsonLine[];
|
|
4287
|
+
/**
|
|
4288
|
+
* Group per-message transcript rows back into replayable conversations.
|
|
4255
4289
|
*/
|
|
4256
|
-
declare function
|
|
4290
|
+
declare function groupTranscriptJsonLines(lines: readonly TranscriptJsonLine[]): TranscriptReplayEntry[];
|
|
4257
4291
|
/**
|
|
4258
4292
|
* Read a transcript JSONL file and parse each line into a TranscriptJsonLine.
|
|
4259
4293
|
*/
|
|
@@ -4411,9 +4445,9 @@ declare class TranscriptProvider implements Provider {
|
|
|
4411
4445
|
readonly id: string;
|
|
4412
4446
|
readonly kind: "transcript";
|
|
4413
4447
|
readonly targetName: string;
|
|
4414
|
-
private
|
|
4448
|
+
private entries;
|
|
4415
4449
|
private cursor;
|
|
4416
|
-
constructor(targetName: string,
|
|
4450
|
+
constructor(targetName: string, entries: TranscriptReplayEntry[]);
|
|
4417
4451
|
/**
|
|
4418
4452
|
* Create a TranscriptProvider from a JSONL file path.
|
|
4419
4453
|
*/
|
|
@@ -4467,4 +4501,4 @@ type AgentKernel = {
|
|
|
4467
4501
|
};
|
|
4468
4502
|
declare function createAgentKernel(): AgentKernel;
|
|
4469
4503
|
|
|
4470
|
-
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type BenchmarkEntry, type BenchmarkRegistry, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildEvaluatorResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type ConversationAggregation, type ConversationMode, type ConversationTurn, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_THRESHOLD, type DependencyFailurePolicy, type DependencyResult, type DepsScanResult, DeterministicAssertionEvaluator, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type PreparedResultsRepoBranch, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type TurnFailurePolicy, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addBenchmark, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveBenchmarkId, deriveCategory, detectFormat, directorySizeBytes, discoverAssertions, discoverBenchmarks, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getBenchmark, getBenchmarksRegistryPath, getOutputFilenames, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadBenchmarkRegistry, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseJsonFromText, parseJsonSafe, prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeBenchmark, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveBenchmarkRegistry, scanRepoDeps, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep,
|
|
4504
|
+
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type BenchmarkEntry, type BenchmarkRegistry, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildEvaluatorResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type ConversationAggregation, type ConversationMode, type ConversationTurn, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_THRESHOLD, type DependencyFailurePolicy, type DependencyResult, type DepsScanResult, DeterministicAssertionEvaluator, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type PreparedResultsRepoBranch, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptReplayEntry, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type TurnFailurePolicy, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addBenchmark, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveBenchmarkId, deriveCategory, detectFormat, directorySizeBytes, discoverAssertions, discoverBenchmarks, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getBenchmark, getBenchmarksRegistryPath, getOutputFilenames, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, groupTranscriptJsonLines, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadBenchmarkRegistry, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseJsonFromText, parseJsonSafe, prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeBenchmark, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveBenchmarkRegistry, scanRepoDeps, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLines, tokensPerTool, touchBenchmark, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
|
package/dist/index.js
CHANGED
|
@@ -3894,7 +3894,21 @@ async function resolveWorkspaceConfig(raw, evalFileDir) {
|
|
|
3894
3894
|
);
|
|
3895
3895
|
}
|
|
3896
3896
|
const workspaceFileDir = path8.dirname(workspaceFilePath);
|
|
3897
|
-
|
|
3897
|
+
const resolvedWorkspace = parseWorkspaceConfig(parsed, workspaceFileDir);
|
|
3898
|
+
if (resolvedWorkspace) {
|
|
3899
|
+
return resolvedWorkspace;
|
|
3900
|
+
}
|
|
3901
|
+
const parsedObject = parsed;
|
|
3902
|
+
if ("workspace" in parsedObject && isJsonObject(parsedObject.workspace)) {
|
|
3903
|
+
throw new Error(
|
|
3904
|
+
[
|
|
3905
|
+
`Invalid workspace file format: ${workspaceFilePath}`,
|
|
3906
|
+
"External workspace files must contain the workspace config object directly.",
|
|
3907
|
+
'Remove the top-level "workspace:" wrapper.'
|
|
3908
|
+
].join(" ")
|
|
3909
|
+
);
|
|
3910
|
+
}
|
|
3911
|
+
return void 0;
|
|
3898
3912
|
}
|
|
3899
3913
|
return parseWorkspaceConfig(raw, evalFileDir);
|
|
3900
3914
|
}
|
|
@@ -21232,30 +21246,91 @@ async function discoverClaudeSessions(opts) {
|
|
|
21232
21246
|
|
|
21233
21247
|
// src/import/types.ts
|
|
21234
21248
|
import { readFile as readFile18 } from "node:fs/promises";
|
|
21235
|
-
function
|
|
21236
|
-
const
|
|
21237
|
-
|
|
21249
|
+
function toTranscriptJsonLines(entry, options) {
|
|
21250
|
+
const source = {
|
|
21251
|
+
provider: entry.source.provider,
|
|
21252
|
+
session_id: entry.source.sessionId,
|
|
21253
|
+
model: entry.source.model,
|
|
21254
|
+
timestamp: entry.source.startedAt,
|
|
21255
|
+
git_branch: entry.source.gitBranch,
|
|
21256
|
+
cwd: entry.source.cwd ?? entry.source.projectPath,
|
|
21257
|
+
version: entry.source.version
|
|
21258
|
+
};
|
|
21259
|
+
const transcriptTokenUsage = entry.tokenUsage ? {
|
|
21260
|
+
input: entry.tokenUsage.input,
|
|
21261
|
+
output: entry.tokenUsage.output,
|
|
21262
|
+
cached: entry.tokenUsage.cached,
|
|
21263
|
+
reasoning: entry.tokenUsage.reasoning
|
|
21264
|
+
} : void 0;
|
|
21265
|
+
const testId = options?.testId ?? entry.source.sessionId;
|
|
21266
|
+
const target = options?.target ?? entry.source.provider;
|
|
21267
|
+
return entry.messages.map((message, index) => ({
|
|
21268
|
+
test_id: testId,
|
|
21269
|
+
target,
|
|
21270
|
+
message_index: index,
|
|
21271
|
+
...toSnakeCaseDeep(message),
|
|
21272
|
+
transcript_token_usage: transcriptTokenUsage,
|
|
21273
|
+
transcript_duration_ms: entry.durationMs,
|
|
21274
|
+
transcript_cost_usd: entry.costUsd,
|
|
21275
|
+
source
|
|
21276
|
+
}));
|
|
21277
|
+
}
|
|
21278
|
+
function buildReplayMessage(line) {
|
|
21279
|
+
const camelCased = toCamelCaseDeep(line);
|
|
21238
21280
|
return {
|
|
21239
|
-
|
|
21240
|
-
|
|
21241
|
-
|
|
21242
|
-
|
|
21243
|
-
|
|
21244
|
-
|
|
21245
|
-
|
|
21246
|
-
|
|
21247
|
-
|
|
21248
|
-
source: {
|
|
21249
|
-
provider: entry.source.provider,
|
|
21250
|
-
session_id: entry.source.sessionId,
|
|
21251
|
-
model: entry.source.model,
|
|
21252
|
-
timestamp: entry.source.startedAt,
|
|
21253
|
-
git_branch: entry.source.gitBranch,
|
|
21254
|
-
cwd: entry.source.cwd ?? entry.source.projectPath,
|
|
21255
|
-
version: entry.source.version
|
|
21256
|
-
}
|
|
21281
|
+
role: camelCased.role,
|
|
21282
|
+
name: camelCased.name,
|
|
21283
|
+
content: camelCased.content,
|
|
21284
|
+
toolCalls: camelCased.toolCalls,
|
|
21285
|
+
startTime: camelCased.startTime,
|
|
21286
|
+
endTime: camelCased.endTime,
|
|
21287
|
+
durationMs: camelCased.durationMs,
|
|
21288
|
+
metadata: camelCased.metadata,
|
|
21289
|
+
tokenUsage: camelCased.tokenUsage
|
|
21257
21290
|
};
|
|
21258
21291
|
}
|
|
21292
|
+
function groupTranscriptJsonLines(lines) {
|
|
21293
|
+
const grouped = /* @__PURE__ */ new Map();
|
|
21294
|
+
for (const line of lines) {
|
|
21295
|
+
const existing = grouped.get(line.test_id);
|
|
21296
|
+
const source = {
|
|
21297
|
+
provider: line.source.provider,
|
|
21298
|
+
sessionId: line.source.session_id,
|
|
21299
|
+
startedAt: line.source.timestamp,
|
|
21300
|
+
model: line.source.model,
|
|
21301
|
+
gitBranch: line.source.git_branch,
|
|
21302
|
+
cwd: line.source.cwd,
|
|
21303
|
+
version: line.source.version
|
|
21304
|
+
};
|
|
21305
|
+
const transcriptTokenUsage = line.transcript_token_usage ? {
|
|
21306
|
+
input: line.transcript_token_usage.input,
|
|
21307
|
+
output: line.transcript_token_usage.output,
|
|
21308
|
+
cached: line.transcript_token_usage.cached,
|
|
21309
|
+
reasoning: line.transcript_token_usage.reasoning
|
|
21310
|
+
} : void 0;
|
|
21311
|
+
if (existing) {
|
|
21312
|
+
existing.messages.push({ index: line.message_index, message: buildReplayMessage(line) });
|
|
21313
|
+
continue;
|
|
21314
|
+
}
|
|
21315
|
+
grouped.set(line.test_id, {
|
|
21316
|
+
target: line.target,
|
|
21317
|
+
tokenUsage: transcriptTokenUsage,
|
|
21318
|
+
durationMs: line.transcript_duration_ms,
|
|
21319
|
+
costUsd: line.transcript_cost_usd,
|
|
21320
|
+
source,
|
|
21321
|
+
messages: [{ index: line.message_index, message: buildReplayMessage(line) }]
|
|
21322
|
+
});
|
|
21323
|
+
}
|
|
21324
|
+
return [...grouped.entries()].map(([testId, entry]) => ({
|
|
21325
|
+
testId,
|
|
21326
|
+
target: entry.target,
|
|
21327
|
+
tokenUsage: entry.tokenUsage,
|
|
21328
|
+
durationMs: entry.durationMs,
|
|
21329
|
+
costUsd: entry.costUsd,
|
|
21330
|
+
source: entry.source,
|
|
21331
|
+
messages: entry.messages.sort((first, second) => first.index - second.index).map((item) => item.message)
|
|
21332
|
+
}));
|
|
21333
|
+
}
|
|
21259
21334
|
async function readTranscriptJsonl(filePath) {
|
|
21260
21335
|
const text = await readFile18(filePath, "utf8");
|
|
21261
21336
|
return text.split("\n").filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
|
|
@@ -21269,12 +21344,12 @@ var TranscriptProvider = class _TranscriptProvider {
|
|
|
21269
21344
|
id;
|
|
21270
21345
|
kind = "transcript";
|
|
21271
21346
|
targetName;
|
|
21272
|
-
|
|
21347
|
+
entries;
|
|
21273
21348
|
cursor = 0;
|
|
21274
|
-
constructor(targetName,
|
|
21349
|
+
constructor(targetName, entries) {
|
|
21275
21350
|
this.targetName = targetName;
|
|
21276
21351
|
this.id = `transcript:${targetName}`;
|
|
21277
|
-
this.
|
|
21352
|
+
this.entries = entries;
|
|
21278
21353
|
}
|
|
21279
21354
|
/**
|
|
21280
21355
|
* Create a TranscriptProvider from a JSONL file path.
|
|
@@ -21284,29 +21359,31 @@ var TranscriptProvider = class _TranscriptProvider {
|
|
|
21284
21359
|
if (lines.length === 0) {
|
|
21285
21360
|
throw new Error(`Transcript file is empty: ${filePath}`);
|
|
21286
21361
|
}
|
|
21287
|
-
const
|
|
21288
|
-
|
|
21362
|
+
const entries = groupTranscriptJsonLines(lines);
|
|
21363
|
+
const providerName = entries[0]?.source.provider ?? "transcript";
|
|
21364
|
+
return new _TranscriptProvider(providerName, entries);
|
|
21289
21365
|
}
|
|
21290
21366
|
get lineCount() {
|
|
21291
|
-
return this.
|
|
21367
|
+
return this.entries.length;
|
|
21292
21368
|
}
|
|
21293
21369
|
async invoke(_request) {
|
|
21294
|
-
if (this.cursor >= this.
|
|
21370
|
+
if (this.cursor >= this.entries.length) {
|
|
21295
21371
|
throw new Error(
|
|
21296
|
-
`Transcript exhausted: ${this.
|
|
21372
|
+
`Transcript exhausted: ${this.entries.length} entr${this.entries.length === 1 ? "y" : "ies"} available but ${this.cursor + 1} invocations attempted. Each transcript entry maps to one test case.`
|
|
21297
21373
|
);
|
|
21298
21374
|
}
|
|
21299
|
-
const
|
|
21375
|
+
const entry = this.entries[this.cursor++];
|
|
21300
21376
|
return {
|
|
21301
|
-
output:
|
|
21302
|
-
tokenUsage:
|
|
21303
|
-
input:
|
|
21304
|
-
output:
|
|
21305
|
-
cached:
|
|
21377
|
+
output: entry.messages,
|
|
21378
|
+
tokenUsage: entry.tokenUsage ? {
|
|
21379
|
+
input: entry.tokenUsage.input,
|
|
21380
|
+
output: entry.tokenUsage.output,
|
|
21381
|
+
cached: entry.tokenUsage.cached,
|
|
21382
|
+
reasoning: entry.tokenUsage.reasoning
|
|
21306
21383
|
} : void 0,
|
|
21307
|
-
durationMs:
|
|
21308
|
-
costUsd:
|
|
21309
|
-
startTime:
|
|
21384
|
+
durationMs: entry.durationMs,
|
|
21385
|
+
costUsd: entry.costUsd ?? void 0,
|
|
21386
|
+
startTime: entry.source.startedAt
|
|
21310
21387
|
};
|
|
21311
21388
|
}
|
|
21312
21389
|
};
|
|
@@ -21427,6 +21504,7 @@ export {
|
|
|
21427
21504
|
getWorkspacePath,
|
|
21428
21505
|
getWorkspacePoolRoot,
|
|
21429
21506
|
getWorkspacesRoot,
|
|
21507
|
+
groupTranscriptJsonLines,
|
|
21430
21508
|
initializeBaseline,
|
|
21431
21509
|
isAgentSkillsFormat,
|
|
21432
21510
|
isContent,
|
|
@@ -21502,7 +21580,7 @@ export {
|
|
|
21502
21580
|
syncResultsRepo,
|
|
21503
21581
|
toCamelCaseDeep,
|
|
21504
21582
|
toSnakeCaseDeep,
|
|
21505
|
-
|
|
21583
|
+
toTranscriptJsonLines,
|
|
21506
21584
|
tokensPerTool,
|
|
21507
21585
|
touchBenchmark,
|
|
21508
21586
|
transpileEvalYaml,
|