@agentv/core 2.1.1 → 2.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +77 -77
- package/dist/{chunk-KDEP4I7G.js → chunk-RP3M7COZ.js} +1 -1
- package/dist/{chunk-KDEP4I7G.js.map → chunk-RP3M7COZ.js.map} +1 -1
- package/dist/evaluation/validation/index.cjs +38 -4
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +39 -5
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +1070 -281
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +82 -7
- package/dist/index.d.ts +82 -7
- package/dist/index.js +1018 -230
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.d.cts
CHANGED
@@ -60,6 +60,8 @@ interface ToolTrajectoryExpectedItem {
     readonly tool: string;
     /** Optional argument matching: 'any' skips validation, object performs partial deep equality */
     readonly args?: 'any' | Record<string, unknown>;
+    /** Optional maximum duration in milliseconds for latency assertions */
+    readonly maxDurationMs?: number;
 }
 /**
  * Simplified input type for computeTraceSummary.
@@ -226,19 +228,71 @@ type CodeEvaluatorConfig = {
     /** When present, enables target access for the script via local proxy */
     readonly target?: TargetAccessConfig;
 };
+/**
+ * Executable prompt template configuration.
+ * Matches code_judge pattern for consistency.
+ */
+type PromptScriptConfig = {
+    /** Command array to execute (e.g., ["bun", "run", "template.ts"]) */
+    readonly script: readonly string[];
+    /** Pass-through configuration for the prompt template */
+    readonly config?: Record<string, unknown>;
+};
 type LlmJudgeEvaluatorConfig = {
     readonly name: string;
     readonly type: 'llm_judge';
-
+    /** Text prompt (inline or file path) or executable script config */
+    readonly prompt?: string | PromptScriptConfig;
     readonly promptPath?: string;
+    /** Resolved absolute path for prompt file (used for text template prompts) */
+    readonly resolvedPromptPath?: string;
+    /** Resolved script array for executable prompts (matches code_judge pattern) */
+    readonly resolvedPromptScript?: readonly string[];
     readonly rubrics?: readonly RubricItem[];
     readonly weight?: number;
+    /** Pass-through configuration for custom evaluator prompts (legacy, prefer prompt.config) */
+    readonly config?: Record<string, unknown>;
+};
+/**
+ * Score range definition for analytic rubric scoring.
+ * Each range maps an integer score band (0-10) to an expected outcome description.
+ */
+type ScoreRange = {
+    /** Inclusive integer range [min, max] within 0-10 */
+    readonly score_range: readonly [number, number];
+    /** Description of what this score range represents */
+    readonly expected_outcome: string;
 };
+/**
+ * Rubric item for LLM judge evaluation.
+ * Supports two modes:
+ * - Checklist mode: boolean satisfied/not-satisfied with `expected_outcome`
+ * - Score-range mode: 0-10 integer scoring with `score_ranges`
+ */
 type RubricItem = {
     readonly id: string;
-
+    /**
+     * For checklist rubrics: the expected outcome text (required).
+     * For score-range rubrics: optional overall criterion description.
+     */
+    readonly expected_outcome?: string;
     readonly weight: number;
-
+    /**
+     * Legacy boolean gating (deprecated, treated as required_min_score: 10).
+     * Use required_min_score instead for finer control.
+     */
+    readonly required?: boolean;
+    /**
+     * Minimum score (0-10) required to pass this criterion.
+     * If the criterion score is below this threshold, the overall verdict is 'fail'.
+     */
+    readonly required_min_score?: number;
+    /**
+     * Score range definitions for analytic rubric scoring.
+     * When present, the judge outputs an integer 0-10 score per criterion.
+     * Ranges must be non-overlapping and cover 0-10 inclusive.
+     */
+    readonly score_ranges?: readonly ScoreRange[];
 };
 type CompositeAggregatorConfig = {
     readonly type: 'weighted_average';
@@ -438,6 +492,8 @@ interface ToolCall {
     readonly id?: string;
     /** ISO 8601 timestamp */
     readonly timestamp?: string;
+    /** Duration of the tool call in milliseconds */
+    readonly durationMs?: number;
 }
 /**
  * An output message from agent execution.
@@ -454,6 +510,8 @@ interface OutputMessage {
     readonly toolCalls?: readonly ToolCall[];
     /** ISO 8601 timestamp */
     readonly timestamp?: string;
+    /** Duration of the message in milliseconds */
+    readonly durationMs?: number;
     /** Provider-specific metadata */
     readonly metadata?: Record<string, unknown>;
 }
@@ -601,9 +659,15 @@ declare function buildPromptInputs(testCase: EvalCase, mode?: FormattingMode): P
  */
 declare function isGuidelineFile(filePath: string, patterns?: readonly string[]): boolean;
 
+/**
+ * Detect file format by extension.
+ */
+declare function detectFormat(filePath: string): 'yaml' | 'jsonl';
+
 type LoadOptions = {
     readonly verbose?: boolean;
-
+    /** Filter eval cases by ID pattern (glob supported, e.g., "summary-*") */
+    readonly filter?: string;
 };
 /**
  * Read metadata from a test suite file (like target name).
@@ -613,7 +677,8 @@ declare function readTestSuiteMetadata(testFilePath: string): Promise<{
     target?: string;
 }>;
 /**
- * Load eval cases from a AgentV YAML
+ * Load eval cases from a AgentV specification file (YAML or JSONL).
+ * Format is detected by file extension: .yaml/.yml for YAML, .jsonl for JSONL.
  */
 declare function loadEvalCases(evalFilePath: string, repoRoot: URL | string, options?: LoadOptions): Promise<readonly EvalCase[]>;
 
@@ -1186,6 +1251,15 @@ declare class LlmJudgeEvaluator implements Evaluator {
     evaluate(context: EvaluationContext): Promise<EvaluationScore>;
     private evaluateFreeform;
     private evaluateWithRubrics;
+    /**
+     * Evaluate using score-range rubrics (analytic rubric scoring).
+     * Each criterion is scored 0-10 and normalized to 0-1.
+     */
+    private evaluateWithScoreRanges;
+    /**
+     * Build prompt for score-range rubric evaluation.
+     */
+    private buildScoreRangePrompt;
     private buildRubricPrompt;
     private runWithRetry;
 }
@@ -1275,7 +1349,8 @@ interface RunEvaluationOptions {
     readonly cache?: EvaluationCache;
     readonly useCache?: boolean;
     readonly now?: () => Date;
-
+    /** Filter eval cases by ID pattern (glob supported, e.g., "summary-*") */
+    readonly filter?: string;
     readonly verbose?: boolean;
     readonly maxConcurrency?: number;
     readonly evalCases?: readonly EvalCase[];
@@ -1301,4 +1376,4 @@ type AgentKernel = {
 };
 declare function createAgentKernel(): AgentKernel;
 
-export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type ClaudeCodeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EXPLORATION_TOOLS, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, type ExecutionMetrics, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ProviderTokenUsage, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceSummary, type UserTestMessage, type VSCodeResolvedConfig, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildSearchRoots, clampScore, computeTraceSummary, consumeClaudeCodeLogEntries, consumeCodexLogEntries, consumePiLogEntries, createAgentKernel, createProvider, deepEqual, ensureVSCodeSubagents, executeScript, explorationRatio, extractJsonBlob, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, mergeExecutionMetrics, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, scoreToVerdict, subscribeToClaudeCodeLogEntries, subscribeToCodexLogEntries, subscribeToPiLogEntries, tokensPerTool };
+export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type ClaudeCodeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EXPLORATION_TOOLS, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, type ExecutionMetrics, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ProviderTokenUsage, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceSummary, type UserTestMessage, type VSCodeResolvedConfig, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildSearchRoots, clampScore, computeTraceSummary, consumeClaudeCodeLogEntries, consumeCodexLogEntries, consumePiLogEntries, createAgentKernel, createProvider, deepEqual, detectFormat, ensureVSCodeSubagents, executeScript, explorationRatio, extractJsonBlob, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, mergeExecutionMetrics, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, scoreToVerdict, subscribeToClaudeCodeLogEntries, subscribeToCodexLogEntries, subscribeToPiLogEntries, tokensPerTool };
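The new score-range rubric types are easiest to read as a concrete config. The sketch below is illustrative only: it types a hypothetical llm_judge configuration against the ScoreRange and RubricItem declarations added above; the rubric ids, descriptions, weights, and thresholds are made-up placeholders, not values from the package.

```ts
import type { LlmJudgeEvaluatorConfig, RubricItem, ScoreRange } from '@agentv/core';

// Hypothetical score bands: non-overlapping and covering 0-10 inclusive,
// as the `score_ranges` doc comment requires.
const accuracyBands: readonly ScoreRange[] = [
  { score_range: [0, 3], expected_outcome: 'Answer is mostly incorrect or off-topic' },
  { score_range: [4, 7], expected_outcome: 'Answer is partially correct with notable gaps' },
  { score_range: [8, 10], expected_outcome: 'Answer is accurate and complete' },
];

const rubrics: readonly RubricItem[] = [
  {
    // Score-range mode: the judge emits an integer 0-10 for this criterion,
    // and `required_min_score` gates the overall verdict (replacing the legacy `required`).
    id: 'accuracy',
    weight: 1,
    required_min_score: 8,
    score_ranges: accuracyBands,
  },
  {
    // Checklist mode still works: a boolean pass/fail against `expected_outcome`.
    id: 'tone',
    expected_outcome: 'Response stays polite and concise',
    weight: 0.5,
  },
];

const judge: LlmJudgeEvaluatorConfig = {
  name: 'answer-quality',
  type: 'llm_judge',
  rubrics,
};
```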
package/dist/index.d.ts
CHANGED
@@ -60,6 +60,8 @@ interface ToolTrajectoryExpectedItem {
     readonly tool: string;
     /** Optional argument matching: 'any' skips validation, object performs partial deep equality */
     readonly args?: 'any' | Record<string, unknown>;
+    /** Optional maximum duration in milliseconds for latency assertions */
+    readonly maxDurationMs?: number;
 }
 /**
  * Simplified input type for computeTraceSummary.
@@ -226,19 +228,71 @@ type CodeEvaluatorConfig = {
     /** When present, enables target access for the script via local proxy */
     readonly target?: TargetAccessConfig;
 };
+/**
+ * Executable prompt template configuration.
+ * Matches code_judge pattern for consistency.
+ */
+type PromptScriptConfig = {
+    /** Command array to execute (e.g., ["bun", "run", "template.ts"]) */
+    readonly script: readonly string[];
+    /** Pass-through configuration for the prompt template */
+    readonly config?: Record<string, unknown>;
+};
 type LlmJudgeEvaluatorConfig = {
     readonly name: string;
     readonly type: 'llm_judge';
-
+    /** Text prompt (inline or file path) or executable script config */
+    readonly prompt?: string | PromptScriptConfig;
     readonly promptPath?: string;
+    /** Resolved absolute path for prompt file (used for text template prompts) */
+    readonly resolvedPromptPath?: string;
+    /** Resolved script array for executable prompts (matches code_judge pattern) */
+    readonly resolvedPromptScript?: readonly string[];
     readonly rubrics?: readonly RubricItem[];
     readonly weight?: number;
+    /** Pass-through configuration for custom evaluator prompts (legacy, prefer prompt.config) */
+    readonly config?: Record<string, unknown>;
+};
+/**
+ * Score range definition for analytic rubric scoring.
+ * Each range maps an integer score band (0-10) to an expected outcome description.
+ */
+type ScoreRange = {
+    /** Inclusive integer range [min, max] within 0-10 */
+    readonly score_range: readonly [number, number];
+    /** Description of what this score range represents */
+    readonly expected_outcome: string;
 };
+/**
+ * Rubric item for LLM judge evaluation.
+ * Supports two modes:
+ * - Checklist mode: boolean satisfied/not-satisfied with `expected_outcome`
+ * - Score-range mode: 0-10 integer scoring with `score_ranges`
+ */
 type RubricItem = {
     readonly id: string;
-
+    /**
+     * For checklist rubrics: the expected outcome text (required).
+     * For score-range rubrics: optional overall criterion description.
+     */
+    readonly expected_outcome?: string;
     readonly weight: number;
-
+    /**
+     * Legacy boolean gating (deprecated, treated as required_min_score: 10).
+     * Use required_min_score instead for finer control.
+     */
+    readonly required?: boolean;
+    /**
+     * Minimum score (0-10) required to pass this criterion.
+     * If the criterion score is below this threshold, the overall verdict is 'fail'.
+     */
+    readonly required_min_score?: number;
+    /**
+     * Score range definitions for analytic rubric scoring.
+     * When present, the judge outputs an integer 0-10 score per criterion.
+     * Ranges must be non-overlapping and cover 0-10 inclusive.
+     */
+    readonly score_ranges?: readonly ScoreRange[];
 };
 type CompositeAggregatorConfig = {
     readonly type: 'weighted_average';
@@ -438,6 +492,8 @@ interface ToolCall {
     readonly id?: string;
     /** ISO 8601 timestamp */
     readonly timestamp?: string;
+    /** Duration of the tool call in milliseconds */
+    readonly durationMs?: number;
 }
 /**
  * An output message from agent execution.
@@ -454,6 +510,8 @@ interface OutputMessage {
     readonly toolCalls?: readonly ToolCall[];
     /** ISO 8601 timestamp */
     readonly timestamp?: string;
+    /** Duration of the message in milliseconds */
+    readonly durationMs?: number;
     /** Provider-specific metadata */
     readonly metadata?: Record<string, unknown>;
 }
@@ -601,9 +659,15 @@ declare function buildPromptInputs(testCase: EvalCase, mode?: FormattingMode): P
  */
 declare function isGuidelineFile(filePath: string, patterns?: readonly string[]): boolean;
 
+/**
+ * Detect file format by extension.
+ */
+declare function detectFormat(filePath: string): 'yaml' | 'jsonl';
+
 type LoadOptions = {
     readonly verbose?: boolean;
-
+    /** Filter eval cases by ID pattern (glob supported, e.g., "summary-*") */
+    readonly filter?: string;
 };
 /**
  * Read metadata from a test suite file (like target name).
@@ -613,7 +677,8 @@ declare function readTestSuiteMetadata(testFilePath: string): Promise<{
     target?: string;
 }>;
 /**
- * Load eval cases from a AgentV YAML
+ * Load eval cases from a AgentV specification file (YAML or JSONL).
+ * Format is detected by file extension: .yaml/.yml for YAML, .jsonl for JSONL.
  */
 declare function loadEvalCases(evalFilePath: string, repoRoot: URL | string, options?: LoadOptions): Promise<readonly EvalCase[]>;
 
@@ -1186,6 +1251,15 @@ declare class LlmJudgeEvaluator implements Evaluator {
     evaluate(context: EvaluationContext): Promise<EvaluationScore>;
     private evaluateFreeform;
     private evaluateWithRubrics;
+    /**
+     * Evaluate using score-range rubrics (analytic rubric scoring).
+     * Each criterion is scored 0-10 and normalized to 0-1.
+     */
+    private evaluateWithScoreRanges;
+    /**
+     * Build prompt for score-range rubric evaluation.
+     */
+    private buildScoreRangePrompt;
     private buildRubricPrompt;
     private runWithRetry;
 }
@@ -1275,7 +1349,8 @@ interface RunEvaluationOptions {
     readonly cache?: EvaluationCache;
     readonly useCache?: boolean;
     readonly now?: () => Date;
-
+    /** Filter eval cases by ID pattern (glob supported, e.g., "summary-*") */
+    readonly filter?: string;
     readonly verbose?: boolean;
     readonly maxConcurrency?: number;
     readonly evalCases?: readonly EvalCase[];
@@ -1301,4 +1376,4 @@ type AgentKernel = {
 };
 declare function createAgentKernel(): AgentKernel;
 
-export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type ClaudeCodeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EXPLORATION_TOOLS, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, type ExecutionMetrics, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ProviderTokenUsage, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceSummary, type UserTestMessage, type VSCodeResolvedConfig, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildSearchRoots, clampScore, computeTraceSummary, consumeClaudeCodeLogEntries, consumeCodexLogEntries, consumePiLogEntries, createAgentKernel, createProvider, deepEqual, ensureVSCodeSubagents, executeScript, explorationRatio, extractJsonBlob, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, mergeExecutionMetrics, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, scoreToVerdict, subscribeToClaudeCodeLogEntries, subscribeToCodexLogEntries, subscribeToPiLogEntries, tokensPerTool };
+export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type ClaudeCodeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EXPLORATION_TOOLS, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, type ExecutionMetrics, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ProviderTokenUsage, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceSummary, type UserTestMessage, type VSCodeResolvedConfig, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildSearchRoots, clampScore, computeTraceSummary, consumeClaudeCodeLogEntries, consumeCodexLogEntries, consumePiLogEntries, createAgentKernel, createProvider, deepEqual, detectFormat, ensureVSCodeSubagents, executeScript, explorationRatio, extractJsonBlob, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, mergeExecutionMetrics, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, scoreToVerdict, subscribeToClaudeCodeLogEntries, subscribeToCodexLogEntries, subscribeToPiLogEntries, tokensPerTool };
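The loader-side additions (detectFormat, the glob-style filter on LoadOptions, and the executable PromptScriptConfig prompt) can also be sketched against these declarations. The snippet below is a hedged example only: the file paths, filter pattern, script command, and config keys are placeholders, not values shipped with the package.

```ts
import { detectFormat, loadEvalCases } from '@agentv/core';
import type { LlmJudgeEvaluatorConfig, PromptScriptConfig } from '@agentv/core';

// Executable prompt template in the code_judge-style shape of PromptScriptConfig.
// Command and config keys are placeholders.
const promptScript: PromptScriptConfig = {
  script: ['bun', 'run', 'judge-prompt.ts'],
  config: { style: 'strict' },
};

// `prompt` now accepts either a plain string (inline text or file path) or a script config.
export const judge: LlmJudgeEvaluatorConfig = {
  name: 'custom-prompt-judge',
  type: 'llm_judge',
  prompt: promptScript,
};

// detectFormat picks YAML vs JSONL from the extension, and loadEvalCases takes a
// glob-style `filter` in LoadOptions to narrow the loaded cases by ID.
export async function loadSummaryCases(repoRoot: string) {
  const evalFile = 'evals/cases.jsonl'; // placeholder path
  console.log(detectFormat(evalFile));  // 'jsonl'
  return loadEvalCases(evalFile, repoRoot, { filter: 'summary-*', verbose: true });
}
```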