@agentv/core 0.11.0 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-YQBJAT5I.js → chunk-U3GEJ3K7.js} +1 -1
- package/dist/{chunk-YQBJAT5I.js.map → chunk-U3GEJ3K7.js.map} +1 -1
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +675 -562
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +29 -26
- package/dist/index.d.ts +29 -26
- package/dist/index.js +707 -592
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.d.cts
CHANGED
|
@@ -117,6 +117,7 @@ interface EvalCase {
|
|
|
117
117
|
* Evaluator scorecard for a single eval case run.
|
|
118
118
|
*/
|
|
119
119
|
interface EvaluationResult {
|
|
120
|
+
readonly timestamp: string;
|
|
120
121
|
readonly eval_id: string;
|
|
121
122
|
readonly dataset?: string;
|
|
122
123
|
readonly conversation_id?: string;
|
|
@@ -124,14 +125,12 @@ interface EvaluationResult {
|
|
|
124
125
|
readonly hits: readonly string[];
|
|
125
126
|
readonly misses: readonly string[];
|
|
126
127
|
readonly candidate_answer: string;
|
|
127
|
-
readonly expected_aspect_count: number;
|
|
128
128
|
readonly target: string;
|
|
129
|
-
readonly timestamp: string;
|
|
130
129
|
readonly reasoning?: string;
|
|
131
130
|
readonly raw_aspects?: readonly string[];
|
|
132
131
|
readonly agent_provider_request?: JsonObject;
|
|
133
132
|
readonly lm_provider_request?: JsonObject;
|
|
134
|
-
readonly
|
|
133
|
+
readonly evaluator_provider_request?: JsonObject;
|
|
135
134
|
readonly evaluator_results?: readonly EvaluatorResult[];
|
|
136
135
|
readonly error?: string;
|
|
137
136
|
}
|
|
@@ -143,7 +142,7 @@ interface EvaluatorResult {
|
|
|
143
142
|
readonly misses: readonly string[];
|
|
144
143
|
readonly reasoning?: string;
|
|
145
144
|
readonly raw_request?: JsonObject;
|
|
146
|
-
readonly
|
|
145
|
+
readonly evaluator_provider_request?: JsonObject;
|
|
147
146
|
}
|
|
148
147
|
/**
|
|
149
148
|
* Convenience accessor matching the Python hit_count property.
|
|
@@ -154,6 +153,7 @@ type ChatPrompt = AxChatRequest["chatPrompt"];
|
|
|
154
153
|
type ProviderKind = "azure" | "anthropic" | "gemini" | "codex" | "cli" | "mock" | "vscode" | "vscode-insiders";
|
|
155
154
|
interface ProviderRequest {
|
|
156
155
|
readonly question: string;
|
|
156
|
+
readonly systemPrompt?: string;
|
|
157
157
|
readonly guidelines?: string;
|
|
158
158
|
readonly guideline_patterns?: readonly string[];
|
|
159
159
|
readonly chatPrompt?: ChatPrompt;
|
|
@@ -264,38 +264,41 @@ interface TargetDefinition {
|
|
|
264
264
|
}
|
|
265
265
|
|
|
266
266
|
/**
|
|
267
|
-
* Read metadata from a test suite file (like target name).
|
|
268
|
-
* This is a convenience function for CLI tools that need metadata without loading all eval cases.
|
|
269
|
-
*/
|
|
270
|
-
declare function readTestSuiteMetadata(testFilePath: string): Promise<{
|
|
271
|
-
target?: string;
|
|
272
|
-
}>;
|
|
273
|
-
/**
|
|
274
|
-
* Determine whether a path references guideline content (instructions or prompts).
|
|
267
|
+
* Build prompt inputs by consolidating user request context and guideline content.
|
|
275
268
|
*/
|
|
276
|
-
declare function isGuidelineFile(filePath: string, patterns?: readonly string[]): boolean;
|
|
269
|
+
interface PromptInputs {
|
|
270
|
+
readonly question: string;
|
|
271
|
+
readonly guidelines: string;
|
|
272
|
+
readonly chatPrompt?: ChatPrompt;
|
|
273
|
+
readonly systemMessage?: string;
|
|
274
|
+
}
|
|
275
|
+
declare function buildPromptInputs(testCase: EvalCase): Promise<PromptInputs>;
|
|
276
|
+
|
|
277
277
|
/**
|
|
278
278
|
* Extract fenced code blocks from AgentV user segments.
|
|
279
279
|
*/
|
|
280
280
|
declare function extractCodeBlocks(segments: readonly JsonObject[]): readonly string[];
|
|
281
|
+
|
|
282
|
+
/**
|
|
283
|
+
* Determine whether a path references guideline content (instructions or prompts).
|
|
284
|
+
*/
|
|
285
|
+
declare function isGuidelineFile(filePath: string, patterns?: readonly string[]): boolean;
|
|
286
|
+
|
|
281
287
|
type LoadOptions = {
|
|
282
288
|
readonly verbose?: boolean;
|
|
283
289
|
readonly evalId?: string;
|
|
284
290
|
};
|
|
285
291
|
/**
|
|
286
|
-
* Load eval cases from a AgentV YAML specification file.
|
|
292
|
+
* Read metadata from a test suite file (like target name).
|
|
293
|
+
* This is a convenience function for CLI tools that need metadata without loading all eval cases.
|
|
287
294
|
*/
|
|
288
|
-
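declare function loadEvalCases(evalFilePath: string, repoRoot: URL | string, options?: LoadOptions): Promise<readonly EvalCase[]>;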
|
|
295
|
+
declare function readTestSuiteMetadata(testFilePath: string): Promise<{
|
|
296
|
+
target?: string;
|
|
297
|
+
}>;
|
|
289
298
|
/**
|
|
290
|
-
* Build prompt inputs by consolidating user request context and guideline content.
|
|
299
|
+
* Load eval cases from a AgentV YAML specification file.
|
|
291
300
|
*/
|
|
292
|
-
interface PromptInputs {
|
|
293
|
-
readonly question: string;
|
|
294
|
-
readonly guidelines: string;
|
|
295
|
-
readonly chatPrompt?: ChatPrompt;
|
|
296
|
-
readonly systemMessage?: string;
|
|
297
|
-
}
|
|
298
|
-
declare function buildPromptInputs(testCase: EvalCase): Promise<PromptInputs>;
|
|
301
|
+
declare function loadEvalCases(evalFilePath: string, repoRoot: URL | string, options?: LoadOptions): Promise<readonly EvalCase[]>;
|
|
299
302
|
|
|
300
303
|
declare function fileExists(filePath: string): Promise<boolean>;
|
|
301
304
|
/**
|
|
@@ -500,7 +503,7 @@ interface EvaluationContext {
|
|
|
500
503
|
};
|
|
501
504
|
readonly now: Date;
|
|
502
505
|
readonly judgeProvider?: Provider;
|
|
503
|
-
readonly
|
|
506
|
+
readonly evaluatorTemplateOverride?: string;
|
|
504
507
|
readonly evaluator?: EvaluatorConfig;
|
|
505
508
|
}
|
|
506
509
|
interface EvaluationScore {
|
|
@@ -521,14 +524,14 @@ interface LlmJudgeEvaluatorOptions {
|
|
|
521
524
|
readonly resolveJudgeProvider: JudgeProviderResolver;
|
|
522
525
|
readonly maxOutputTokens?: number;
|
|
523
526
|
readonly temperature?: number;
|
|
524
|
-
readonly
|
|
527
|
+
readonly evaluatorTemplate?: string;
|
|
525
528
|
}
|
|
526
529
|
declare class LlmJudgeEvaluator implements Evaluator {
|
|
527
530
|
readonly kind = "llm_judge";
|
|
528
531
|
private readonly resolveJudgeProvider;
|
|
529
532
|
private readonly maxOutputTokens?;
|
|
530
533
|
private readonly temperature?;
|
|
531
|
-
private readonly
|
|
534
|
+
private readonly evaluatorTemplate?;
|
|
532
535
|
constructor(options: LlmJudgeEvaluatorOptions);
|
|
533
536
|
evaluate(context: EvaluationContext): Promise<EvaluationScore>;
|
|
534
537
|
private evaluateWithPrompt;
|
package/dist/index.d.ts
CHANGED
|
@@ -117,6 +117,7 @@ interface EvalCase {
|
|
|
117
117
|
* Evaluator scorecard for a single eval case run.
|
|
118
118
|
*/
|
|
119
119
|
interface EvaluationResult {
|
|
120
|
+
readonly timestamp: string;
|
|
120
121
|
readonly eval_id: string;
|
|
121
122
|
readonly dataset?: string;
|
|
122
123
|
readonly conversation_id?: string;
|
|
@@ -124,14 +125,12 @@ interface EvaluationResult {
|
|
|
124
125
|
readonly hits: readonly string[];
|
|
125
126
|
readonly misses: readonly string[];
|
|
126
127
|
readonly candidate_answer: string;
|
|
127
|
-
readonly expected_aspect_count: number;
|
|
128
128
|
readonly target: string;
|
|
129
|
-
readonly timestamp: string;
|
|
130
129
|
readonly reasoning?: string;
|
|
131
130
|
readonly raw_aspects?: readonly string[];
|
|
132
131
|
readonly agent_provider_request?: JsonObject;
|
|
133
132
|
readonly lm_provider_request?: JsonObject;
|
|
134
|
-
readonly
|
|
133
|
+
readonly evaluator_provider_request?: JsonObject;
|
|
135
134
|
readonly evaluator_results?: readonly EvaluatorResult[];
|
|
136
135
|
readonly error?: string;
|
|
137
136
|
}
|
|
@@ -143,7 +142,7 @@ interface EvaluatorResult {
|
|
|
143
142
|
readonly misses: readonly string[];
|
|
144
143
|
readonly reasoning?: string;
|
|
145
144
|
readonly raw_request?: JsonObject;
|
|
146
|
-
readonly
|
|
145
|
+
readonly evaluator_provider_request?: JsonObject;
|
|
147
146
|
}
|
|
148
147
|
/**
|
|
149
148
|
* Convenience accessor matching the Python hit_count property.
|
|
@@ -154,6 +153,7 @@ type ChatPrompt = AxChatRequest["chatPrompt"];
|
|
|
154
153
|
type ProviderKind = "azure" | "anthropic" | "gemini" | "codex" | "cli" | "mock" | "vscode" | "vscode-insiders";
|
|
155
154
|
interface ProviderRequest {
|
|
156
155
|
readonly question: string;
|
|
156
|
+
readonly systemPrompt?: string;
|
|
157
157
|
readonly guidelines?: string;
|
|
158
158
|
readonly guideline_patterns?: readonly string[];
|
|
159
159
|
readonly chatPrompt?: ChatPrompt;
|
|
@@ -264,38 +264,41 @@ interface TargetDefinition {
|
|
|
264
264
|
}
|
|
265
265
|
|
|
266
266
|
/**
|
|
267
|
-
* Read metadata from a test suite file (like target name).
|
|
268
|
-
* This is a convenience function for CLI tools that need metadata without loading all eval cases.
|
|
269
|
-
*/
|
|
270
|
-
declare function readTestSuiteMetadata(testFilePath: string): Promise<{
|
|
271
|
-
target?: string;
|
|
272
|
-
}>;
|
|
273
|
-
/**
|
|
274
|
-
* Determine whether a path references guideline content (instructions or prompts).
|
|
267
|
+
* Build prompt inputs by consolidating user request context and guideline content.
|
|
275
268
|
*/
|
|
276
|
-
declare function isGuidelineFile(filePath: string, patterns?: readonly string[]): boolean;
|
|
269
|
+
interface PromptInputs {
|
|
270
|
+
readonly question: string;
|
|
271
|
+
readonly guidelines: string;
|
|
272
|
+
readonly chatPrompt?: ChatPrompt;
|
|
273
|
+
readonly systemMessage?: string;
|
|
274
|
+
}
|
|
275
|
+
declare function buildPromptInputs(testCase: EvalCase): Promise<PromptInputs>;
|
|
276
|
+
|
|
277
277
|
/**
|
|
278
278
|
* Extract fenced code blocks from AgentV user segments.
|
|
279
279
|
*/
|
|
280
280
|
declare function extractCodeBlocks(segments: readonly JsonObject[]): readonly string[];
|
|
281
|
+
|
|
282
|
+
/**
|
|
283
|
+
* Determine whether a path references guideline content (instructions or prompts).
|
|
284
|
+
*/
|
|
285
|
+
declare function isGuidelineFile(filePath: string, patterns?: readonly string[]): boolean;
|
|
286
|
+
|
|
281
287
|
type LoadOptions = {
|
|
282
288
|
readonly verbose?: boolean;
|
|
283
289
|
readonly evalId?: string;
|
|
284
290
|
};
|
|
285
291
|
/**
|
|
286
|
-
* Load eval cases from a AgentV YAML specification file.
|
|
292
|
+
* Read metadata from a test suite file (like target name).
|
|
293
|
+
* This is a convenience function for CLI tools that need metadata without loading all eval cases.
|
|
287
294
|
*/
|
|
288
|
-
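declare function loadEvalCases(evalFilePath: string, repoRoot: URL | string, options?: LoadOptions): Promise<readonly EvalCase[]>;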
|
|
295
|
+
declare function readTestSuiteMetadata(testFilePath: string): Promise<{
|
|
296
|
+
target?: string;
|
|
297
|
+
}>;
|
|
289
298
|
/**
|
|
290
|
-
* Build prompt inputs by consolidating user request context and guideline content.
|
|
299
|
+
* Load eval cases from a AgentV YAML specification file.
|
|
291
300
|
*/
|
|
292
|
-
interface PromptInputs {
|
|
293
|
-
readonly question: string;
|
|
294
|
-
readonly guidelines: string;
|
|
295
|
-
readonly chatPrompt?: ChatPrompt;
|
|
296
|
-
readonly systemMessage?: string;
|
|
297
|
-
}
|
|
298
|
-
declare function buildPromptInputs(testCase: EvalCase): Promise<PromptInputs>;
|
|
301
|
+
declare function loadEvalCases(evalFilePath: string, repoRoot: URL | string, options?: LoadOptions): Promise<readonly EvalCase[]>;
|
|
299
302
|
|
|
300
303
|
declare function fileExists(filePath: string): Promise<boolean>;
|
|
301
304
|
/**
|
|
@@ -500,7 +503,7 @@ interface EvaluationContext {
|
|
|
500
503
|
};
|
|
501
504
|
readonly now: Date;
|
|
502
505
|
readonly judgeProvider?: Provider;
|
|
503
|
-
readonly
|
|
506
|
+
readonly evaluatorTemplateOverride?: string;
|
|
504
507
|
readonly evaluator?: EvaluatorConfig;
|
|
505
508
|
}
|
|
506
509
|
interface EvaluationScore {
|
|
@@ -521,14 +524,14 @@ interface LlmJudgeEvaluatorOptions {
|
|
|
521
524
|
readonly resolveJudgeProvider: JudgeProviderResolver;
|
|
522
525
|
readonly maxOutputTokens?: number;
|
|
523
526
|
readonly temperature?: number;
|
|
524
|
-
readonly
|
|
527
|
+
readonly evaluatorTemplate?: string;
|
|
525
528
|
}
|
|
526
529
|
declare class LlmJudgeEvaluator implements Evaluator {
|
|
527
530
|
readonly kind = "llm_judge";
|
|
528
531
|
private readonly resolveJudgeProvider;
|
|
529
532
|
private readonly maxOutputTokens?;
|
|
530
533
|
private readonly temperature?;
|
|
531
|
-
private readonly
|
|
534
|
+
private readonly evaluatorTemplate?;
|
|
532
535
|
constructor(options: LlmJudgeEvaluatorOptions);
|
|
533
536
|
evaluate(context: EvaluationContext): Promise<EvaluationScore>;
|
|
534
537
|
private evaluateWithPrompt;
|