@agentv/core 0.11.0 → 0.14.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -2
- package/dist/{chunk-YQBJAT5I.js → chunk-IOCVST3R.js} +1 -1
- package/dist/chunk-IOCVST3R.js.map +1 -0
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +912 -747
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +46 -34
- package/dist/index.d.ts +46 -34
- package/dist/index.js +875 -708
- package/dist/index.js.map +1 -1
- package/package.json +5 -2
- package/dist/chunk-YQBJAT5I.js.map +0 -1
package/dist/index.d.cts
CHANGED
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
import { AxChatRequest, AxAI } from '@ax-llm/ax';
|
|
2
|
-
|
|
3
1
|
/**
|
|
4
2
|
* JSON primitive values appearing in AgentV payloads.
|
|
5
3
|
*/
|
|
@@ -117,6 +115,7 @@ interface EvalCase {
|
|
|
117
115
|
* Evaluator scorecard for a single eval case run.
|
|
118
116
|
*/
|
|
119
117
|
interface EvaluationResult {
|
|
118
|
+
readonly timestamp: string;
|
|
120
119
|
readonly eval_id: string;
|
|
121
120
|
readonly dataset?: string;
|
|
122
121
|
readonly conversation_id?: string;
|
|
@@ -124,14 +123,12 @@ interface EvaluationResult {
|
|
|
124
123
|
readonly hits: readonly string[];
|
|
125
124
|
readonly misses: readonly string[];
|
|
126
125
|
readonly candidate_answer: string;
|
|
127
|
-
readonly expected_aspect_count: number;
|
|
128
126
|
readonly target: string;
|
|
129
|
-
readonly timestamp: string;
|
|
130
127
|
readonly reasoning?: string;
|
|
131
128
|
readonly raw_aspects?: readonly string[];
|
|
132
129
|
readonly agent_provider_request?: JsonObject;
|
|
133
130
|
readonly lm_provider_request?: JsonObject;
|
|
134
|
-
readonly
|
|
131
|
+
readonly evaluator_provider_request?: JsonObject;
|
|
135
132
|
readonly evaluator_results?: readonly EvaluatorResult[];
|
|
136
133
|
readonly error?: string;
|
|
137
134
|
}
|
|
@@ -143,17 +140,24 @@ interface EvaluatorResult {
|
|
|
143
140
|
readonly misses: readonly string[];
|
|
144
141
|
readonly reasoning?: string;
|
|
145
142
|
readonly raw_request?: JsonObject;
|
|
146
|
-
readonly
|
|
143
|
+
readonly evaluator_provider_request?: JsonObject;
|
|
147
144
|
}
|
|
148
145
|
/**
|
|
149
146
|
* Convenience accessor matching the Python hit_count property.
|
|
150
147
|
*/
|
|
151
148
|
declare function getHitCount(result: Pick<EvaluationResult, "hits">): number;
|
|
152
149
|
|
|
153
|
-
type
|
|
150
|
+
type ChatMessageRole = "system" | "user" | "assistant" | "tool" | "function";
|
|
151
|
+
interface ChatMessage {
|
|
152
|
+
readonly role: ChatMessageRole;
|
|
153
|
+
readonly content: string;
|
|
154
|
+
readonly name?: string;
|
|
155
|
+
}
|
|
156
|
+
type ChatPrompt = readonly ChatMessage[];
|
|
154
157
|
type ProviderKind = "azure" | "anthropic" | "gemini" | "codex" | "cli" | "mock" | "vscode" | "vscode-insiders";
|
|
155
158
|
interface ProviderRequest {
|
|
156
159
|
readonly question: string;
|
|
160
|
+
readonly systemPrompt?: string;
|
|
157
161
|
readonly guidelines?: string;
|
|
158
162
|
readonly guideline_patterns?: readonly string[];
|
|
159
163
|
readonly chatPrompt?: ChatPrompt;
|
|
@@ -185,11 +189,6 @@ interface Provider {
|
|
|
185
189
|
* the orchestrator may send multiple requests in a single provider session.
|
|
186
190
|
*/
|
|
187
191
|
invokeBatch?(requests: readonly ProviderRequest[]): Promise<readonly ProviderResponse[]>;
|
|
188
|
-
/**
|
|
189
|
-
* Optional access to the underlying AxAI instance.
|
|
190
|
-
* This enables using advanced Ax features like structured output signatures.
|
|
191
|
-
*/
|
|
192
|
-
getAxAI?(): AxAI;
|
|
193
192
|
}
|
|
194
193
|
type EnvLookup = Readonly<Record<string, string | undefined>>;
|
|
195
194
|
interface TargetDefinition {
|
|
@@ -264,38 +263,41 @@ interface TargetDefinition {
|
|
|
264
263
|
}
|
|
265
264
|
|
|
266
265
|
/**
|
|
267
|
-
*
|
|
268
|
-
* This is a convenience function for CLI tools that need metadata without loading all eval cases.
|
|
269
|
-
*/
|
|
270
|
-
declare function readTestSuiteMetadata(testFilePath: string): Promise<{
|
|
271
|
-
target?: string;
|
|
272
|
-
}>;
|
|
273
|
-
/**
|
|
274
|
-
* Determine whether a path references guideline content (instructions or prompts).
|
|
266
|
+
* Build prompt inputs by consolidating user request context and guideline content.
|
|
275
267
|
*/
|
|
276
|
-
|
|
268
|
+
interface PromptInputs {
|
|
269
|
+
readonly question: string;
|
|
270
|
+
readonly guidelines: string;
|
|
271
|
+
readonly chatPrompt?: ChatPrompt;
|
|
272
|
+
readonly systemMessage?: string;
|
|
273
|
+
}
|
|
274
|
+
declare function buildPromptInputs(testCase: EvalCase): Promise<PromptInputs>;
|
|
275
|
+
|
|
277
276
|
/**
|
|
278
277
|
* Extract fenced code blocks from AgentV user segments.
|
|
279
278
|
*/
|
|
280
279
|
declare function extractCodeBlocks(segments: readonly JsonObject[]): readonly string[];
|
|
280
|
+
|
|
281
|
+
/**
|
|
282
|
+
* Determine whether a path references guideline content (instructions or prompts).
|
|
283
|
+
*/
|
|
284
|
+
declare function isGuidelineFile(filePath: string, patterns?: readonly string[]): boolean;
|
|
285
|
+
|
|
281
286
|
type LoadOptions = {
|
|
282
287
|
readonly verbose?: boolean;
|
|
283
288
|
readonly evalId?: string;
|
|
284
289
|
};
|
|
285
290
|
/**
|
|
286
|
-
*
|
|
291
|
+
* Read metadata from a test suite file (like target name).
|
|
292
|
+
* This is a convenience function for CLI tools that need metadata without loading all eval cases.
|
|
287
293
|
*/
|
|
288
|
-
declare function
|
|
294
|
+
declare function readTestSuiteMetadata(testFilePath: string): Promise<{
|
|
295
|
+
target?: string;
|
|
296
|
+
}>;
|
|
289
297
|
/**
|
|
290
|
-
*
|
|
298
|
+
* Load eval cases from an AgentV YAML specification file.
|
|
291
299
|
*/
|
|
292
|
-
|
|
293
|
-
readonly question: string;
|
|
294
|
-
readonly guidelines: string;
|
|
295
|
-
readonly chatPrompt?: ChatPrompt;
|
|
296
|
-
readonly systemMessage?: string;
|
|
297
|
-
}
|
|
298
|
-
declare function buildPromptInputs(testCase: EvalCase): Promise<PromptInputs>;
|
|
300
|
+
declare function loadEvalCases(evalFilePath: string, repoRoot: URL | string, options?: LoadOptions): Promise<readonly EvalCase[]>;
|
|
299
301
|
|
|
300
302
|
declare function fileExists(filePath: string): Promise<boolean>;
|
|
301
303
|
/**
|
|
@@ -338,6 +340,9 @@ interface RetryConfig {
|
|
|
338
340
|
readonly backoffFactor?: number;
|
|
339
341
|
readonly retryableStatusCodes?: readonly number[];
|
|
340
342
|
}
|
|
343
|
+
/**
|
|
344
|
+
* Azure OpenAI settings used by the Vercel AI SDK.
|
|
345
|
+
*/
|
|
341
346
|
interface AzureResolvedConfig {
|
|
342
347
|
readonly resourceName: string;
|
|
343
348
|
readonly deploymentName: string;
|
|
@@ -347,6 +352,9 @@ interface AzureResolvedConfig {
|
|
|
347
352
|
readonly maxOutputTokens?: number;
|
|
348
353
|
readonly retry?: RetryConfig;
|
|
349
354
|
}
|
|
355
|
+
/**
|
|
356
|
+
* Anthropic Claude settings used by the Vercel AI SDK.
|
|
357
|
+
*/
|
|
350
358
|
interface AnthropicResolvedConfig {
|
|
351
359
|
readonly apiKey: string;
|
|
352
360
|
readonly model: string;
|
|
@@ -355,6 +363,9 @@ interface AnthropicResolvedConfig {
|
|
|
355
363
|
readonly thinkingBudget?: number;
|
|
356
364
|
readonly retry?: RetryConfig;
|
|
357
365
|
}
|
|
366
|
+
/**
|
|
367
|
+
* Google Gemini settings used by the Vercel AI SDK.
|
|
368
|
+
*/
|
|
358
369
|
interface GeminiResolvedConfig {
|
|
359
370
|
readonly apiKey: string;
|
|
360
371
|
readonly model: string;
|
|
@@ -399,6 +410,7 @@ interface CliResolvedConfig {
|
|
|
399
410
|
readonly cwd?: string;
|
|
400
411
|
readonly timeoutMs?: number;
|
|
401
412
|
readonly healthcheck?: CliHealthcheck;
|
|
413
|
+
readonly verbose?: boolean;
|
|
402
414
|
}
|
|
403
415
|
type ResolvedTarget = {
|
|
404
416
|
readonly kind: "azure";
|
|
@@ -500,7 +512,7 @@ interface EvaluationContext {
|
|
|
500
512
|
};
|
|
501
513
|
readonly now: Date;
|
|
502
514
|
readonly judgeProvider?: Provider;
|
|
503
|
-
readonly
|
|
515
|
+
readonly evaluatorTemplateOverride?: string;
|
|
504
516
|
readonly evaluator?: EvaluatorConfig;
|
|
505
517
|
}
|
|
506
518
|
interface EvaluationScore {
|
|
@@ -521,14 +533,14 @@ interface LlmJudgeEvaluatorOptions {
|
|
|
521
533
|
readonly resolveJudgeProvider: JudgeProviderResolver;
|
|
522
534
|
readonly maxOutputTokens?: number;
|
|
523
535
|
readonly temperature?: number;
|
|
524
|
-
readonly
|
|
536
|
+
readonly evaluatorTemplate?: string;
|
|
525
537
|
}
|
|
526
538
|
declare class LlmJudgeEvaluator implements Evaluator {
|
|
527
539
|
readonly kind = "llm_judge";
|
|
528
540
|
private readonly resolveJudgeProvider;
|
|
529
541
|
private readonly maxOutputTokens?;
|
|
530
542
|
private readonly temperature?;
|
|
531
|
-
private readonly
|
|
543
|
+
private readonly evaluatorTemplate?;
|
|
532
544
|
constructor(options: LlmJudgeEvaluatorOptions);
|
|
533
545
|
evaluate(context: EvaluationContext): Promise<EvaluationScore>;
|
|
534
546
|
private evaluateWithPrompt;
|
package/dist/index.d.ts
CHANGED
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
import { AxChatRequest, AxAI } from '@ax-llm/ax';
|
|
2
|
-
|
|
3
1
|
/**
|
|
4
2
|
* JSON primitive values appearing in AgentV payloads.
|
|
5
3
|
*/
|
|
@@ -117,6 +115,7 @@ interface EvalCase {
|
|
|
117
115
|
* Evaluator scorecard for a single eval case run.
|
|
118
116
|
*/
|
|
119
117
|
interface EvaluationResult {
|
|
118
|
+
readonly timestamp: string;
|
|
120
119
|
readonly eval_id: string;
|
|
121
120
|
readonly dataset?: string;
|
|
122
121
|
readonly conversation_id?: string;
|
|
@@ -124,14 +123,12 @@ interface EvaluationResult {
|
|
|
124
123
|
readonly hits: readonly string[];
|
|
125
124
|
readonly misses: readonly string[];
|
|
126
125
|
readonly candidate_answer: string;
|
|
127
|
-
readonly expected_aspect_count: number;
|
|
128
126
|
readonly target: string;
|
|
129
|
-
readonly timestamp: string;
|
|
130
127
|
readonly reasoning?: string;
|
|
131
128
|
readonly raw_aspects?: readonly string[];
|
|
132
129
|
readonly agent_provider_request?: JsonObject;
|
|
133
130
|
readonly lm_provider_request?: JsonObject;
|
|
134
|
-
readonly
|
|
131
|
+
readonly evaluator_provider_request?: JsonObject;
|
|
135
132
|
readonly evaluator_results?: readonly EvaluatorResult[];
|
|
136
133
|
readonly error?: string;
|
|
137
134
|
}
|
|
@@ -143,17 +140,24 @@ interface EvaluatorResult {
|
|
|
143
140
|
readonly misses: readonly string[];
|
|
144
141
|
readonly reasoning?: string;
|
|
145
142
|
readonly raw_request?: JsonObject;
|
|
146
|
-
readonly
|
|
143
|
+
readonly evaluator_provider_request?: JsonObject;
|
|
147
144
|
}
|
|
148
145
|
/**
|
|
149
146
|
* Convenience accessor matching the Python hit_count property.
|
|
150
147
|
*/
|
|
151
148
|
declare function getHitCount(result: Pick<EvaluationResult, "hits">): number;
|
|
152
149
|
|
|
153
|
-
type
|
|
150
|
+
type ChatMessageRole = "system" | "user" | "assistant" | "tool" | "function";
|
|
151
|
+
interface ChatMessage {
|
|
152
|
+
readonly role: ChatMessageRole;
|
|
153
|
+
readonly content: string;
|
|
154
|
+
readonly name?: string;
|
|
155
|
+
}
|
|
156
|
+
type ChatPrompt = readonly ChatMessage[];
|
|
154
157
|
type ProviderKind = "azure" | "anthropic" | "gemini" | "codex" | "cli" | "mock" | "vscode" | "vscode-insiders";
|
|
155
158
|
interface ProviderRequest {
|
|
156
159
|
readonly question: string;
|
|
160
|
+
readonly systemPrompt?: string;
|
|
157
161
|
readonly guidelines?: string;
|
|
158
162
|
readonly guideline_patterns?: readonly string[];
|
|
159
163
|
readonly chatPrompt?: ChatPrompt;
|
|
@@ -185,11 +189,6 @@ interface Provider {
|
|
|
185
189
|
* the orchestrator may send multiple requests in a single provider session.
|
|
186
190
|
*/
|
|
187
191
|
invokeBatch?(requests: readonly ProviderRequest[]): Promise<readonly ProviderResponse[]>;
|
|
188
|
-
/**
|
|
189
|
-
* Optional access to the underlying AxAI instance.
|
|
190
|
-
* This enables using advanced Ax features like structured output signatures.
|
|
191
|
-
*/
|
|
192
|
-
getAxAI?(): AxAI;
|
|
193
192
|
}
|
|
194
193
|
type EnvLookup = Readonly<Record<string, string | undefined>>;
|
|
195
194
|
interface TargetDefinition {
|
|
@@ -264,38 +263,41 @@ interface TargetDefinition {
|
|
|
264
263
|
}
|
|
265
264
|
|
|
266
265
|
/**
|
|
267
|
-
*
|
|
268
|
-
* This is a convenience function for CLI tools that need metadata without loading all eval cases.
|
|
269
|
-
*/
|
|
270
|
-
declare function readTestSuiteMetadata(testFilePath: string): Promise<{
|
|
271
|
-
target?: string;
|
|
272
|
-
}>;
|
|
273
|
-
/**
|
|
274
|
-
* Determine whether a path references guideline content (instructions or prompts).
|
|
266
|
+
* Build prompt inputs by consolidating user request context and guideline content.
|
|
275
267
|
*/
|
|
276
|
-
|
|
268
|
+
interface PromptInputs {
|
|
269
|
+
readonly question: string;
|
|
270
|
+
readonly guidelines: string;
|
|
271
|
+
readonly chatPrompt?: ChatPrompt;
|
|
272
|
+
readonly systemMessage?: string;
|
|
273
|
+
}
|
|
274
|
+
declare function buildPromptInputs(testCase: EvalCase): Promise<PromptInputs>;
|
|
275
|
+
|
|
277
276
|
/**
|
|
278
277
|
* Extract fenced code blocks from AgentV user segments.
|
|
279
278
|
*/
|
|
280
279
|
declare function extractCodeBlocks(segments: readonly JsonObject[]): readonly string[];
|
|
280
|
+
|
|
281
|
+
/**
|
|
282
|
+
* Determine whether a path references guideline content (instructions or prompts).
|
|
283
|
+
*/
|
|
284
|
+
declare function isGuidelineFile(filePath: string, patterns?: readonly string[]): boolean;
|
|
285
|
+
|
|
281
286
|
type LoadOptions = {
|
|
282
287
|
readonly verbose?: boolean;
|
|
283
288
|
readonly evalId?: string;
|
|
284
289
|
};
|
|
285
290
|
/**
|
|
286
|
-
*
|
|
291
|
+
* Read metadata from a test suite file (like target name).
|
|
292
|
+
* This is a convenience function for CLI tools that need metadata without loading all eval cases.
|
|
287
293
|
*/
|
|
288
|
-
declare function
|
|
294
|
+
declare function readTestSuiteMetadata(testFilePath: string): Promise<{
|
|
295
|
+
target?: string;
|
|
296
|
+
}>;
|
|
289
297
|
/**
|
|
290
|
-
*
|
|
298
|
+
* Load eval cases from an AgentV YAML specification file.
|
|
291
299
|
*/
|
|
292
|
-
|
|
293
|
-
readonly question: string;
|
|
294
|
-
readonly guidelines: string;
|
|
295
|
-
readonly chatPrompt?: ChatPrompt;
|
|
296
|
-
readonly systemMessage?: string;
|
|
297
|
-
}
|
|
298
|
-
declare function buildPromptInputs(testCase: EvalCase): Promise<PromptInputs>;
|
|
300
|
+
declare function loadEvalCases(evalFilePath: string, repoRoot: URL | string, options?: LoadOptions): Promise<readonly EvalCase[]>;
|
|
299
301
|
|
|
300
302
|
declare function fileExists(filePath: string): Promise<boolean>;
|
|
301
303
|
/**
|
|
@@ -338,6 +340,9 @@ interface RetryConfig {
|
|
|
338
340
|
readonly backoffFactor?: number;
|
|
339
341
|
readonly retryableStatusCodes?: readonly number[];
|
|
340
342
|
}
|
|
343
|
+
/**
|
|
344
|
+
* Azure OpenAI settings used by the Vercel AI SDK.
|
|
345
|
+
*/
|
|
341
346
|
interface AzureResolvedConfig {
|
|
342
347
|
readonly resourceName: string;
|
|
343
348
|
readonly deploymentName: string;
|
|
@@ -347,6 +352,9 @@ interface AzureResolvedConfig {
|
|
|
347
352
|
readonly maxOutputTokens?: number;
|
|
348
353
|
readonly retry?: RetryConfig;
|
|
349
354
|
}
|
|
355
|
+
/**
|
|
356
|
+
* Anthropic Claude settings used by the Vercel AI SDK.
|
|
357
|
+
*/
|
|
350
358
|
interface AnthropicResolvedConfig {
|
|
351
359
|
readonly apiKey: string;
|
|
352
360
|
readonly model: string;
|
|
@@ -355,6 +363,9 @@ interface AnthropicResolvedConfig {
|
|
|
355
363
|
readonly thinkingBudget?: number;
|
|
356
364
|
readonly retry?: RetryConfig;
|
|
357
365
|
}
|
|
366
|
+
/**
|
|
367
|
+
* Google Gemini settings used by the Vercel AI SDK.
|
|
368
|
+
*/
|
|
358
369
|
interface GeminiResolvedConfig {
|
|
359
370
|
readonly apiKey: string;
|
|
360
371
|
readonly model: string;
|
|
@@ -399,6 +410,7 @@ interface CliResolvedConfig {
|
|
|
399
410
|
readonly cwd?: string;
|
|
400
411
|
readonly timeoutMs?: number;
|
|
401
412
|
readonly healthcheck?: CliHealthcheck;
|
|
413
|
+
readonly verbose?: boolean;
|
|
402
414
|
}
|
|
403
415
|
type ResolvedTarget = {
|
|
404
416
|
readonly kind: "azure";
|
|
@@ -500,7 +512,7 @@ interface EvaluationContext {
|
|
|
500
512
|
};
|
|
501
513
|
readonly now: Date;
|
|
502
514
|
readonly judgeProvider?: Provider;
|
|
503
|
-
readonly
|
|
515
|
+
readonly evaluatorTemplateOverride?: string;
|
|
504
516
|
readonly evaluator?: EvaluatorConfig;
|
|
505
517
|
}
|
|
506
518
|
interface EvaluationScore {
|
|
@@ -521,14 +533,14 @@ interface LlmJudgeEvaluatorOptions {
|
|
|
521
533
|
readonly resolveJudgeProvider: JudgeProviderResolver;
|
|
522
534
|
readonly maxOutputTokens?: number;
|
|
523
535
|
readonly temperature?: number;
|
|
524
|
-
readonly
|
|
536
|
+
readonly evaluatorTemplate?: string;
|
|
525
537
|
}
|
|
526
538
|
declare class LlmJudgeEvaluator implements Evaluator {
|
|
527
539
|
readonly kind = "llm_judge";
|
|
528
540
|
private readonly resolveJudgeProvider;
|
|
529
541
|
private readonly maxOutputTokens?;
|
|
530
542
|
private readonly temperature?;
|
|
531
|
-
private readonly
|
|
543
|
+
private readonly evaluatorTemplate?;
|
|
532
544
|
constructor(options: LlmJudgeEvaluatorOptions);
|
|
533
545
|
evaluate(context: EvaluationContext): Promise<EvaluationScore>;
|
|
534
546
|
private evaluateWithPrompt;
|