@agentv/core 0.20.0 → 0.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-SVY324GN.js → chunk-BO7KG7JX.js} +1 -1
- package/dist/{chunk-SVY324GN.js.map → chunk-BO7KG7JX.js.map} +1 -1
- package/dist/evaluation/validation/index.cjs +4 -4
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +5 -5
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +345 -25
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +49 -3
- package/dist/index.d.ts +49 -3
- package/dist/index.js +344 -26
- package/dist/index.js.map +1 -1
- package/package.json +2 -2
package/dist/index.d.cts
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
import * as ai from 'ai';
|
|
2
|
+
|
|
1
3
|
/**
|
|
2
4
|
* JSON primitive values appearing in AgentV payloads.
|
|
3
5
|
*/
|
|
@@ -73,7 +75,7 @@ declare function isJsonValue(value: unknown): value is JsonValue;
|
|
|
73
75
|
* Guard validating raw test messages.
|
|
74
76
|
*/
|
|
75
77
|
declare function isTestMessage(value: unknown): value is TestMessage;
|
|
76
|
-
declare const EVALUATOR_KIND_VALUES: readonly ["code", "llm_judge"];
|
|
78
|
+
declare const EVALUATOR_KIND_VALUES: readonly ["code", "llm_judge", "rubric"];
|
|
77
79
|
type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
|
|
78
80
|
declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
|
|
79
81
|
type CodeEvaluatorConfig = {
|
|
@@ -90,7 +92,18 @@ type LlmJudgeEvaluatorConfig = {
|
|
|
90
92
|
readonly prompt?: string;
|
|
91
93
|
readonly promptPath?: string;
|
|
92
94
|
};
|
|
93
|
-
type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig;
|
|
95
|
+
type RubricItem = {
|
|
96
|
+
readonly id: string;
|
|
97
|
+
readonly description: string;
|
|
98
|
+
readonly weight: number;
|
|
99
|
+
readonly required: boolean;
|
|
100
|
+
};
|
|
101
|
+
type RubricEvaluatorConfig = {
|
|
102
|
+
readonly name: string;
|
|
103
|
+
readonly type: 'rubric';
|
|
104
|
+
readonly rubrics: readonly RubricItem[];
|
|
105
|
+
};
|
|
106
|
+
type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig | RubricEvaluatorConfig;
|
|
94
107
|
/**
|
|
95
108
|
* Eval case definition sourced from AgentV specs.
|
|
96
109
|
*/
|
|
@@ -132,10 +145,12 @@ interface EvaluationResult {
|
|
|
132
145
|
readonly evaluator_results?: readonly EvaluatorResult[];
|
|
133
146
|
readonly error?: string;
|
|
134
147
|
}
|
|
148
|
+
type EvaluationVerdict = 'pass' | 'fail' | 'borderline';
|
|
135
149
|
interface EvaluatorResult {
|
|
136
150
|
readonly name: string;
|
|
137
151
|
readonly type: EvaluatorKind;
|
|
138
152
|
readonly score: number;
|
|
153
|
+
readonly verdict?: EvaluationVerdict;
|
|
139
154
|
readonly hits: readonly string[];
|
|
140
155
|
readonly misses: readonly string[];
|
|
141
156
|
readonly reasoning?: string;
|
|
@@ -189,6 +204,11 @@ interface Provider {
|
|
|
189
204
|
* the orchestrator may send multiple requests in a single provider session.
|
|
190
205
|
*/
|
|
191
206
|
invokeBatch?(requests: readonly ProviderRequest[]): Promise<readonly ProviderResponse[]>;
|
|
207
|
+
/**
|
|
208
|
+
* Optional method to get a Vercel AI SDK LanguageModel instance for structured output generation.
|
|
209
|
+
* Used by evaluators that need generateObject/generateText from the AI SDK.
|
|
210
|
+
*/
|
|
211
|
+
asLanguageModel?(): ai.LanguageModel;
|
|
192
212
|
}
|
|
193
213
|
type EnvLookup = Readonly<Record<string, string | undefined>>;
|
|
194
214
|
interface TargetDefinition {
|
|
@@ -510,6 +530,20 @@ declare function subscribeToCodexLogEntries(listener: CodexLogListener): () => v
|
|
|
510
530
|
declare function createProvider(target: ResolvedTarget): Provider;
|
|
511
531
|
declare function resolveAndCreateProvider(definition: TargetDefinition, env?: EnvLookup): Provider;
|
|
512
532
|
|
|
533
|
+
interface RubricEvaluatorOptions {
|
|
534
|
+
readonly config: RubricEvaluatorConfig;
|
|
535
|
+
readonly resolveJudgeProvider: (context: EvaluationContext) => Promise<Provider | undefined>;
|
|
536
|
+
}
|
|
537
|
+
declare class RubricEvaluator implements Evaluator {
|
|
538
|
+
readonly kind = "rubric";
|
|
539
|
+
private readonly config;
|
|
540
|
+
private readonly resolveJudgeProvider;
|
|
541
|
+
constructor(options: RubricEvaluatorOptions);
|
|
542
|
+
evaluate(context: EvaluationContext): Promise<EvaluationScore>;
|
|
543
|
+
private buildPrompt;
|
|
544
|
+
private calculateScore;
|
|
545
|
+
}
|
|
546
|
+
|
|
513
547
|
interface EvaluationContext {
|
|
514
548
|
readonly evalCase: EvalCase;
|
|
515
549
|
readonly candidate: string;
|
|
@@ -529,6 +563,7 @@ interface EvaluationContext {
|
|
|
529
563
|
}
|
|
530
564
|
interface EvaluationScore {
|
|
531
565
|
readonly score: number;
|
|
566
|
+
readonly verdict?: EvaluationVerdict;
|
|
532
567
|
readonly hits: readonly string[];
|
|
533
568
|
readonly misses: readonly string[];
|
|
534
569
|
readonly expectedAspectCount: number;
|
|
@@ -624,9 +659,20 @@ interface RunEvaluationOptions {
|
|
|
624
659
|
declare function runEvaluation(options: RunEvaluationOptions): Promise<readonly EvaluationResult[]>;
|
|
625
660
|
declare function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationResult>;
|
|
626
661
|
|
|
662
|
+
interface GenerateRubricsOptions {
|
|
663
|
+
readonly expectedOutcome: string;
|
|
664
|
+
readonly question?: string;
|
|
665
|
+
readonly referenceAnswer?: string;
|
|
666
|
+
readonly provider: Provider;
|
|
667
|
+
}
|
|
668
|
+
/**
|
|
669
|
+
* Generate rubrics from expected outcome using an LLM.
|
|
670
|
+
*/
|
|
671
|
+
declare function generateRubrics(options: GenerateRubricsOptions): Promise<readonly RubricItem[]>;
|
|
672
|
+
|
|
627
673
|
type AgentKernel = {
|
|
628
674
|
status: string;
|
|
629
675
|
};
|
|
630
676
|
declare function createAgentKernel(): AgentKernel;
|
|
631
677
|
|
|
632
|
-
export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type Evaluator, type EvaluatorConfig, type EvaluatorKind, type EvaluatorResult, type GeminiResolvedConfig, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ResolvedTarget, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type ToolTestMessage, type UserTestMessage, type VSCodeResolvedConfig, buildDirectoryChain, buildPromptInputs, buildSearchRoots, consumeCodexLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, extractCodeBlocks, fileExists, findGitRoot, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, normalizeLineEndings, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToCodexLogEntries };
|
|
678
|
+
export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorKind, type EvaluatorResult, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ResolvedTarget, RubricEvaluator, type RubricEvaluatorConfig, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type ToolTestMessage, type UserTestMessage, type VSCodeResolvedConfig, buildDirectoryChain, buildPromptInputs, buildSearchRoots, consumeCodexLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, extractCodeBlocks, fileExists, findGitRoot, generateRubrics, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, normalizeLineEndings, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToCodexLogEntries };
|
package/dist/index.d.ts
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
import * as ai from 'ai';
|
|
2
|
+
|
|
1
3
|
/**
|
|
2
4
|
* JSON primitive values appearing in AgentV payloads.
|
|
3
5
|
*/
|
|
@@ -73,7 +75,7 @@ declare function isJsonValue(value: unknown): value is JsonValue;
|
|
|
73
75
|
* Guard validating raw test messages.
|
|
74
76
|
*/
|
|
75
77
|
declare function isTestMessage(value: unknown): value is TestMessage;
|
|
76
|
-
declare const EVALUATOR_KIND_VALUES: readonly ["code", "llm_judge"];
|
|
78
|
+
declare const EVALUATOR_KIND_VALUES: readonly ["code", "llm_judge", "rubric"];
|
|
77
79
|
type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
|
|
78
80
|
declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
|
|
79
81
|
type CodeEvaluatorConfig = {
|
|
@@ -90,7 +92,18 @@ type LlmJudgeEvaluatorConfig = {
|
|
|
90
92
|
readonly prompt?: string;
|
|
91
93
|
readonly promptPath?: string;
|
|
92
94
|
};
|
|
93
|
-
type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig;
|
|
95
|
+
type RubricItem = {
|
|
96
|
+
readonly id: string;
|
|
97
|
+
readonly description: string;
|
|
98
|
+
readonly weight: number;
|
|
99
|
+
readonly required: boolean;
|
|
100
|
+
};
|
|
101
|
+
type RubricEvaluatorConfig = {
|
|
102
|
+
readonly name: string;
|
|
103
|
+
readonly type: 'rubric';
|
|
104
|
+
readonly rubrics: readonly RubricItem[];
|
|
105
|
+
};
|
|
106
|
+
type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig | RubricEvaluatorConfig;
|
|
94
107
|
/**
|
|
95
108
|
* Eval case definition sourced from AgentV specs.
|
|
96
109
|
*/
|
|
@@ -132,10 +145,12 @@ interface EvaluationResult {
|
|
|
132
145
|
readonly evaluator_results?: readonly EvaluatorResult[];
|
|
133
146
|
readonly error?: string;
|
|
134
147
|
}
|
|
148
|
+
type EvaluationVerdict = 'pass' | 'fail' | 'borderline';
|
|
135
149
|
interface EvaluatorResult {
|
|
136
150
|
readonly name: string;
|
|
137
151
|
readonly type: EvaluatorKind;
|
|
138
152
|
readonly score: number;
|
|
153
|
+
readonly verdict?: EvaluationVerdict;
|
|
139
154
|
readonly hits: readonly string[];
|
|
140
155
|
readonly misses: readonly string[];
|
|
141
156
|
readonly reasoning?: string;
|
|
@@ -189,6 +204,11 @@ interface Provider {
|
|
|
189
204
|
* the orchestrator may send multiple requests in a single provider session.
|
|
190
205
|
*/
|
|
191
206
|
invokeBatch?(requests: readonly ProviderRequest[]): Promise<readonly ProviderResponse[]>;
|
|
207
|
+
/**
|
|
208
|
+
* Optional method to get a Vercel AI SDK LanguageModel instance for structured output generation.
|
|
209
|
+
* Used by evaluators that need generateObject/generateText from the AI SDK.
|
|
210
|
+
*/
|
|
211
|
+
asLanguageModel?(): ai.LanguageModel;
|
|
192
212
|
}
|
|
193
213
|
type EnvLookup = Readonly<Record<string, string | undefined>>;
|
|
194
214
|
interface TargetDefinition {
|
|
@@ -510,6 +530,20 @@ declare function subscribeToCodexLogEntries(listener: CodexLogListener): () => v
|
|
|
510
530
|
declare function createProvider(target: ResolvedTarget): Provider;
|
|
511
531
|
declare function resolveAndCreateProvider(definition: TargetDefinition, env?: EnvLookup): Provider;
|
|
512
532
|
|
|
533
|
+
interface RubricEvaluatorOptions {
|
|
534
|
+
readonly config: RubricEvaluatorConfig;
|
|
535
|
+
readonly resolveJudgeProvider: (context: EvaluationContext) => Promise<Provider | undefined>;
|
|
536
|
+
}
|
|
537
|
+
declare class RubricEvaluator implements Evaluator {
|
|
538
|
+
readonly kind = "rubric";
|
|
539
|
+
private readonly config;
|
|
540
|
+
private readonly resolveJudgeProvider;
|
|
541
|
+
constructor(options: RubricEvaluatorOptions);
|
|
542
|
+
evaluate(context: EvaluationContext): Promise<EvaluationScore>;
|
|
543
|
+
private buildPrompt;
|
|
544
|
+
private calculateScore;
|
|
545
|
+
}
|
|
546
|
+
|
|
513
547
|
interface EvaluationContext {
|
|
514
548
|
readonly evalCase: EvalCase;
|
|
515
549
|
readonly candidate: string;
|
|
@@ -529,6 +563,7 @@ interface EvaluationContext {
|
|
|
529
563
|
}
|
|
530
564
|
interface EvaluationScore {
|
|
531
565
|
readonly score: number;
|
|
566
|
+
readonly verdict?: EvaluationVerdict;
|
|
532
567
|
readonly hits: readonly string[];
|
|
533
568
|
readonly misses: readonly string[];
|
|
534
569
|
readonly expectedAspectCount: number;
|
|
@@ -624,9 +659,20 @@ interface RunEvaluationOptions {
|
|
|
624
659
|
declare function runEvaluation(options: RunEvaluationOptions): Promise<readonly EvaluationResult[]>;
|
|
625
660
|
declare function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationResult>;
|
|
626
661
|
|
|
662
|
+
interface GenerateRubricsOptions {
|
|
663
|
+
readonly expectedOutcome: string;
|
|
664
|
+
readonly question?: string;
|
|
665
|
+
readonly referenceAnswer?: string;
|
|
666
|
+
readonly provider: Provider;
|
|
667
|
+
}
|
|
668
|
+
/**
|
|
669
|
+
* Generate rubrics from expected outcome using an LLM.
|
|
670
|
+
*/
|
|
671
|
+
declare function generateRubrics(options: GenerateRubricsOptions): Promise<readonly RubricItem[]>;
|
|
672
|
+
|
|
627
673
|
type AgentKernel = {
|
|
628
674
|
status: string;
|
|
629
675
|
};
|
|
630
676
|
declare function createAgentKernel(): AgentKernel;
|
|
631
677
|
|
|
632
|
-
export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type Evaluator, type EvaluatorConfig, type EvaluatorKind, type EvaluatorResult, type GeminiResolvedConfig, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ResolvedTarget, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type ToolTestMessage, type UserTestMessage, type VSCodeResolvedConfig, buildDirectoryChain, buildPromptInputs, buildSearchRoots, consumeCodexLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, extractCodeBlocks, fileExists, findGitRoot, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, normalizeLineEndings, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToCodexLogEntries };
|
|
678
|
+
export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorKind, type EvaluatorResult, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ResolvedTarget, RubricEvaluator, type RubricEvaluatorConfig, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type ToolTestMessage, type UserTestMessage, type VSCodeResolvedConfig, buildDirectoryChain, buildPromptInputs, buildSearchRoots, consumeCodexLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, extractCodeBlocks, fileExists, findGitRoot, generateRubrics, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, normalizeLineEndings, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToCodexLogEntries };
|