@agentv/core 0.21.0 → 0.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-SVY324GN.js → chunk-BO7KG7JX.js} +1 -1
- package/dist/{chunk-SVY324GN.js.map → chunk-BO7KG7JX.js.map} +1 -1
- package/dist/evaluation/validation/index.cjs +4 -4
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +5 -5
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +322 -2
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +49 -3
- package/dist/index.d.ts +49 -3
- package/dist/index.js +321 -3
- package/dist/index.js.map +1 -1
- package/package.json +2 -5
package/dist/index.d.cts
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
import * as ai from 'ai';
|
|
2
|
+
|
|
1
3
|
/**
|
|
2
4
|
* JSON primitive values appearing in AgentV payloads.
|
|
3
5
|
*/
|
|
@@ -73,7 +75,7 @@ declare function isJsonValue(value: unknown): value is JsonValue;
|
|
|
73
75
|
* Guard validating raw test messages.
|
|
74
76
|
*/
|
|
75
77
|
declare function isTestMessage(value: unknown): value is TestMessage;
|
|
76
|
-
declare const EVALUATOR_KIND_VALUES: readonly ["code", "llm_judge"];
|
|
78
|
+
declare const EVALUATOR_KIND_VALUES: readonly ["code", "llm_judge", "rubric"];
|
|
77
79
|
type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
|
|
78
80
|
declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
|
|
79
81
|
type CodeEvaluatorConfig = {
|
|
@@ -90,7 +92,18 @@ type LlmJudgeEvaluatorConfig = {
|
|
|
90
92
|
readonly prompt?: string;
|
|
91
93
|
readonly promptPath?: string;
|
|
92
94
|
};
|
|
93
|
-
type
|
|
95
|
+
type RubricItem = {
|
|
96
|
+
readonly id: string;
|
|
97
|
+
readonly description: string;
|
|
98
|
+
readonly weight: number;
|
|
99
|
+
readonly required: boolean;
|
|
100
|
+
};
|
|
101
|
+
type RubricEvaluatorConfig = {
|
|
102
|
+
readonly name: string;
|
|
103
|
+
readonly type: 'rubric';
|
|
104
|
+
readonly rubrics: readonly RubricItem[];
|
|
105
|
+
};
|
|
106
|
+
type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig | RubricEvaluatorConfig;
|
|
94
107
|
/**
|
|
95
108
|
* Eval case definition sourced from AgentV specs.
|
|
96
109
|
*/
|
|
@@ -132,10 +145,12 @@ interface EvaluationResult {
|
|
|
132
145
|
readonly evaluator_results?: readonly EvaluatorResult[];
|
|
133
146
|
readonly error?: string;
|
|
134
147
|
}
|
|
148
|
+
type EvaluationVerdict = 'pass' | 'fail' | 'borderline';
|
|
135
149
|
interface EvaluatorResult {
|
|
136
150
|
readonly name: string;
|
|
137
151
|
readonly type: EvaluatorKind;
|
|
138
152
|
readonly score: number;
|
|
153
|
+
readonly verdict?: EvaluationVerdict;
|
|
139
154
|
readonly hits: readonly string[];
|
|
140
155
|
readonly misses: readonly string[];
|
|
141
156
|
readonly reasoning?: string;
|
|
@@ -189,6 +204,11 @@ interface Provider {
|
|
|
189
204
|
* the orchestrator may send multiple requests in a single provider session.
|
|
190
205
|
*/
|
|
191
206
|
invokeBatch?(requests: readonly ProviderRequest[]): Promise<readonly ProviderResponse[]>;
|
|
207
|
+
/**
|
|
208
|
+
* Optional method to get a Vercel AI SDK LanguageModel instance for structured output generation.
|
|
209
|
+
* Used by evaluators that need generateObject/generateText from the AI SDK.
|
|
210
|
+
*/
|
|
211
|
+
asLanguageModel?(): ai.LanguageModel;
|
|
192
212
|
}
|
|
193
213
|
type EnvLookup = Readonly<Record<string, string | undefined>>;
|
|
194
214
|
interface TargetDefinition {
|
|
@@ -510,6 +530,20 @@ declare function subscribeToCodexLogEntries(listener: CodexLogListener): () => v
|
|
|
510
530
|
declare function createProvider(target: ResolvedTarget): Provider;
|
|
511
531
|
declare function resolveAndCreateProvider(definition: TargetDefinition, env?: EnvLookup): Provider;
|
|
512
532
|
|
|
533
|
+
interface RubricEvaluatorOptions {
|
|
534
|
+
readonly config: RubricEvaluatorConfig;
|
|
535
|
+
readonly resolveJudgeProvider: (context: EvaluationContext) => Promise<Provider | undefined>;
|
|
536
|
+
}
|
|
537
|
+
declare class RubricEvaluator implements Evaluator {
|
|
538
|
+
readonly kind = "rubric";
|
|
539
|
+
private readonly config;
|
|
540
|
+
private readonly resolveJudgeProvider;
|
|
541
|
+
constructor(options: RubricEvaluatorOptions);
|
|
542
|
+
evaluate(context: EvaluationContext): Promise<EvaluationScore>;
|
|
543
|
+
private buildPrompt;
|
|
544
|
+
private calculateScore;
|
|
545
|
+
}
|
|
546
|
+
|
|
513
547
|
interface EvaluationContext {
|
|
514
548
|
readonly evalCase: EvalCase;
|
|
515
549
|
readonly candidate: string;
|
|
@@ -529,6 +563,7 @@ interface EvaluationContext {
|
|
|
529
563
|
}
|
|
530
564
|
interface EvaluationScore {
|
|
531
565
|
readonly score: number;
|
|
566
|
+
readonly verdict?: EvaluationVerdict;
|
|
532
567
|
readonly hits: readonly string[];
|
|
533
568
|
readonly misses: readonly string[];
|
|
534
569
|
readonly expectedAspectCount: number;
|
|
@@ -624,9 +659,20 @@ interface RunEvaluationOptions {
|
|
|
624
659
|
declare function runEvaluation(options: RunEvaluationOptions): Promise<readonly EvaluationResult[]>;
|
|
625
660
|
declare function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationResult>;
|
|
626
661
|
|
|
662
|
+
interface GenerateRubricsOptions {
|
|
663
|
+
readonly expectedOutcome: string;
|
|
664
|
+
readonly question?: string;
|
|
665
|
+
readonly referenceAnswer?: string;
|
|
666
|
+
readonly provider: Provider;
|
|
667
|
+
}
|
|
668
|
+
/**
|
|
669
|
+
* Generate rubrics from expected outcome using an LLM.
|
|
670
|
+
*/
|
|
671
|
+
declare function generateRubrics(options: GenerateRubricsOptions): Promise<readonly RubricItem[]>;
|
|
672
|
+
|
|
627
673
|
type AgentKernel = {
|
|
628
674
|
status: string;
|
|
629
675
|
};
|
|
630
676
|
declare function createAgentKernel(): AgentKernel;
|
|
631
677
|
|
|
632
|
-
export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type Evaluator, type EvaluatorConfig, type EvaluatorKind, type EvaluatorResult, type GeminiResolvedConfig, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ResolvedTarget, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type ToolTestMessage, type UserTestMessage, type VSCodeResolvedConfig, buildDirectoryChain, buildPromptInputs, buildSearchRoots, consumeCodexLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, extractCodeBlocks, fileExists, findGitRoot, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, normalizeLineEndings, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToCodexLogEntries };
|
|
678
|
+
export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorKind, type EvaluatorResult, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ResolvedTarget, RubricEvaluator, type RubricEvaluatorConfig, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type ToolTestMessage, type UserTestMessage, type VSCodeResolvedConfig, buildDirectoryChain, buildPromptInputs, buildSearchRoots, consumeCodexLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, extractCodeBlocks, fileExists, findGitRoot, generateRubrics, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, normalizeLineEndings, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToCodexLogEntries };
|
package/dist/index.d.ts
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
import * as ai from 'ai';
|
|
2
|
+
|
|
1
3
|
/**
|
|
2
4
|
* JSON primitive values appearing in AgentV payloads.
|
|
3
5
|
*/
|
|
@@ -73,7 +75,7 @@ declare function isJsonValue(value: unknown): value is JsonValue;
|
|
|
73
75
|
* Guard validating raw test messages.
|
|
74
76
|
*/
|
|
75
77
|
declare function isTestMessage(value: unknown): value is TestMessage;
|
|
76
|
-
declare const EVALUATOR_KIND_VALUES: readonly ["code", "llm_judge"];
|
|
78
|
+
declare const EVALUATOR_KIND_VALUES: readonly ["code", "llm_judge", "rubric"];
|
|
77
79
|
type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
|
|
78
80
|
declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
|
|
79
81
|
type CodeEvaluatorConfig = {
|
|
@@ -90,7 +92,18 @@ type LlmJudgeEvaluatorConfig = {
|
|
|
90
92
|
readonly prompt?: string;
|
|
91
93
|
readonly promptPath?: string;
|
|
92
94
|
};
|
|
93
|
-
type
|
|
95
|
+
type RubricItem = {
|
|
96
|
+
readonly id: string;
|
|
97
|
+
readonly description: string;
|
|
98
|
+
readonly weight: number;
|
|
99
|
+
readonly required: boolean;
|
|
100
|
+
};
|
|
101
|
+
type RubricEvaluatorConfig = {
|
|
102
|
+
readonly name: string;
|
|
103
|
+
readonly type: 'rubric';
|
|
104
|
+
readonly rubrics: readonly RubricItem[];
|
|
105
|
+
};
|
|
106
|
+
type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig | RubricEvaluatorConfig;
|
|
94
107
|
/**
|
|
95
108
|
* Eval case definition sourced from AgentV specs.
|
|
96
109
|
*/
|
|
@@ -132,10 +145,12 @@ interface EvaluationResult {
|
|
|
132
145
|
readonly evaluator_results?: readonly EvaluatorResult[];
|
|
133
146
|
readonly error?: string;
|
|
134
147
|
}
|
|
148
|
+
type EvaluationVerdict = 'pass' | 'fail' | 'borderline';
|
|
135
149
|
interface EvaluatorResult {
|
|
136
150
|
readonly name: string;
|
|
137
151
|
readonly type: EvaluatorKind;
|
|
138
152
|
readonly score: number;
|
|
153
|
+
readonly verdict?: EvaluationVerdict;
|
|
139
154
|
readonly hits: readonly string[];
|
|
140
155
|
readonly misses: readonly string[];
|
|
141
156
|
readonly reasoning?: string;
|
|
@@ -189,6 +204,11 @@ interface Provider {
|
|
|
189
204
|
* the orchestrator may send multiple requests in a single provider session.
|
|
190
205
|
*/
|
|
191
206
|
invokeBatch?(requests: readonly ProviderRequest[]): Promise<readonly ProviderResponse[]>;
|
|
207
|
+
/**
|
|
208
|
+
* Optional method to get a Vercel AI SDK LanguageModel instance for structured output generation.
|
|
209
|
+
* Used by evaluators that need generateObject/generateText from the AI SDK.
|
|
210
|
+
*/
|
|
211
|
+
asLanguageModel?(): ai.LanguageModel;
|
|
192
212
|
}
|
|
193
213
|
type EnvLookup = Readonly<Record<string, string | undefined>>;
|
|
194
214
|
interface TargetDefinition {
|
|
@@ -510,6 +530,20 @@ declare function subscribeToCodexLogEntries(listener: CodexLogListener): () => v
|
|
|
510
530
|
declare function createProvider(target: ResolvedTarget): Provider;
|
|
511
531
|
declare function resolveAndCreateProvider(definition: TargetDefinition, env?: EnvLookup): Provider;
|
|
512
532
|
|
|
533
|
+
interface RubricEvaluatorOptions {
|
|
534
|
+
readonly config: RubricEvaluatorConfig;
|
|
535
|
+
readonly resolveJudgeProvider: (context: EvaluationContext) => Promise<Provider | undefined>;
|
|
536
|
+
}
|
|
537
|
+
declare class RubricEvaluator implements Evaluator {
|
|
538
|
+
readonly kind = "rubric";
|
|
539
|
+
private readonly config;
|
|
540
|
+
private readonly resolveJudgeProvider;
|
|
541
|
+
constructor(options: RubricEvaluatorOptions);
|
|
542
|
+
evaluate(context: EvaluationContext): Promise<EvaluationScore>;
|
|
543
|
+
private buildPrompt;
|
|
544
|
+
private calculateScore;
|
|
545
|
+
}
|
|
546
|
+
|
|
513
547
|
interface EvaluationContext {
|
|
514
548
|
readonly evalCase: EvalCase;
|
|
515
549
|
readonly candidate: string;
|
|
@@ -529,6 +563,7 @@ interface EvaluationContext {
|
|
|
529
563
|
}
|
|
530
564
|
interface EvaluationScore {
|
|
531
565
|
readonly score: number;
|
|
566
|
+
readonly verdict?: EvaluationVerdict;
|
|
532
567
|
readonly hits: readonly string[];
|
|
533
568
|
readonly misses: readonly string[];
|
|
534
569
|
readonly expectedAspectCount: number;
|
|
@@ -624,9 +659,20 @@ interface RunEvaluationOptions {
|
|
|
624
659
|
declare function runEvaluation(options: RunEvaluationOptions): Promise<readonly EvaluationResult[]>;
|
|
625
660
|
declare function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationResult>;
|
|
626
661
|
|
|
662
|
+
interface GenerateRubricsOptions {
|
|
663
|
+
readonly expectedOutcome: string;
|
|
664
|
+
readonly question?: string;
|
|
665
|
+
readonly referenceAnswer?: string;
|
|
666
|
+
readonly provider: Provider;
|
|
667
|
+
}
|
|
668
|
+
/**
|
|
669
|
+
* Generate rubrics from expected outcome using an LLM.
|
|
670
|
+
*/
|
|
671
|
+
declare function generateRubrics(options: GenerateRubricsOptions): Promise<readonly RubricItem[]>;
|
|
672
|
+
|
|
627
673
|
type AgentKernel = {
|
|
628
674
|
status: string;
|
|
629
675
|
};
|
|
630
676
|
declare function createAgentKernel(): AgentKernel;
|
|
631
677
|
|
|
632
|
-
export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type Evaluator, type EvaluatorConfig, type EvaluatorKind, type EvaluatorResult, type GeminiResolvedConfig, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ResolvedTarget, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type ToolTestMessage, type UserTestMessage, type VSCodeResolvedConfig, buildDirectoryChain, buildPromptInputs, buildSearchRoots, consumeCodexLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, extractCodeBlocks, fileExists, findGitRoot, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, normalizeLineEndings, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToCodexLogEntries };
|
|
678
|
+
export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorKind, type EvaluatorResult, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ResolvedTarget, RubricEvaluator, type RubricEvaluatorConfig, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type ToolTestMessage, type UserTestMessage, type VSCodeResolvedConfig, buildDirectoryChain, buildPromptInputs, buildSearchRoots, consumeCodexLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, extractCodeBlocks, fileExists, findGitRoot, generateRubrics, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, normalizeLineEndings, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToCodexLogEntries };
|
package/dist/index.js
CHANGED
|
@@ -8,7 +8,7 @@ import {
|
|
|
8
8
|
readTextFile,
|
|
9
9
|
resolveFileReference,
|
|
10
10
|
resolveTargetDefinition
|
|
11
|
-
} from "./chunk-
|
|
11
|
+
} from "./chunk-BO7KG7JX.js";
|
|
12
12
|
|
|
13
13
|
// src/evaluation/types.ts
|
|
14
14
|
var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
|
|
@@ -51,7 +51,7 @@ function isTestMessage(value) {
|
|
|
51
51
|
}
|
|
52
52
|
return candidate.content.every(isJsonObject);
|
|
53
53
|
}
|
|
54
|
-
var EVALUATOR_KIND_VALUES = ["code", "llm_judge"];
|
|
54
|
+
var EVALUATOR_KIND_VALUES = ["code", "llm_judge", "rubric"];
|
|
55
55
|
var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
|
|
56
56
|
function isEvaluatorKind(value) {
|
|
57
57
|
return typeof value === "string" && EVALUATOR_KIND_SET.has(value);
|
|
@@ -453,6 +453,29 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
453
453
|
}
|
|
454
454
|
}
|
|
455
455
|
const _model = asString2(rawEvaluator.model);
|
|
456
|
+
if (typeValue === "rubric") {
|
|
457
|
+
const rubrics = rawEvaluator.rubrics;
|
|
458
|
+
if (!Array.isArray(rubrics)) {
|
|
459
|
+
logWarning2(`Skipping rubric evaluator '${name}' in '${evalId}': missing rubrics array`);
|
|
460
|
+
continue;
|
|
461
|
+
}
|
|
462
|
+
const parsedRubrics = rubrics.filter((r) => isJsonObject2(r)).map((rubric, index) => ({
|
|
463
|
+
id: asString2(rubric.id) ?? `rubric-${index + 1}`,
|
|
464
|
+
description: asString2(rubric.description) ?? "",
|
|
465
|
+
weight: typeof rubric.weight === "number" ? rubric.weight : 1,
|
|
466
|
+
required: typeof rubric.required === "boolean" ? rubric.required : true
|
|
467
|
+
})).filter((r) => r.description.length > 0);
|
|
468
|
+
if (parsedRubrics.length === 0) {
|
|
469
|
+
logWarning2(`Skipping rubric evaluator '${name}' in '${evalId}': no valid rubrics found`);
|
|
470
|
+
continue;
|
|
471
|
+
}
|
|
472
|
+
evaluators.push({
|
|
473
|
+
name,
|
|
474
|
+
type: "rubric",
|
|
475
|
+
rubrics: parsedRubrics
|
|
476
|
+
});
|
|
477
|
+
continue;
|
|
478
|
+
}
|
|
456
479
|
evaluators.push({
|
|
457
480
|
name,
|
|
458
481
|
type: "llm_judge",
|
|
@@ -933,7 +956,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
933
956
|
continue;
|
|
934
957
|
}
|
|
935
958
|
const conversationId = asString5(evalcase.conversation_id);
|
|
936
|
-
const outcome = asString5(evalcase.outcome);
|
|
959
|
+
const outcome = asString5(evalcase.expected_outcome) ?? asString5(evalcase.outcome);
|
|
937
960
|
const inputMessagesValue = evalcase.input_messages;
|
|
938
961
|
const expectedMessagesValue = evalcase.expected_messages;
|
|
939
962
|
if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
|
|
@@ -987,6 +1010,33 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
987
1010
|
logError(`Skipping eval case '${id}': ${message}`);
|
|
988
1011
|
continue;
|
|
989
1012
|
}
|
|
1013
|
+
const inlineRubrics = evalcase.rubrics;
|
|
1014
|
+
if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
|
|
1015
|
+
const rubricItems = inlineRubrics.filter((r) => isJsonObject(r) || typeof r === "string").map((rubric, index) => {
|
|
1016
|
+
if (typeof rubric === "string") {
|
|
1017
|
+
return {
|
|
1018
|
+
id: `rubric-${index + 1}`,
|
|
1019
|
+
description: rubric,
|
|
1020
|
+
weight: 1,
|
|
1021
|
+
required: true
|
|
1022
|
+
};
|
|
1023
|
+
}
|
|
1024
|
+
return {
|
|
1025
|
+
id: asString5(rubric.id) ?? `rubric-${index + 1}`,
|
|
1026
|
+
description: asString5(rubric.description) ?? "",
|
|
1027
|
+
weight: typeof rubric.weight === "number" ? rubric.weight : 1,
|
|
1028
|
+
required: typeof rubric.required === "boolean" ? rubric.required : true
|
|
1029
|
+
};
|
|
1030
|
+
}).filter((r) => r.description.length > 0);
|
|
1031
|
+
if (rubricItems.length > 0) {
|
|
1032
|
+
const rubricEvaluator = {
|
|
1033
|
+
name: "rubric",
|
|
1034
|
+
type: "rubric",
|
|
1035
|
+
rubrics: rubricItems
|
|
1036
|
+
};
|
|
1037
|
+
evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
|
|
1038
|
+
}
|
|
1039
|
+
}
|
|
990
1040
|
const userFilePaths = [];
|
|
991
1041
|
for (const segment of inputSegments) {
|
|
992
1042
|
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
@@ -1085,6 +1135,9 @@ var AzureProvider = class {
|
|
|
1085
1135
|
retryConfig: this.retryConfig
|
|
1086
1136
|
});
|
|
1087
1137
|
}
|
|
1138
|
+
asLanguageModel() {
|
|
1139
|
+
return this.model;
|
|
1140
|
+
}
|
|
1088
1141
|
};
|
|
1089
1142
|
var AnthropicProvider = class {
|
|
1090
1143
|
constructor(targetName, config) {
|
|
@@ -1118,6 +1171,9 @@ var AnthropicProvider = class {
|
|
|
1118
1171
|
providerOptions
|
|
1119
1172
|
});
|
|
1120
1173
|
}
|
|
1174
|
+
asLanguageModel() {
|
|
1175
|
+
return this.model;
|
|
1176
|
+
}
|
|
1121
1177
|
};
|
|
1122
1178
|
var GeminiProvider = class {
|
|
1123
1179
|
constructor(targetName, config) {
|
|
@@ -1148,6 +1204,9 @@ var GeminiProvider = class {
|
|
|
1148
1204
|
retryConfig: this.retryConfig
|
|
1149
1205
|
});
|
|
1150
1206
|
}
|
|
1207
|
+
asLanguageModel() {
|
|
1208
|
+
return this.model;
|
|
1209
|
+
}
|
|
1151
1210
|
};
|
|
1152
1211
|
function buildAzureOptions(config) {
|
|
1153
1212
|
const options = {
|
|
@@ -2869,6 +2928,148 @@ function resolveAndCreateProvider(definition, env = process.env) {
|
|
|
2869
2928
|
return createProvider(resolved);
|
|
2870
2929
|
}
|
|
2871
2930
|
|
|
2931
|
+
// src/evaluation/evaluators/rubric-evaluator.ts
|
|
2932
|
+
import { generateText as generateText2 } from "ai";
|
|
2933
|
+
import { z } from "zod";
|
|
2934
|
+
var rubricCheckResultSchema = z.object({
|
|
2935
|
+
id: z.string().describe("The ID of the rubric item being checked"),
|
|
2936
|
+
satisfied: z.boolean().describe("Whether this rubric requirement is met"),
|
|
2937
|
+
reasoning: z.string().describe("Brief explanation (1-2 sentences) for this check")
|
|
2938
|
+
});
|
|
2939
|
+
var rubricEvaluationSchema = z.object({
|
|
2940
|
+
checks: z.array(rubricCheckResultSchema).describe("Results for each rubric item"),
|
|
2941
|
+
overall_reasoning: z.string().describe("Overall assessment summary (1-2 sentences)")
|
|
2942
|
+
});
|
|
2943
|
+
var RubricEvaluator = class {
|
|
2944
|
+
kind = "rubric";
|
|
2945
|
+
config;
|
|
2946
|
+
resolveJudgeProvider;
|
|
2947
|
+
constructor(options) {
|
|
2948
|
+
this.config = options.config;
|
|
2949
|
+
this.resolveJudgeProvider = options.resolveJudgeProvider;
|
|
2950
|
+
}
|
|
2951
|
+
async evaluate(context) {
|
|
2952
|
+
const judgeProvider = await this.resolveJudgeProvider(context);
|
|
2953
|
+
if (!judgeProvider) {
|
|
2954
|
+
throw new Error("No judge provider available for rubric evaluation");
|
|
2955
|
+
}
|
|
2956
|
+
if (!this.config.rubrics || this.config.rubrics.length === 0) {
|
|
2957
|
+
throw new Error(
|
|
2958
|
+
`No rubrics found for evaluator "${this.config.name}". Run "agentv generate rubrics" first.`
|
|
2959
|
+
);
|
|
2960
|
+
}
|
|
2961
|
+
const prompt = this.buildPrompt(context, this.config.rubrics);
|
|
2962
|
+
const model = judgeProvider.asLanguageModel?.();
|
|
2963
|
+
if (!model) {
|
|
2964
|
+
throw new Error("Judge provider does not support language model interface");
|
|
2965
|
+
}
|
|
2966
|
+
const system = `You are an expert evaluator. Evaluate the candidate answer against each rubric item.
|
|
2967
|
+
You must return a valid JSON object matching this schema:
|
|
2968
|
+
{
|
|
2969
|
+
"checks": [
|
|
2970
|
+
{
|
|
2971
|
+
"id": "string (rubric id)",
|
|
2972
|
+
"satisfied": boolean,
|
|
2973
|
+
"reasoning": "string (brief explanation)"
|
|
2974
|
+
}
|
|
2975
|
+
],
|
|
2976
|
+
"overall_reasoning": "string (summary)"
|
|
2977
|
+
}`;
|
|
2978
|
+
let result;
|
|
2979
|
+
let lastError;
|
|
2980
|
+
for (let attempt = 1; attempt <= 3; attempt++) {
|
|
2981
|
+
try {
|
|
2982
|
+
const { text } = await generateText2({
|
|
2983
|
+
model,
|
|
2984
|
+
system,
|
|
2985
|
+
prompt
|
|
2986
|
+
});
|
|
2987
|
+
const cleaned = text.replace(/```json\n?|```/g, "").trim();
|
|
2988
|
+
result = rubricEvaluationSchema.parse(JSON.parse(cleaned));
|
|
2989
|
+
break;
|
|
2990
|
+
} catch (e) {
|
|
2991
|
+
lastError = e instanceof Error ? e : new Error(String(e));
|
|
2992
|
+
}
|
|
2993
|
+
}
|
|
2994
|
+
if (!result) {
|
|
2995
|
+
throw new Error(
|
|
2996
|
+
`Failed to parse rubric evaluation result after 3 attempts: ${lastError?.message}`
|
|
2997
|
+
);
|
|
2998
|
+
}
|
|
2999
|
+
const { score, verdict, hits, misses } = this.calculateScore(result, this.config.rubrics);
|
|
3000
|
+
return {
|
|
3001
|
+
score,
|
|
3002
|
+
verdict,
|
|
3003
|
+
hits,
|
|
3004
|
+
misses,
|
|
3005
|
+
expectedAspectCount: this.config.rubrics.length,
|
|
3006
|
+
reasoning: result.overall_reasoning,
|
|
3007
|
+
evaluatorRawRequest: {
|
|
3008
|
+
prompt
|
|
3009
|
+
}
|
|
3010
|
+
};
|
|
3011
|
+
}
|
|
3012
|
+
buildPrompt(context, rubrics) {
|
|
3013
|
+
const parts = [
|
|
3014
|
+
"You are an expert evaluator. Evaluate the candidate answer against each rubric item below.",
|
|
3015
|
+
"",
|
|
3016
|
+
"[[ ## question ## ]]",
|
|
3017
|
+
context.evalCase.question,
|
|
3018
|
+
"",
|
|
3019
|
+
"[[ ## expected_outcome ## ]]",
|
|
3020
|
+
context.evalCase.expected_outcome,
|
|
3021
|
+
""
|
|
3022
|
+
];
|
|
3023
|
+
if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) {
|
|
3024
|
+
parts.push("[[ ## reference_answer ## ]]", context.evalCase.reference_answer, "");
|
|
3025
|
+
}
|
|
3026
|
+
parts.push("[[ ## candidate_answer ## ]]", context.candidate, "", "[[ ## rubrics ## ]]");
|
|
3027
|
+
for (const rubric of rubrics) {
|
|
3028
|
+
const requiredLabel = rubric.required ? " (REQUIRED)" : "";
|
|
3029
|
+
const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
|
|
3030
|
+
parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.description}`);
|
|
3031
|
+
}
|
|
3032
|
+
parts.push("", "For each rubric, determine if it is satisfied and provide brief reasoning.");
|
|
3033
|
+
return parts.join("\n");
|
|
3034
|
+
}
|
|
3035
|
+
calculateScore(result, rubrics) {
|
|
3036
|
+
const rubricMap = new Map(rubrics.map((r) => [r.id, r]));
|
|
3037
|
+
const hits = [];
|
|
3038
|
+
const misses = [];
|
|
3039
|
+
let totalWeight = 0;
|
|
3040
|
+
let earnedWeight = 0;
|
|
3041
|
+
let failedRequired = false;
|
|
3042
|
+
for (const check of result.checks) {
|
|
3043
|
+
const rubric = rubricMap.get(check.id);
|
|
3044
|
+
if (!rubric) {
|
|
3045
|
+
continue;
|
|
3046
|
+
}
|
|
3047
|
+
totalWeight += rubric.weight;
|
|
3048
|
+
if (check.satisfied) {
|
|
3049
|
+
earnedWeight += rubric.weight;
|
|
3050
|
+
hits.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
|
|
3051
|
+
} else {
|
|
3052
|
+
misses.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
|
|
3053
|
+
if (rubric.required) {
|
|
3054
|
+
failedRequired = true;
|
|
3055
|
+
}
|
|
3056
|
+
}
|
|
3057
|
+
}
|
|
3058
|
+
const score = totalWeight > 0 ? Math.min(1, Math.max(0, earnedWeight / totalWeight)) : 0;
|
|
3059
|
+
let verdict;
|
|
3060
|
+
if (failedRequired) {
|
|
3061
|
+
verdict = "fail";
|
|
3062
|
+
} else if (score >= 0.8) {
|
|
3063
|
+
verdict = "pass";
|
|
3064
|
+
} else if (score >= 0.6) {
|
|
3065
|
+
verdict = "borderline";
|
|
3066
|
+
} else {
|
|
3067
|
+
verdict = "fail";
|
|
3068
|
+
}
|
|
3069
|
+
return { score, verdict, hits, misses };
|
|
3070
|
+
}
|
|
3071
|
+
};
|
|
3072
|
+
|
|
2872
3073
|
// src/evaluation/evaluators.ts
|
|
2873
3074
|
var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
|
|
2874
3075
|
|
|
@@ -3833,6 +4034,7 @@ async function runEvaluatorList(options) {
|
|
|
3833
4034
|
name: evaluator.name,
|
|
3834
4035
|
type: evaluator.type,
|
|
3835
4036
|
score: score2.score,
|
|
4037
|
+
verdict: score2.verdict,
|
|
3836
4038
|
hits: score2.hits,
|
|
3837
4039
|
misses: score2.misses,
|
|
3838
4040
|
reasoning: score2.reasoning,
|
|
@@ -3860,6 +4062,40 @@ async function runEvaluatorList(options) {
|
|
|
3860
4062
|
name: evaluator.name,
|
|
3861
4063
|
type: evaluator.type,
|
|
3862
4064
|
score: score2.score,
|
|
4065
|
+
verdict: score2.verdict,
|
|
4066
|
+
hits: score2.hits,
|
|
4067
|
+
misses: score2.misses,
|
|
4068
|
+
reasoning: score2.reasoning,
|
|
4069
|
+
evaluator_provider_request: score2.evaluatorRawRequest
|
|
4070
|
+
});
|
|
4071
|
+
continue;
|
|
4072
|
+
}
|
|
4073
|
+
if (evaluator.type === "rubric") {
|
|
4074
|
+
const rubricEvaluator = new RubricEvaluator({
|
|
4075
|
+
config: evaluator,
|
|
4076
|
+
resolveJudgeProvider: async (context) => {
|
|
4077
|
+
if (context.judgeProvider) {
|
|
4078
|
+
return context.judgeProvider;
|
|
4079
|
+
}
|
|
4080
|
+
return judgeProvider;
|
|
4081
|
+
}
|
|
4082
|
+
});
|
|
4083
|
+
const score2 = await rubricEvaluator.evaluate({
|
|
4084
|
+
evalCase,
|
|
4085
|
+
candidate,
|
|
4086
|
+
target,
|
|
4087
|
+
provider,
|
|
4088
|
+
attempt,
|
|
4089
|
+
promptInputs,
|
|
4090
|
+
now,
|
|
4091
|
+
judgeProvider
|
|
4092
|
+
});
|
|
4093
|
+
scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
|
|
4094
|
+
evaluatorResults.push({
|
|
4095
|
+
name: evaluator.name,
|
|
4096
|
+
type: evaluator.type,
|
|
4097
|
+
score: score2.score,
|
|
4098
|
+
verdict: score2.verdict,
|
|
3863
4099
|
hits: score2.hits,
|
|
3864
4100
|
misses: score2.misses,
|
|
3865
4101
|
reasoning: score2.reasoning,
|
|
@@ -4090,6 +4326,86 @@ function isTimeoutLike(error) {
|
|
|
4090
4326
|
return value.includes("timeout");
|
|
4091
4327
|
}
|
|
4092
4328
|
|
|
4329
|
+
// src/evaluation/generators/rubric-generator.ts
|
|
4330
|
+
import { generateText as generateText3 } from "ai";
|
|
4331
|
+
import { z as z2 } from "zod";
|
|
4332
|
+
// Schema for one generated rubric item; mirrors the package's RubricItem
// shape (id / description / weight / required) from the type definitions.
var rubricItemSchema = z2.object({
  id: z2.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
  description: z2.string().describe("What this rubric checks for"),
  // weight/required may be omitted by the model; zod fills in the defaults.
  weight: z2.number().default(1).describe("Relative importance (default 1.0)"),
  required: z2.boolean().default(true).describe("Whether this is a mandatory requirement")
});
// Envelope the LLM is asked to return: { "rubrics": [ ...items ] }.
var rubricGenerationSchema = z2.object({
  rubrics: z2.array(rubricItemSchema).describe("List of evaluation rubrics")
});
+
/**
 * Generate evaluation rubrics for an expected outcome via the provider's
 * language model.
 *
 * Makes up to 3 attempts: each attempt asks the model for a JSON object,
 * strips markdown code fences, isolates the outermost `{...}` span (models
 * frequently wrap JSON in prose, which would otherwise fail JSON.parse on
 * every attempt), and validates it against `rubricGenerationSchema`.
 *
 * @param options - { expectedOutcome, question?, referenceAnswer?, provider }
 * @returns the validated array of rubric items
 * @throws Error when the provider exposes no language model, or when all
 *         3 attempts fail to yield schema-valid JSON (last cause included).
 */
async function generateRubrics(options) {
  const { expectedOutcome, question, referenceAnswer, provider } = options;
  const prompt = buildPrompt(expectedOutcome, question, referenceAnswer);
  // asLanguageModel is optional on providers; rubric generation requires it.
  const model = provider.asLanguageModel?.();
  if (!model) {
    throw new Error("Provider does not support language model interface");
  }
  const system = `You are an expert at creating evaluation rubrics.
You must return a valid JSON object matching this schema:
{
"rubrics": [
{
"id": "string (short identifier)",
"description": "string (what to check)",
"weight": number (default 1.0),
"required": boolean (default true)
}
]
}`;
  let result;
  let lastError;
  for (let attempt = 1; attempt <= 3; attempt++) {
    try {
      const { text } = await generateText3({
        model,
        system,
        prompt
      });
      // Drop markdown fences, then parse only the outermost JSON object so
      // leading/trailing prose from the model cannot break JSON.parse.
      const cleaned = text.replace(/```json\n?|```/g, "").trim();
      const start = cleaned.indexOf("{");
      const end = cleaned.lastIndexOf("}");
      const candidate = start !== -1 && end > start ? cleaned.slice(start, end + 1) : cleaned;
      result = rubricGenerationSchema.parse(JSON.parse(candidate));
      break;
    } catch (e) {
      // Remember the failure and retry; surfaced only if all attempts fail.
      lastError = e instanceof Error ? e : new Error(String(e));
    }
  }
  if (!result) {
    throw new Error(`Failed to parse generated rubrics after 3 attempts: ${lastError?.message}`);
  }
  return result.rubrics;
}
+
/**
 * Assemble the user prompt for rubric generation.
 *
 * Always emits the task instructions plus an `expected_outcome` section;
 * `question` and `reference_answer` sections are appended only when the
 * corresponding argument contains non-blank text.
 */
function buildPrompt(expectedOutcome, question, referenceAnswer) {
  // Fixed instructions followed by the mandatory expected-outcome section.
  const lines = [
    "You are an expert at creating evaluation rubrics.",
    "Given the expected outcome (and optionally the question and reference answer),",
    "generate a list of specific, measurable rubric items to evaluate whether an answer meets the expected outcome.",
    "",
    "Each rubric should:",
    "- Be specific and testable",
    "- Have a short, descriptive ID",
    "- Include a clear description of what to check",
    "- Indicate if it is required (mandatory) or optional",
    "- Have an appropriate weight (default 1.0, use higher values for more important aspects)",
    "",
    "Generate 3-7 rubric items that comprehensively cover the expected outcome.",
    "",
    "[[ ## expected_outcome ## ]]",
    expectedOutcome,
    ""
  ];
  // Optional sections are included only when they carry non-blank text.
  const hasText = (value) => Boolean(value) && value.trim().length > 0;
  if (hasText(question)) {
    lines.push("[[ ## question ## ]]", question, "");
  }
  if (hasText(referenceAnswer)) {
    lines.push("[[ ## reference_answer ## ]]", referenceAnswer, "");
  }
  return lines.join("\n");
}
|
|
4408
|
+
|
|
4093
4409
|
// src/index.ts
|
|
4094
4410
|
function createAgentKernel() {
|
|
4095
4411
|
return { status: "stub" };
|
|
@@ -4097,6 +4413,7 @@ function createAgentKernel() {
|
|
|
4097
4413
|
export {
|
|
4098
4414
|
CodeEvaluator,
|
|
4099
4415
|
LlmJudgeEvaluator,
|
|
4416
|
+
RubricEvaluator,
|
|
4100
4417
|
TEST_MESSAGE_ROLES,
|
|
4101
4418
|
buildDirectoryChain,
|
|
4102
4419
|
buildPromptInputs,
|
|
@@ -4108,6 +4425,7 @@ export {
|
|
|
4108
4425
|
extractCodeBlocks,
|
|
4109
4426
|
fileExists,
|
|
4110
4427
|
findGitRoot,
|
|
4428
|
+
generateRubrics,
|
|
4111
4429
|
getHitCount,
|
|
4112
4430
|
isEvaluatorKind,
|
|
4113
4431
|
isGuidelineFile,
|