@agentv/core 0.25.0 → 1.0.0
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
- package/dist/{chunk-OYTL3LNN.js → chunk-V3JCB3HI.js} +5 -2
- package/dist/chunk-V3JCB3HI.js.map +1 -0
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +93 -32
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +13 -7
- package/dist/index.d.ts +13 -7
- package/dist/index.js +89 -31
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-OYTL3LNN.js.map +0 -1
package/dist/index.d.cts
CHANGED
|
@@ -56,6 +56,8 @@ interface ToolTrajectoryEvaluatorConfig {
|
|
|
56
56
|
readonly minimums?: Readonly<Record<string, number>>;
|
|
57
57
|
/** Expected tool sequence (for in_order/exact modes) */
|
|
58
58
|
readonly expected?: readonly ToolTrajectoryExpectedItem[];
|
|
59
|
+
/** Optional weight for top-level aggregation (defaults to 1.0) */
|
|
60
|
+
readonly weight?: number;
|
|
59
61
|
}
|
|
60
62
|
/**
|
|
61
63
|
* Expected tool call item in a trajectory sequence.
|
|
@@ -176,7 +178,7 @@ declare function isJsonValue(value: unknown): value is JsonValue;
|
|
|
176
178
|
* Guard validating raw test messages.
|
|
177
179
|
*/
|
|
178
180
|
declare function isTestMessage(value: unknown): value is TestMessage;
|
|
179
|
-
declare const EVALUATOR_KIND_VALUES: readonly ["code_judge", "llm_judge", "rubric", "composite", "tool_trajectory", "
|
|
181
|
+
declare const EVALUATOR_KIND_VALUES: readonly ["code_judge", "llm_judge", "rubric", "composite", "tool_trajectory", "expected_tool_calls"];
|
|
180
182
|
type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
|
|
181
183
|
declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
|
|
182
184
|
type CodeEvaluatorConfig = {
|
|
@@ -186,6 +188,7 @@ type CodeEvaluatorConfig = {
|
|
|
186
188
|
readonly resolvedScriptPath?: string;
|
|
187
189
|
readonly cwd?: string;
|
|
188
190
|
readonly resolvedCwd?: string;
|
|
191
|
+
readonly weight?: number;
|
|
189
192
|
};
|
|
190
193
|
type LlmJudgeEvaluatorConfig = {
|
|
191
194
|
readonly name: string;
|
|
@@ -193,6 +196,7 @@ type LlmJudgeEvaluatorConfig = {
|
|
|
193
196
|
readonly prompt?: string;
|
|
194
197
|
readonly promptPath?: string;
|
|
195
198
|
readonly rubrics?: readonly RubricItem[];
|
|
199
|
+
readonly weight?: number;
|
|
196
200
|
};
|
|
197
201
|
type RubricItem = {
|
|
198
202
|
readonly id: string;
|
|
@@ -218,12 +222,14 @@ type CompositeEvaluatorConfig = {
|
|
|
218
222
|
readonly type: 'composite';
|
|
219
223
|
readonly evaluators: readonly EvaluatorConfig[];
|
|
220
224
|
readonly aggregator: CompositeAggregatorConfig;
|
|
225
|
+
readonly weight?: number;
|
|
221
226
|
};
|
|
222
|
-
type
|
|
227
|
+
type ExpectedToolCallsEvaluatorConfig = {
|
|
223
228
|
readonly name: string;
|
|
224
|
-
readonly type: '
|
|
229
|
+
readonly type: 'expected_tool_calls';
|
|
230
|
+
readonly weight?: number;
|
|
225
231
|
};
|
|
226
|
-
type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig |
|
|
232
|
+
type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig | ExpectedToolCallsEvaluatorConfig;
|
|
227
233
|
/**
|
|
228
234
|
* Eval case definition sourced from AgentV specs.
|
|
229
235
|
*/
|
|
@@ -764,8 +770,8 @@ declare class ToolTrajectoryEvaluator implements Evaluator {
|
|
|
764
770
|
* Extracts tool_calls from assistant messages in expected_messages and compares them
|
|
765
771
|
* sequentially against tool_call events in the trace.
|
|
766
772
|
*/
|
|
767
|
-
declare class
|
|
768
|
-
readonly kind = "
|
|
773
|
+
declare class ExpectedToolCallsEvaluator implements Evaluator {
|
|
774
|
+
readonly kind = "expected_tool_calls";
|
|
769
775
|
evaluate(context: EvaluationContext): EvaluationScore;
|
|
770
776
|
private extractExpectedToolCalls;
|
|
771
777
|
private validateToolCalls;
|
|
@@ -861,4 +867,4 @@ type AgentKernel = {
|
|
|
861
867
|
};
|
|
862
868
|
declare function createAgentKernel(): AgentKernel;
|
|
863
869
|
|
|
864
|
-
export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult,
|
|
870
|
+
export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, type ExpectedToolCall, ExpectedToolCallsEvaluator, type ExpectedToolCallsEvaluatorConfig, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TestMessageToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceEvent, type TraceEventType, type TraceSummary, type UserTestMessage, type VSCodeResolvedConfig, buildDirectoryChain, buildPromptInputs, buildSearchRoots, computeTraceSummary, consumeCodexLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, extractCodeBlocks, fileExists, findGitRoot, generateRubrics, getHitCount, isEvaluatorKind, isExpectedToolCall, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, isTraceEvent, isTraceEventType, listTargetNames, loadEvalCases, 
normalizeLineEndings, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToCodexLogEntries };
|
package/dist/index.d.ts
CHANGED
|
@@ -56,6 +56,8 @@ interface ToolTrajectoryEvaluatorConfig {
|
|
|
56
56
|
readonly minimums?: Readonly<Record<string, number>>;
|
|
57
57
|
/** Expected tool sequence (for in_order/exact modes) */
|
|
58
58
|
readonly expected?: readonly ToolTrajectoryExpectedItem[];
|
|
59
|
+
/** Optional weight for top-level aggregation (defaults to 1.0) */
|
|
60
|
+
readonly weight?: number;
|
|
59
61
|
}
|
|
60
62
|
/**
|
|
61
63
|
* Expected tool call item in a trajectory sequence.
|
|
@@ -176,7 +178,7 @@ declare function isJsonValue(value: unknown): value is JsonValue;
|
|
|
176
178
|
* Guard validating raw test messages.
|
|
177
179
|
*/
|
|
178
180
|
declare function isTestMessage(value: unknown): value is TestMessage;
|
|
179
|
-
declare const EVALUATOR_KIND_VALUES: readonly ["code_judge", "llm_judge", "rubric", "composite", "tool_trajectory", "
|
|
181
|
+
declare const EVALUATOR_KIND_VALUES: readonly ["code_judge", "llm_judge", "rubric", "composite", "tool_trajectory", "expected_tool_calls"];
|
|
180
182
|
type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
|
|
181
183
|
declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
|
|
182
184
|
type CodeEvaluatorConfig = {
|
|
@@ -186,6 +188,7 @@ type CodeEvaluatorConfig = {
|
|
|
186
188
|
readonly resolvedScriptPath?: string;
|
|
187
189
|
readonly cwd?: string;
|
|
188
190
|
readonly resolvedCwd?: string;
|
|
191
|
+
readonly weight?: number;
|
|
189
192
|
};
|
|
190
193
|
type LlmJudgeEvaluatorConfig = {
|
|
191
194
|
readonly name: string;
|
|
@@ -193,6 +196,7 @@ type LlmJudgeEvaluatorConfig = {
|
|
|
193
196
|
readonly prompt?: string;
|
|
194
197
|
readonly promptPath?: string;
|
|
195
198
|
readonly rubrics?: readonly RubricItem[];
|
|
199
|
+
readonly weight?: number;
|
|
196
200
|
};
|
|
197
201
|
type RubricItem = {
|
|
198
202
|
readonly id: string;
|
|
@@ -218,12 +222,14 @@ type CompositeEvaluatorConfig = {
|
|
|
218
222
|
readonly type: 'composite';
|
|
219
223
|
readonly evaluators: readonly EvaluatorConfig[];
|
|
220
224
|
readonly aggregator: CompositeAggregatorConfig;
|
|
225
|
+
readonly weight?: number;
|
|
221
226
|
};
|
|
222
|
-
type
|
|
227
|
+
type ExpectedToolCallsEvaluatorConfig = {
|
|
223
228
|
readonly name: string;
|
|
224
|
-
readonly type: '
|
|
229
|
+
readonly type: 'expected_tool_calls';
|
|
230
|
+
readonly weight?: number;
|
|
225
231
|
};
|
|
226
|
-
type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig |
|
|
232
|
+
type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig | ExpectedToolCallsEvaluatorConfig;
|
|
227
233
|
/**
|
|
228
234
|
* Eval case definition sourced from AgentV specs.
|
|
229
235
|
*/
|
|
@@ -764,8 +770,8 @@ declare class ToolTrajectoryEvaluator implements Evaluator {
|
|
|
764
770
|
* Extracts tool_calls from assistant messages in expected_messages and compares them
|
|
765
771
|
* sequentially against tool_call events in the trace.
|
|
766
772
|
*/
|
|
767
|
-
declare class
|
|
768
|
-
readonly kind = "
|
|
773
|
+
declare class ExpectedToolCallsEvaluator implements Evaluator {
|
|
774
|
+
readonly kind = "expected_tool_calls";
|
|
769
775
|
evaluate(context: EvaluationContext): EvaluationScore;
|
|
770
776
|
private extractExpectedToolCalls;
|
|
771
777
|
private validateToolCalls;
|
|
@@ -861,4 +867,4 @@ type AgentKernel = {
|
|
|
861
867
|
};
|
|
862
868
|
declare function createAgentKernel(): AgentKernel;
|
|
863
869
|
|
|
864
|
-
export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult,
|
|
870
|
+
export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, type ExpectedToolCall, ExpectedToolCallsEvaluator, type ExpectedToolCallsEvaluatorConfig, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TestMessageToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceEvent, type TraceEventType, type TraceSummary, type UserTestMessage, type VSCodeResolvedConfig, buildDirectoryChain, buildPromptInputs, buildSearchRoots, computeTraceSummary, consumeCodexLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, extractCodeBlocks, fileExists, findGitRoot, generateRubrics, getHitCount, isEvaluatorKind, isExpectedToolCall, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, isTraceEvent, isTraceEventType, listTargetNames, loadEvalCases, 
normalizeLineEndings, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToCodexLogEntries };
|
package/dist/index.js
CHANGED
|
@@ -9,7 +9,7 @@ import {
|
|
|
9
9
|
readTextFile,
|
|
10
10
|
resolveFileReference,
|
|
11
11
|
resolveTargetDefinition
|
|
12
|
-
} from "./chunk-
|
|
12
|
+
} from "./chunk-V3JCB3HI.js";
|
|
13
13
|
|
|
14
14
|
// src/evaluation/types.ts
|
|
15
15
|
var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
|
|
@@ -58,7 +58,7 @@ var EVALUATOR_KIND_VALUES = [
|
|
|
58
58
|
"rubric",
|
|
59
59
|
"composite",
|
|
60
60
|
"tool_trajectory",
|
|
61
|
-
"
|
|
61
|
+
"expected_tool_calls"
|
|
62
62
|
];
|
|
63
63
|
var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
|
|
64
64
|
function isEvaluatorKind(value) {
|
|
@@ -455,6 +455,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
455
455
|
logWarning2(`Skipping code_judge evaluator '${name}' in '${evalId}': missing script`);
|
|
456
456
|
continue;
|
|
457
457
|
}
|
|
458
|
+
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
458
459
|
const cwd = asString2(rawEvaluator.cwd);
|
|
459
460
|
let resolvedCwd;
|
|
460
461
|
if (cwd) {
|
|
@@ -475,7 +476,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
475
476
|
type: "code",
|
|
476
477
|
script,
|
|
477
478
|
cwd,
|
|
478
|
-
resolvedCwd
|
|
479
|
+
resolvedCwd,
|
|
480
|
+
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
479
481
|
});
|
|
480
482
|
continue;
|
|
481
483
|
}
|
|
@@ -570,18 +572,22 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
570
572
|
...promptPath2 ? { promptPath: promptPath2 } : {}
|
|
571
573
|
};
|
|
572
574
|
}
|
|
575
|
+
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
573
576
|
evaluators.push({
|
|
574
577
|
name,
|
|
575
578
|
type: "composite",
|
|
576
579
|
evaluators: memberEvaluators,
|
|
577
|
-
aggregator
|
|
580
|
+
aggregator,
|
|
581
|
+
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
578
582
|
});
|
|
579
583
|
continue;
|
|
580
584
|
}
|
|
581
|
-
if (typeValue === "
|
|
585
|
+
if (typeValue === "expected_tool_calls") {
|
|
586
|
+
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
582
587
|
evaluators.push({
|
|
583
588
|
name,
|
|
584
|
-
type: "
|
|
589
|
+
type: "expected_tool_calls",
|
|
590
|
+
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
585
591
|
});
|
|
586
592
|
continue;
|
|
587
593
|
}
|
|
@@ -637,12 +643,14 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
637
643
|
);
|
|
638
644
|
continue;
|
|
639
645
|
}
|
|
646
|
+
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
640
647
|
const config = {
|
|
641
648
|
name,
|
|
642
649
|
type: "tool_trajectory",
|
|
643
650
|
mode,
|
|
644
651
|
...minimums ? { minimums } : {},
|
|
645
|
-
...expected ? { expected } : {}
|
|
652
|
+
...expected ? { expected } : {},
|
|
653
|
+
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
646
654
|
};
|
|
647
655
|
evaluators.push(config);
|
|
648
656
|
continue;
|
|
@@ -683,19 +691,23 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
683
691
|
logWarning2(`Skipping rubric evaluator '${name}' in '${evalId}': no valid rubrics found`);
|
|
684
692
|
continue;
|
|
685
693
|
}
|
|
694
|
+
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
686
695
|
evaluators.push({
|
|
687
696
|
name,
|
|
688
697
|
type: "llm_judge",
|
|
689
|
-
rubrics: parsedRubrics
|
|
698
|
+
rubrics: parsedRubrics,
|
|
699
|
+
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
690
700
|
});
|
|
691
701
|
continue;
|
|
692
702
|
}
|
|
703
|
+
const weight = validateWeight(rawEvaluator.weight, name, evalId);
|
|
693
704
|
evaluators.push({
|
|
694
705
|
name,
|
|
695
706
|
type: "llm_judge",
|
|
696
707
|
prompt,
|
|
697
708
|
promptPath,
|
|
698
|
-
...parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {}
|
|
709
|
+
...parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {},
|
|
710
|
+
...weight !== void 0 ? { weight } : {}
|
|
699
711
|
});
|
|
700
712
|
}
|
|
701
713
|
return evaluators.length > 0 ? evaluators : void 0;
|
|
@@ -725,6 +737,27 @@ ${detailBlock}${ANSI_RESET3}`);
|
|
|
725
737
|
console.warn(`${ANSI_YELLOW3}Warning: ${message}${ANSI_RESET3}`);
|
|
726
738
|
}
|
|
727
739
|
}
|
|
740
|
+
function validateWeight(rawWeight, evaluatorName, evalId) {
|
|
741
|
+
if (rawWeight === void 0) {
|
|
742
|
+
return void 0;
|
|
743
|
+
}
|
|
744
|
+
if (typeof rawWeight !== "number") {
|
|
745
|
+
throw new Error(
|
|
746
|
+
`Invalid weight for evaluator '${evaluatorName}' in '${evalId}': must be a number`
|
|
747
|
+
);
|
|
748
|
+
}
|
|
749
|
+
if (!Number.isFinite(rawWeight)) {
|
|
750
|
+
throw new Error(
|
|
751
|
+
`Invalid weight for evaluator '${evaluatorName}' in '${evalId}': must be finite (got ${rawWeight})`
|
|
752
|
+
);
|
|
753
|
+
}
|
|
754
|
+
if (rawWeight < 0) {
|
|
755
|
+
throw new Error(
|
|
756
|
+
`Invalid weight for evaluator '${evaluatorName}' in '${evalId}': must be non-negative (got ${rawWeight})`
|
|
757
|
+
);
|
|
758
|
+
}
|
|
759
|
+
return rawWeight;
|
|
760
|
+
}
|
|
728
761
|
|
|
729
762
|
// src/evaluation/loaders/message-processor.ts
|
|
730
763
|
import { readFile as readFile3 } from "node:fs/promises";
|
|
@@ -3510,9 +3543,11 @@ var CodeEvaluator = class {
|
|
|
3510
3543
|
expected_outcome: context.evalCase.expected_outcome,
|
|
3511
3544
|
reference_answer: context.evalCase.reference_answer,
|
|
3512
3545
|
candidate_answer: context.candidate,
|
|
3513
|
-
|
|
3514
|
-
input_files: context.evalCase.file_paths
|
|
3515
|
-
|
|
3546
|
+
guideline_files: context.evalCase.guideline_paths,
|
|
3547
|
+
input_files: context.evalCase.file_paths.filter(
|
|
3548
|
+
(path13) => !context.evalCase.guideline_paths.includes(path13)
|
|
3549
|
+
),
|
|
3550
|
+
input_messages: context.evalCase.input_messages
|
|
3516
3551
|
},
|
|
3517
3552
|
null,
|
|
3518
3553
|
2
|
|
@@ -3778,8 +3813,8 @@ var ToolTrajectoryEvaluator = class {
|
|
|
3778
3813
|
};
|
|
3779
3814
|
}
|
|
3780
3815
|
};
|
|
3781
|
-
var
|
|
3782
|
-
kind = "
|
|
3816
|
+
var ExpectedToolCallsEvaluator = class {
|
|
3817
|
+
kind = "expected_tool_calls";
|
|
3783
3818
|
evaluate(context) {
|
|
3784
3819
|
const { candidateTrace, evalCase } = context;
|
|
3785
3820
|
const expectedSegments = evalCase.expected_segments;
|
|
@@ -4685,14 +4720,12 @@ async function evaluateCandidate(options) {
|
|
|
4685
4720
|
} else {
|
|
4686
4721
|
if (promptInputs.chatPrompt) {
|
|
4687
4722
|
lmProviderRequest = {
|
|
4688
|
-
chat_prompt: promptInputs.chatPrompt
|
|
4689
|
-
guideline_paths: evalCase.guideline_paths
|
|
4723
|
+
chat_prompt: promptInputs.chatPrompt
|
|
4690
4724
|
};
|
|
4691
4725
|
} else {
|
|
4692
4726
|
lmProviderRequest = {
|
|
4693
4727
|
question: promptInputs.question,
|
|
4694
|
-
guidelines: promptInputs.guidelines
|
|
4695
|
-
guideline_paths: evalCase.guideline_paths
|
|
4728
|
+
guidelines: promptInputs.guidelines
|
|
4696
4729
|
};
|
|
4697
4730
|
}
|
|
4698
4731
|
}
|
|
@@ -4799,11 +4832,13 @@ async function runEvaluatorList(options) {
|
|
|
4799
4832
|
now,
|
|
4800
4833
|
judgeProvider
|
|
4801
4834
|
});
|
|
4802
|
-
|
|
4835
|
+
const weight = evaluator.weight ?? 1;
|
|
4836
|
+
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
4803
4837
|
evaluatorResults.push({
|
|
4804
4838
|
name: evaluator.name,
|
|
4805
4839
|
type: evaluator.type,
|
|
4806
4840
|
score: score2.score,
|
|
4841
|
+
weight,
|
|
4807
4842
|
verdict: score2.verdict,
|
|
4808
4843
|
hits: score2.hits,
|
|
4809
4844
|
misses: score2.misses,
|
|
@@ -4826,11 +4861,13 @@ async function runEvaluatorList(options) {
|
|
|
4826
4861
|
promptInputs,
|
|
4827
4862
|
now
|
|
4828
4863
|
});
|
|
4829
|
-
|
|
4864
|
+
const weight = evaluator.weight ?? 1;
|
|
4865
|
+
scored.push({ score: score2, name: evaluator.name, type: "code_judge", weight });
|
|
4830
4866
|
evaluatorResults.push({
|
|
4831
4867
|
name: evaluator.name,
|
|
4832
4868
|
type: "code_judge",
|
|
4833
4869
|
score: score2.score,
|
|
4870
|
+
weight,
|
|
4834
4871
|
verdict: score2.verdict,
|
|
4835
4872
|
hits: score2.hits,
|
|
4836
4873
|
misses: score2.misses,
|
|
@@ -4860,8 +4897,8 @@ async function runEvaluatorList(options) {
|
|
|
4860
4897
|
return new ToolTrajectoryEvaluator({
|
|
4861
4898
|
config: memberConfig
|
|
4862
4899
|
});
|
|
4863
|
-
case "
|
|
4864
|
-
return new
|
|
4900
|
+
case "expected_tool_calls":
|
|
4901
|
+
return new ExpectedToolCallsEvaluator();
|
|
4865
4902
|
default: {
|
|
4866
4903
|
const unknownConfig = memberConfig;
|
|
4867
4904
|
throw new Error(`Unsupported evaluator type in composite: ${unknownConfig.type}`);
|
|
@@ -4883,11 +4920,13 @@ async function runEvaluatorList(options) {
|
|
|
4883
4920
|
now,
|
|
4884
4921
|
judgeProvider
|
|
4885
4922
|
});
|
|
4886
|
-
|
|
4923
|
+
const weight = evaluator.weight ?? 1;
|
|
4924
|
+
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
4887
4925
|
evaluatorResults.push({
|
|
4888
4926
|
name: evaluator.name,
|
|
4889
4927
|
type: evaluator.type,
|
|
4890
4928
|
score: score2.score,
|
|
4929
|
+
weight,
|
|
4891
4930
|
verdict: score2.verdict,
|
|
4892
4931
|
hits: score2.hits,
|
|
4893
4932
|
misses: score2.misses,
|
|
@@ -4911,20 +4950,22 @@ async function runEvaluatorList(options) {
|
|
|
4911
4950
|
candidateTrace,
|
|
4912
4951
|
candidateTraceSummary
|
|
4913
4952
|
});
|
|
4914
|
-
|
|
4953
|
+
const weight = evaluator.weight ?? 1;
|
|
4954
|
+
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
4915
4955
|
evaluatorResults.push({
|
|
4916
4956
|
name: evaluator.name,
|
|
4917
4957
|
type: evaluator.type,
|
|
4918
4958
|
score: score2.score,
|
|
4959
|
+
weight,
|
|
4919
4960
|
verdict: score2.verdict,
|
|
4920
4961
|
hits: score2.hits,
|
|
4921
4962
|
misses: score2.misses,
|
|
4922
4963
|
reasoning: score2.reasoning
|
|
4923
4964
|
});
|
|
4924
4965
|
}
|
|
4925
|
-
if (evaluator.type === "
|
|
4926
|
-
const
|
|
4927
|
-
const score2 =
|
|
4966
|
+
if (evaluator.type === "expected_tool_calls") {
|
|
4967
|
+
const expectedToolCallsEvaluator = new ExpectedToolCallsEvaluator();
|
|
4968
|
+
const score2 = expectedToolCallsEvaluator.evaluate({
|
|
4928
4969
|
evalCase,
|
|
4929
4970
|
candidate,
|
|
4930
4971
|
target,
|
|
@@ -4935,11 +4976,13 @@ async function runEvaluatorList(options) {
|
|
|
4935
4976
|
candidateTrace,
|
|
4936
4977
|
candidateTraceSummary
|
|
4937
4978
|
});
|
|
4938
|
-
|
|
4979
|
+
const weight = evaluator.weight ?? 1;
|
|
4980
|
+
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
4939
4981
|
evaluatorResults.push({
|
|
4940
4982
|
name: evaluator.name,
|
|
4941
4983
|
type: evaluator.type,
|
|
4942
4984
|
score: score2.score,
|
|
4985
|
+
weight,
|
|
4943
4986
|
verdict: score2.verdict,
|
|
4944
4987
|
hits: score2.hits,
|
|
4945
4988
|
misses: score2.misses,
|
|
@@ -4957,15 +5000,18 @@ async function runEvaluatorList(options) {
|
|
|
4957
5000
|
reasoning: message
|
|
4958
5001
|
};
|
|
4959
5002
|
const resultType = evaluator.type === "code" ? "code_judge" : evaluator.type;
|
|
5003
|
+
const weight = evaluator.weight ?? 1;
|
|
4960
5004
|
scored.push({
|
|
4961
5005
|
score: fallbackScore,
|
|
4962
5006
|
name: evaluator.name ?? "unknown",
|
|
4963
|
-
type: resultType ?? "llm_judge"
|
|
5007
|
+
type: resultType ?? "llm_judge",
|
|
5008
|
+
weight
|
|
4964
5009
|
});
|
|
4965
5010
|
evaluatorResults.push({
|
|
4966
5011
|
name: evaluator.name ?? "unknown",
|
|
4967
5012
|
type: resultType ?? "llm_judge",
|
|
4968
5013
|
score: 0,
|
|
5014
|
+
weight,
|
|
4969
5015
|
verdict: "fail",
|
|
4970
5016
|
hits: [],
|
|
4971
5017
|
misses: [`Evaluator '${evaluator.name ?? "unknown"}' failed: ${message}`],
|
|
@@ -4973,7 +5019,9 @@ async function runEvaluatorList(options) {
|
|
|
4973
5019
|
});
|
|
4974
5020
|
}
|
|
4975
5021
|
}
|
|
4976
|
-
const aggregateScore = scored.length > 0 ?
|
|
5022
|
+
const aggregateScore = scored.length > 0 ? computeWeightedMean(
|
|
5023
|
+
scored.map((entry) => ({ score: entry.score.score, weight: entry.weight }))
|
|
5024
|
+
) : 0;
|
|
4977
5025
|
const hits = scored.flatMap((entry) => entry.score.hits);
|
|
4978
5026
|
const misses = scored.flatMap((entry) => entry.score.misses);
|
|
4979
5027
|
const expectedAspectCount = scored.reduce(
|
|
@@ -5199,6 +5247,16 @@ function mapChildResults(children) {
|
|
|
5199
5247
|
evaluator_results: mapChildResults(child.evaluatorResults)
|
|
5200
5248
|
}));
|
|
5201
5249
|
}
|
|
5250
|
+
function computeWeightedMean(entries) {
|
|
5251
|
+
let totalWeight = 0;
|
|
5252
|
+
let weightedSum = 0;
|
|
5253
|
+
for (const entry of entries) {
|
|
5254
|
+
const weight = entry.weight ?? 1;
|
|
5255
|
+
totalWeight += weight;
|
|
5256
|
+
weightedSum += entry.score * weight;
|
|
5257
|
+
}
|
|
5258
|
+
return totalWeight > 0 ? weightedSum / totalWeight : 0;
|
|
5259
|
+
}
|
|
5202
5260
|
|
|
5203
5261
|
// src/evaluation/generators/rubric-generator.ts
|
|
5204
5262
|
import { generateText as generateText3 } from "ai";
|
|
@@ -5287,7 +5345,7 @@ function createAgentKernel() {
|
|
|
5287
5345
|
export {
|
|
5288
5346
|
CodeEvaluator,
|
|
5289
5347
|
CompositeEvaluator,
|
|
5290
|
-
|
|
5348
|
+
ExpectedToolCallsEvaluator,
|
|
5291
5349
|
LlmJudgeEvaluator,
|
|
5292
5350
|
TEST_MESSAGE_ROLES,
|
|
5293
5351
|
ToolTrajectoryEvaluator,
|