@agentv/core 0.7.2 → 0.7.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +23 -49
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +3 -3
- package/dist/index.d.ts +3 -3
- package/dist/index.js +23 -49
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.d.cts
CHANGED
|
@@ -95,7 +95,7 @@ type LlmJudgeEvaluatorConfig = {
|
|
|
95
95
|
};
|
|
96
96
|
type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig;
|
|
97
97
|
/**
|
|
98
|
-
*
|
|
98
|
+
* Eval case definition sourced from AgentV specs.
|
|
99
99
|
*/
|
|
100
100
|
interface EvalCase {
|
|
101
101
|
readonly id: string;
|
|
@@ -104,7 +104,6 @@ interface EvalCase {
|
|
|
104
104
|
readonly question: string;
|
|
105
105
|
readonly input_segments: readonly JsonObject[];
|
|
106
106
|
readonly output_segments: readonly JsonObject[];
|
|
107
|
-
readonly system_message?: string;
|
|
108
107
|
readonly reference_answer: string;
|
|
109
108
|
readonly guideline_paths: readonly string[];
|
|
110
109
|
readonly guideline_patterns?: readonly string[];
|
|
@@ -115,7 +114,7 @@ interface EvalCase {
|
|
|
115
114
|
readonly evaluators?: readonly EvaluatorConfig[];
|
|
116
115
|
}
|
|
117
116
|
/**
|
|
118
|
-
* Evaluator scorecard for a single
|
|
117
|
+
* Evaluator scorecard for a single eval case run.
|
|
119
118
|
*/
|
|
120
119
|
interface EvaluationResult {
|
|
121
120
|
readonly eval_id: string;
|
|
@@ -159,6 +158,7 @@ declare function isGuidelineFile(filePath: string, patterns?: readonly string[])
|
|
|
159
158
|
declare function extractCodeBlocks(segments: readonly JsonObject[]): readonly string[];
|
|
160
159
|
type LoadOptions = {
|
|
161
160
|
readonly verbose?: boolean;
|
|
161
|
+
readonly evalId?: string;
|
|
162
162
|
};
|
|
163
163
|
/**
|
|
164
164
|
* Load eval cases from an AgentV YAML specification file.
|
package/dist/index.d.ts
CHANGED
|
@@ -95,7 +95,7 @@ type LlmJudgeEvaluatorConfig = {
|
|
|
95
95
|
};
|
|
96
96
|
type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig;
|
|
97
97
|
/**
|
|
98
|
-
*
|
|
98
|
+
* Eval case definition sourced from AgentV specs.
|
|
99
99
|
*/
|
|
100
100
|
interface EvalCase {
|
|
101
101
|
readonly id: string;
|
|
@@ -104,7 +104,6 @@ interface EvalCase {
|
|
|
104
104
|
readonly question: string;
|
|
105
105
|
readonly input_segments: readonly JsonObject[];
|
|
106
106
|
readonly output_segments: readonly JsonObject[];
|
|
107
|
-
readonly system_message?: string;
|
|
108
107
|
readonly reference_answer: string;
|
|
109
108
|
readonly guideline_paths: readonly string[];
|
|
110
109
|
readonly guideline_patterns?: readonly string[];
|
|
@@ -115,7 +114,7 @@ interface EvalCase {
|
|
|
115
114
|
readonly evaluators?: readonly EvaluatorConfig[];
|
|
116
115
|
}
|
|
117
116
|
/**
|
|
118
|
-
* Evaluator scorecard for a single
|
|
117
|
+
* Evaluator scorecard for a single eval case run.
|
|
119
118
|
*/
|
|
120
119
|
interface EvaluationResult {
|
|
121
120
|
readonly eval_id: string;
|
|
@@ -159,6 +158,7 @@ declare function isGuidelineFile(filePath: string, patterns?: readonly string[])
|
|
|
159
158
|
declare function extractCodeBlocks(segments: readonly JsonObject[]): readonly string[];
|
|
160
159
|
type LoadOptions = {
|
|
161
160
|
readonly verbose?: boolean;
|
|
161
|
+
readonly evalId?: string;
|
|
162
162
|
};
|
|
163
163
|
/**
|
|
164
164
|
* Load eval cases from an AgentV YAML specification file.
|
package/dist/index.js
CHANGED
|
@@ -218,6 +218,7 @@ async function processMessages(options) {
|
|
|
218
218
|
}
|
|
219
219
|
async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
220
220
|
const verbose = options?.verbose ?? false;
|
|
221
|
+
const evalIdFilter = options?.evalId;
|
|
221
222
|
const absoluteTestPath = path.resolve(evalFilePath);
|
|
222
223
|
if (!await fileExists2(absoluteTestPath)) {
|
|
223
224
|
throw new Error(`Test file not found: ${evalFilePath}`);
|
|
@@ -249,62 +250,39 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
249
250
|
const results = [];
|
|
250
251
|
for (const rawEvalcase of rawTestcases) {
|
|
251
252
|
if (!isJsonObject(rawEvalcase)) {
|
|
252
|
-
logWarning("Skipping invalid
|
|
253
|
+
logWarning("Skipping invalid eval case entry (expected object)");
|
|
253
254
|
continue;
|
|
254
255
|
}
|
|
255
256
|
const evalcase = rawEvalcase;
|
|
256
257
|
const id = asString(evalcase.id);
|
|
258
|
+
if (evalIdFilter && id !== evalIdFilter) {
|
|
259
|
+
continue;
|
|
260
|
+
}
|
|
257
261
|
const conversationId = asString(evalcase.conversation_id);
|
|
258
262
|
const outcome = asString(evalcase.outcome);
|
|
259
263
|
const inputMessagesValue = evalcase.input_messages;
|
|
260
264
|
const expectedMessagesValue = evalcase.expected_messages;
|
|
261
265
|
if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
|
|
262
|
-
logWarning(`Skipping incomplete
|
|
266
|
+
logWarning(`Skipping incomplete eval case: ${id ?? "unknown"}`);
|
|
263
267
|
continue;
|
|
264
268
|
}
|
|
265
269
|
if (!Array.isArray(expectedMessagesValue)) {
|
|
266
|
-
logWarning(`
|
|
270
|
+
logWarning(`Eval case '${id}' missing expected_messages array`);
|
|
267
271
|
continue;
|
|
268
272
|
}
|
|
269
273
|
const inputMessages = inputMessagesValue.filter((msg) => isTestMessage(msg));
|
|
270
274
|
const expectedMessages = expectedMessagesValue.filter((msg) => isTestMessage(msg));
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
const systemMessages = inputMessages.filter((message) => message.role === "system");
|
|
274
|
-
if (assistantMessages.length === 0) {
|
|
275
|
-
logWarning(`No assistant message found for test case: ${id}`);
|
|
275
|
+
if (expectedMessages.length === 0) {
|
|
276
|
+
logWarning(`No expected message found for eval case: ${id}`);
|
|
276
277
|
continue;
|
|
277
278
|
}
|
|
278
|
-
if (
|
|
279
|
-
logWarning(`Multiple
|
|
280
|
-
}
|
|
281
|
-
if (systemMessages.length > 1) {
|
|
282
|
-
logWarning(`Multiple system messages found for test case: ${id}, using first`);
|
|
283
|
-
}
|
|
284
|
-
let systemMessageContent;
|
|
285
|
-
if (systemMessages.length > 0) {
|
|
286
|
-
const content = systemMessages[0]?.content;
|
|
287
|
-
if (typeof content === "string") {
|
|
288
|
-
systemMessageContent = content;
|
|
289
|
-
} else if (Array.isArray(content)) {
|
|
290
|
-
const textParts = [];
|
|
291
|
-
for (const segment of content) {
|
|
292
|
-
if (isJsonObject(segment)) {
|
|
293
|
-
const value = segment.value;
|
|
294
|
-
if (typeof value === "string") {
|
|
295
|
-
textParts.push(value);
|
|
296
|
-
}
|
|
297
|
-
}
|
|
298
|
-
}
|
|
299
|
-
if (textParts.length > 0) {
|
|
300
|
-
systemMessageContent = textParts.join("\n\n");
|
|
301
|
-
}
|
|
302
|
-
}
|
|
279
|
+
if (expectedMessages.length > 1) {
|
|
280
|
+
logWarning(`Multiple expected messages found for eval case: ${id}, using first`);
|
|
303
281
|
}
|
|
304
282
|
const guidelinePaths = [];
|
|
305
283
|
const inputTextParts = [];
|
|
306
284
|
const inputSegments = await processMessages({
|
|
307
|
-
messages:
|
|
285
|
+
messages: inputMessages,
|
|
308
286
|
searchRoots,
|
|
309
287
|
repoRootPath,
|
|
310
288
|
guidelinePatterns,
|
|
@@ -314,7 +292,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
314
292
|
verbose
|
|
315
293
|
});
|
|
316
294
|
const outputSegments = await processMessages({
|
|
317
|
-
messages:
|
|
295
|
+
messages: expectedMessages,
|
|
318
296
|
searchRoots,
|
|
319
297
|
repoRootPath,
|
|
320
298
|
guidelinePatterns,
|
|
@@ -322,10 +300,10 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
322
300
|
verbose
|
|
323
301
|
});
|
|
324
302
|
const codeSnippets = extractCodeBlocks(inputSegments);
|
|
325
|
-
const
|
|
326
|
-
const referenceAnswer = await resolveAssistantContent(
|
|
303
|
+
const expectedContent = expectedMessages[0]?.content;
|
|
304
|
+
const referenceAnswer = await resolveAssistantContent(expectedContent, searchRoots, verbose);
|
|
327
305
|
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
328
|
-
const
|
|
306
|
+
const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
|
|
329
307
|
const evaluators = await parseEvaluators(evalcase, searchRoots, id ?? "unknown");
|
|
330
308
|
const userFilePaths = [];
|
|
331
309
|
for (const segment of inputSegments) {
|
|
@@ -344,19 +322,18 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
344
322
|
question,
|
|
345
323
|
input_segments: inputSegments,
|
|
346
324
|
output_segments: outputSegments,
|
|
347
|
-
system_message: systemMessageContent,
|
|
348
325
|
reference_answer: referenceAnswer,
|
|
349
326
|
guideline_paths: guidelinePaths.map((guidelinePath) => path.resolve(guidelinePath)),
|
|
350
327
|
guideline_patterns: guidelinePatterns,
|
|
351
328
|
file_paths: allFilePaths,
|
|
352
329
|
code_snippets: codeSnippets,
|
|
353
330
|
expected_outcome: outcome,
|
|
354
|
-
evaluator:
|
|
331
|
+
evaluator: evalCaseEvaluatorKind,
|
|
355
332
|
evaluators
|
|
356
333
|
};
|
|
357
334
|
if (verbose) {
|
|
358
335
|
console.log(`
|
|
359
|
-
[
|
|
336
|
+
[Eval Case: ${id}]`);
|
|
360
337
|
if (testCase.guideline_paths.length > 0) {
|
|
361
338
|
console.log(` Guidelines used: ${testCase.guideline_paths.length}`);
|
|
362
339
|
for (const guidelinePath of testCase.guideline_paths) {
|
|
@@ -415,7 +392,7 @@ ${body}`);
|
|
|
415
392
|
}
|
|
416
393
|
const question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
|
|
417
394
|
const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
|
|
418
|
-
return { question, guidelines
|
|
395
|
+
return { question, guidelines };
|
|
419
396
|
}
|
|
420
397
|
async function fileExists2(absolutePath) {
|
|
421
398
|
try {
|
|
@@ -2918,7 +2895,6 @@ var CodeEvaluator = class {
|
|
|
2918
2895
|
expected_outcome: context.evalCase.expected_outcome,
|
|
2919
2896
|
reference_answer: context.evalCase.reference_answer,
|
|
2920
2897
|
candidate_answer: context.candidate,
|
|
2921
|
-
system_message: context.promptInputs.systemMessage ?? "",
|
|
2922
2898
|
guideline_paths: context.evalCase.guideline_paths,
|
|
2923
2899
|
input_files: context.evalCase.file_paths,
|
|
2924
2900
|
input_segments: context.evalCase.input_segments
|
|
@@ -3160,7 +3136,7 @@ function validateConcurrency(concurrency) {
|
|
|
3160
3136
|
// src/evaluation/orchestrator.ts
|
|
3161
3137
|
async function runEvaluation(options) {
|
|
3162
3138
|
const {
|
|
3163
|
-
testFilePath,
|
|
3139
|
+
testFilePath: evalFilePath,
|
|
3164
3140
|
repoRoot,
|
|
3165
3141
|
target,
|
|
3166
3142
|
targets,
|
|
@@ -3179,11 +3155,11 @@ async function runEvaluation(options) {
|
|
|
3179
3155
|
onProgress
|
|
3180
3156
|
} = options;
|
|
3181
3157
|
const load = loadEvalCases;
|
|
3182
|
-
const evalCases = await load(
|
|
3158
|
+
const evalCases = await load(evalFilePath, repoRoot, { verbose, evalId });
|
|
3183
3159
|
const filteredEvalCases = filterEvalCases(evalCases, evalId);
|
|
3184
3160
|
if (filteredEvalCases.length === 0) {
|
|
3185
3161
|
if (evalId) {
|
|
3186
|
-
throw new Error(`
|
|
3162
|
+
throw new Error(`Eval case with id '${evalId}' not found in ${evalFilePath}`);
|
|
3187
3163
|
}
|
|
3188
3164
|
return [];
|
|
3189
3165
|
}
|
|
@@ -3562,8 +3538,7 @@ async function evaluateCandidate(options) {
|
|
|
3562
3538
|
const rawRequest = {
|
|
3563
3539
|
question: promptInputs.question,
|
|
3564
3540
|
...isAgentProvider(provider) ? {} : { guidelines: promptInputs.guidelines },
|
|
3565
|
-
guideline_paths: evalCase.guideline_paths
|
|
3566
|
-
system_message: promptInputs.systemMessage ?? ""
|
|
3541
|
+
guideline_paths: evalCase.guideline_paths
|
|
3567
3542
|
};
|
|
3568
3543
|
return {
|
|
3569
3544
|
eval_id: evalCase.id,
|
|
@@ -3827,7 +3802,6 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
3827
3802
|
question: promptInputs.question,
|
|
3828
3803
|
...isAgentProvider(provider) ? {} : { guidelines: promptInputs.guidelines },
|
|
3829
3804
|
guideline_paths: evalCase.guideline_paths,
|
|
3830
|
-
system_message: promptInputs.systemMessage ?? "",
|
|
3831
3805
|
error: message
|
|
3832
3806
|
};
|
|
3833
3807
|
return {
|