@agentv/core 0.7.2 → 0.7.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -95,7 +95,7 @@ type LlmJudgeEvaluatorConfig = {
95
95
  };
96
96
  type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig;
97
97
  /**
98
- * Test case definition sourced from AgentV specs.
98
+ * Eval case definition sourced from AgentV specs.
99
99
  */
100
100
  interface EvalCase {
101
101
  readonly id: string;
@@ -104,7 +104,6 @@ interface EvalCase {
104
104
  readonly question: string;
105
105
  readonly input_segments: readonly JsonObject[];
106
106
  readonly output_segments: readonly JsonObject[];
107
- readonly system_message?: string;
108
107
  readonly reference_answer: string;
109
108
  readonly guideline_paths: readonly string[];
110
109
  readonly guideline_patterns?: readonly string[];
@@ -115,7 +114,7 @@ interface EvalCase {
115
114
  readonly evaluators?: readonly EvaluatorConfig[];
116
115
  }
117
116
  /**
118
- * Evaluator scorecard for a single test case run.
117
+ * Evaluator scorecard for a single eval case run.
119
118
  */
120
119
  interface EvaluationResult {
121
120
  readonly eval_id: string;
@@ -159,6 +158,7 @@ declare function isGuidelineFile(filePath: string, patterns?: readonly string[])
159
158
  declare function extractCodeBlocks(segments: readonly JsonObject[]): readonly string[];
160
159
  type LoadOptions = {
161
160
  readonly verbose?: boolean;
161
+ readonly evalId?: string;
162
162
  };
163
163
  /**
164
164
  * Load eval cases from a AgentV YAML specification file.
package/dist/index.d.ts CHANGED
@@ -95,7 +95,7 @@ type LlmJudgeEvaluatorConfig = {
95
95
  };
96
96
  type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig;
97
97
  /**
98
- * Test case definition sourced from AgentV specs.
98
+ * Eval case definition sourced from AgentV specs.
99
99
  */
100
100
  interface EvalCase {
101
101
  readonly id: string;
@@ -104,7 +104,6 @@ interface EvalCase {
104
104
  readonly question: string;
105
105
  readonly input_segments: readonly JsonObject[];
106
106
  readonly output_segments: readonly JsonObject[];
107
- readonly system_message?: string;
108
107
  readonly reference_answer: string;
109
108
  readonly guideline_paths: readonly string[];
110
109
  readonly guideline_patterns?: readonly string[];
@@ -115,7 +114,7 @@ interface EvalCase {
115
114
  readonly evaluators?: readonly EvaluatorConfig[];
116
115
  }
117
116
  /**
118
- * Evaluator scorecard for a single test case run.
117
+ * Evaluator scorecard for a single eval case run.
119
118
  */
120
119
  interface EvaluationResult {
121
120
  readonly eval_id: string;
@@ -159,6 +158,7 @@ declare function isGuidelineFile(filePath: string, patterns?: readonly string[])
159
158
  declare function extractCodeBlocks(segments: readonly JsonObject[]): readonly string[];
160
159
  type LoadOptions = {
161
160
  readonly verbose?: boolean;
161
+ readonly evalId?: string;
162
162
  };
163
163
  /**
164
164
  * Load eval cases from a AgentV YAML specification file.
package/dist/index.js CHANGED
@@ -218,6 +218,7 @@ async function processMessages(options) {
218
218
  }
219
219
  async function loadEvalCases(evalFilePath, repoRoot, options) {
220
220
  const verbose = options?.verbose ?? false;
221
+ const evalIdFilter = options?.evalId;
221
222
  const absoluteTestPath = path.resolve(evalFilePath);
222
223
  if (!await fileExists2(absoluteTestPath)) {
223
224
  throw new Error(`Test file not found: ${evalFilePath}`);
@@ -249,62 +250,39 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
249
250
  const results = [];
250
251
  for (const rawEvalcase of rawTestcases) {
251
252
  if (!isJsonObject(rawEvalcase)) {
252
- logWarning("Skipping invalid test case entry (expected object)");
253
+ logWarning("Skipping invalid eval case entry (expected object)");
253
254
  continue;
254
255
  }
255
256
  const evalcase = rawEvalcase;
256
257
  const id = asString(evalcase.id);
258
+ if (evalIdFilter && id !== evalIdFilter) {
259
+ continue;
260
+ }
257
261
  const conversationId = asString(evalcase.conversation_id);
258
262
  const outcome = asString(evalcase.outcome);
259
263
  const inputMessagesValue = evalcase.input_messages;
260
264
  const expectedMessagesValue = evalcase.expected_messages;
261
265
  if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
262
- logWarning(`Skipping incomplete test case: ${id ?? "unknown"}`);
266
+ logWarning(`Skipping incomplete eval case: ${id ?? "unknown"}`);
263
267
  continue;
264
268
  }
265
269
  if (!Array.isArray(expectedMessagesValue)) {
266
- logWarning(`Test case '${id}' missing expected_messages array`);
270
+ logWarning(`Eval case '${id}' missing expected_messages array`);
267
271
  continue;
268
272
  }
269
273
  const inputMessages = inputMessagesValue.filter((msg) => isTestMessage(msg));
270
274
  const expectedMessages = expectedMessagesValue.filter((msg) => isTestMessage(msg));
271
- const assistantMessages = expectedMessages.filter((message) => message.role === "assistant");
272
- const userMessages = inputMessages.filter((message) => message.role === "user");
273
- const systemMessages = inputMessages.filter((message) => message.role === "system");
274
- if (assistantMessages.length === 0) {
275
- logWarning(`No assistant message found for test case: ${id}`);
275
+ if (expectedMessages.length === 0) {
276
+ logWarning(`No expected message found for eval case: ${id}`);
276
277
  continue;
277
278
  }
278
- if (assistantMessages.length > 1) {
279
- logWarning(`Multiple assistant messages found for test case: ${id}, using first`);
280
- }
281
- if (systemMessages.length > 1) {
282
- logWarning(`Multiple system messages found for test case: ${id}, using first`);
283
- }
284
- let systemMessageContent;
285
- if (systemMessages.length > 0) {
286
- const content = systemMessages[0]?.content;
287
- if (typeof content === "string") {
288
- systemMessageContent = content;
289
- } else if (Array.isArray(content)) {
290
- const textParts = [];
291
- for (const segment of content) {
292
- if (isJsonObject(segment)) {
293
- const value = segment.value;
294
- if (typeof value === "string") {
295
- textParts.push(value);
296
- }
297
- }
298
- }
299
- if (textParts.length > 0) {
300
- systemMessageContent = textParts.join("\n\n");
301
- }
302
- }
279
+ if (expectedMessages.length > 1) {
280
+ logWarning(`Multiple expected messages found for eval case: ${id}, using first`);
303
281
  }
304
282
  const guidelinePaths = [];
305
283
  const inputTextParts = [];
306
284
  const inputSegments = await processMessages({
307
- messages: userMessages,
285
+ messages: inputMessages,
308
286
  searchRoots,
309
287
  repoRootPath,
310
288
  guidelinePatterns,
@@ -314,7 +292,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
314
292
  verbose
315
293
  });
316
294
  const outputSegments = await processMessages({
317
- messages: assistantMessages,
295
+ messages: expectedMessages,
318
296
  searchRoots,
319
297
  repoRootPath,
320
298
  guidelinePatterns,
@@ -322,10 +300,10 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
322
300
  verbose
323
301
  });
324
302
  const codeSnippets = extractCodeBlocks(inputSegments);
325
- const assistantContent = assistantMessages[0]?.content;
326
- const referenceAnswer = await resolveAssistantContent(assistantContent, searchRoots, verbose);
303
+ const expectedContent = expectedMessages[0]?.content;
304
+ const referenceAnswer = await resolveAssistantContent(expectedContent, searchRoots, verbose);
327
305
  const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
328
- const testCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
306
+ const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
329
307
  const evaluators = await parseEvaluators(evalcase, searchRoots, id ?? "unknown");
330
308
  const userFilePaths = [];
331
309
  for (const segment of inputSegments) {
@@ -344,19 +322,18 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
344
322
  question,
345
323
  input_segments: inputSegments,
346
324
  output_segments: outputSegments,
347
- system_message: systemMessageContent,
348
325
  reference_answer: referenceAnswer,
349
326
  guideline_paths: guidelinePaths.map((guidelinePath) => path.resolve(guidelinePath)),
350
327
  guideline_patterns: guidelinePatterns,
351
328
  file_paths: allFilePaths,
352
329
  code_snippets: codeSnippets,
353
330
  expected_outcome: outcome,
354
- evaluator: testCaseEvaluatorKind,
331
+ evaluator: evalCaseEvaluatorKind,
355
332
  evaluators
356
333
  };
357
334
  if (verbose) {
358
335
  console.log(`
359
- [Test Case: ${id}]`);
336
+ [Eval Case: ${id}]`);
360
337
  if (testCase.guideline_paths.length > 0) {
361
338
  console.log(` Guidelines used: ${testCase.guideline_paths.length}`);
362
339
  for (const guidelinePath of testCase.guideline_paths) {
@@ -415,7 +392,7 @@ ${body}`);
415
392
  }
416
393
  const question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
417
394
  const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
418
- return { question, guidelines, systemMessage: testCase.system_message };
395
+ return { question, guidelines };
419
396
  }
420
397
  async function fileExists2(absolutePath) {
421
398
  try {
@@ -2918,7 +2895,6 @@ var CodeEvaluator = class {
2918
2895
  expected_outcome: context.evalCase.expected_outcome,
2919
2896
  reference_answer: context.evalCase.reference_answer,
2920
2897
  candidate_answer: context.candidate,
2921
- system_message: context.promptInputs.systemMessage ?? "",
2922
2898
  guideline_paths: context.evalCase.guideline_paths,
2923
2899
  input_files: context.evalCase.file_paths,
2924
2900
  input_segments: context.evalCase.input_segments
@@ -3160,7 +3136,7 @@ function validateConcurrency(concurrency) {
3160
3136
  // src/evaluation/orchestrator.ts
3161
3137
  async function runEvaluation(options) {
3162
3138
  const {
3163
- testFilePath,
3139
+ testFilePath: evalFilePath,
3164
3140
  repoRoot,
3165
3141
  target,
3166
3142
  targets,
@@ -3179,11 +3155,11 @@ async function runEvaluation(options) {
3179
3155
  onProgress
3180
3156
  } = options;
3181
3157
  const load = loadEvalCases;
3182
- const evalCases = await load(testFilePath, repoRoot, { verbose });
3158
+ const evalCases = await load(evalFilePath, repoRoot, { verbose, evalId });
3183
3159
  const filteredEvalCases = filterEvalCases(evalCases, evalId);
3184
3160
  if (filteredEvalCases.length === 0) {
3185
3161
  if (evalId) {
3186
- throw new Error(`Test case with id '${evalId}' not found in ${testFilePath}`);
3162
+ throw new Error(`Eval case with id '${evalId}' not found in ${evalFilePath}`);
3187
3163
  }
3188
3164
  return [];
3189
3165
  }
@@ -3562,8 +3538,7 @@ async function evaluateCandidate(options) {
3562
3538
  const rawRequest = {
3563
3539
  question: promptInputs.question,
3564
3540
  ...isAgentProvider(provider) ? {} : { guidelines: promptInputs.guidelines },
3565
- guideline_paths: evalCase.guideline_paths,
3566
- system_message: promptInputs.systemMessage ?? ""
3541
+ guideline_paths: evalCase.guideline_paths
3567
3542
  };
3568
3543
  return {
3569
3544
  eval_id: evalCase.id,
@@ -3827,7 +3802,6 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
3827
3802
  question: promptInputs.question,
3828
3803
  ...isAgentProvider(provider) ? {} : { guidelines: promptInputs.guidelines },
3829
3804
  guideline_paths: evalCase.guideline_paths,
3830
- system_message: promptInputs.systemMessage ?? "",
3831
3805
  error: message
3832
3806
  };
3833
3807
  return {