@agentv/core 0.10.0 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -91,7 +91,6 @@ type LlmJudgeEvaluatorConfig = {
91
91
  readonly type: "llm_judge";
92
92
  readonly prompt?: string;
93
93
  readonly promptPath?: string;
94
- readonly model?: string;
95
94
  };
96
95
  type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig;
97
96
  /**
@@ -264,6 +263,13 @@ interface TargetDefinition {
264
263
  readonly retryStatusCodes?: unknown | undefined;
265
264
  }
266
265
 
266
+ /**
267
+ * Read metadata from a test suite file (like target name).
268
+ * This is a convenience function for CLI tools that need metadata without loading all eval cases.
269
+ */
270
+ declare function readTestSuiteMetadata(testFilePath: string): Promise<{
271
+ target?: string;
272
+ }>;
267
273
  /**
268
274
  * Determine whether a path references guideline content (instructions or prompts).
269
275
  */
@@ -496,7 +502,6 @@ interface EvaluationContext {
496
502
  readonly judgeProvider?: Provider;
497
503
  readonly systemPrompt?: string;
498
504
  readonly evaluator?: EvaluatorConfig;
499
- readonly judgeModel?: string;
500
505
  }
501
506
  interface EvaluationScore {
502
507
  readonly score: number;
@@ -599,4 +604,4 @@ type AgentKernel = {
599
604
  };
600
605
  declare function createAgentKernel(): AgentKernel;
601
606
 
602
- export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type Evaluator, type EvaluatorConfig, type EvaluatorKind, type EvaluatorResult, type GeminiResolvedConfig, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ResolvedTarget, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type ToolTestMessage, type UserTestMessage, type VSCodeResolvedConfig, buildDirectoryChain, buildPromptInputs, buildSearchRoots, consumeCodexLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, extractCodeBlocks, fileExists, findGitRoot, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, normalizeLineEndings, readTargetDefinitions, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToCodexLogEntries };
607
+ export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type Evaluator, type EvaluatorConfig, type EvaluatorKind, type EvaluatorResult, type GeminiResolvedConfig, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ResolvedTarget, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type ToolTestMessage, type UserTestMessage, type VSCodeResolvedConfig, buildDirectoryChain, buildPromptInputs, buildSearchRoots, consumeCodexLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, extractCodeBlocks, fileExists, findGitRoot, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, normalizeLineEndings, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToCodexLogEntries };
package/dist/index.d.ts CHANGED
@@ -91,7 +91,6 @@ type LlmJudgeEvaluatorConfig = {
91
91
  readonly type: "llm_judge";
92
92
  readonly prompt?: string;
93
93
  readonly promptPath?: string;
94
- readonly model?: string;
95
94
  };
96
95
  type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig;
97
96
  /**
@@ -264,6 +263,13 @@ interface TargetDefinition {
264
263
  readonly retryStatusCodes?: unknown | undefined;
265
264
  }
266
265
 
266
+ /**
267
+ * Read metadata from a test suite file (like target name).
268
+ * This is a convenience function for CLI tools that need metadata without loading all eval cases.
269
+ */
270
+ declare function readTestSuiteMetadata(testFilePath: string): Promise<{
271
+ target?: string;
272
+ }>;
267
273
  /**
268
274
  * Determine whether a path references guideline content (instructions or prompts).
269
275
  */
@@ -496,7 +502,6 @@ interface EvaluationContext {
496
502
  readonly judgeProvider?: Provider;
497
503
  readonly systemPrompt?: string;
498
504
  readonly evaluator?: EvaluatorConfig;
499
- readonly judgeModel?: string;
500
505
  }
501
506
  interface EvaluationScore {
502
507
  readonly score: number;
@@ -599,4 +604,4 @@ type AgentKernel = {
599
604
  };
600
605
  declare function createAgentKernel(): AgentKernel;
601
606
 
602
- export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type Evaluator, type EvaluatorConfig, type EvaluatorKind, type EvaluatorResult, type GeminiResolvedConfig, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ResolvedTarget, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type ToolTestMessage, type UserTestMessage, type VSCodeResolvedConfig, buildDirectoryChain, buildPromptInputs, buildSearchRoots, consumeCodexLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, extractCodeBlocks, fileExists, findGitRoot, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, normalizeLineEndings, readTargetDefinitions, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToCodexLogEntries };
607
+ export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type Evaluator, type EvaluatorConfig, type EvaluatorKind, type EvaluatorResult, type GeminiResolvedConfig, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ResolvedTarget, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type ToolTestMessage, type UserTestMessage, type VSCodeResolvedConfig, buildDirectoryChain, buildPromptInputs, buildSearchRoots, consumeCodexLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, extractCodeBlocks, fileExists, findGitRoot, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, normalizeLineEndings, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToCodexLogEntries };
package/dist/index.js CHANGED
@@ -73,6 +73,33 @@ var ANSI_YELLOW = "\x1B[33m";
73
73
  var ANSI_RESET = "\x1B[0m";
74
74
  var SCHEMA_EVAL_V2 = "agentv-eval-v2";
75
75
  var SCHEMA_CONFIG_V2 = "agentv-config-v2";
76
+ async function readTestSuiteMetadata(testFilePath) {
77
+ try {
78
+ const absolutePath = path.resolve(testFilePath);
79
+ const content = await readFile(absolutePath, "utf8");
80
+ const parsed = parse(content);
81
+ if (!isJsonObject(parsed)) {
82
+ return {};
83
+ }
84
+ return { target: extractTargetFromSuite(parsed) };
85
+ } catch {
86
+ return {};
87
+ }
88
+ }
89
+ function extractTargetFromSuite(suite) {
90
+ const execution = suite.execution;
91
+ if (execution && typeof execution === "object" && !Array.isArray(execution)) {
92
+ const executionTarget = execution.target;
93
+ if (typeof executionTarget === "string" && executionTarget.trim().length > 0) {
94
+ return executionTarget.trim();
95
+ }
96
+ }
97
+ const targetValue = suite.target;
98
+ if (typeof targetValue === "string" && targetValue.trim().length > 0) {
99
+ return targetValue.trim();
100
+ }
101
+ return void 0;
102
+ }
76
103
  async function loadConfig(evalFilePath, repoRoot) {
77
104
  const directories = buildDirectoryChain(evalFilePath, repoRoot);
78
105
  for (const directory of directories) {
@@ -249,6 +276,8 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
249
276
  throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
250
277
  }
251
278
  const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm_judge";
279
+ const globalExecution = isJsonObject(suite.execution) ? suite.execution : void 0;
280
+ const globalTarget = asString(globalExecution?.target) ?? asString(suite.target);
252
281
  const results = [];
253
282
  for (const rawEvalcase of rawTestcases) {
254
283
  if (!isJsonObject(rawEvalcase)) {
@@ -303,7 +332,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
303
332
  const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
304
333
  const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
305
334
  const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
306
- const evaluators = await parseEvaluators(evalcase, searchRoots, id ?? "unknown");
335
+ const evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
307
336
  const userFilePaths = [];
308
337
  for (const segment of inputSegments) {
309
338
  if (segment.type === "file" && typeof segment.resolvedPath === "string") {
@@ -389,14 +418,13 @@ function formatSegment(segment) {
389
418
  const text = asString(segment.text);
390
419
  const filePath = asString(segment.path);
391
420
  if (text && filePath) {
392
- return `=== ${filePath} ===
393
- ${text}`;
421
+ return formatFileContents([{ content: text.trim(), isFile: true, displayPath: filePath }]);
394
422
  }
395
423
  }
396
424
  return void 0;
397
425
  }
398
426
  async function buildPromptInputs(testCase) {
399
- const guidelineContents = [];
427
+ const guidelineParts = [];
400
428
  for (const rawPath of testCase.guideline_paths) {
401
429
  const absolutePath = path.resolve(rawPath);
402
430
  if (!await fileExists2(absolutePath)) {
@@ -404,14 +432,17 @@ async function buildPromptInputs(testCase) {
404
432
  continue;
405
433
  }
406
434
  try {
407
- const content = (await readFile(absolutePath, "utf8")).replace(/\r\n/g, "\n");
408
- guidelineContents.push(`=== ${path.basename(absolutePath)} ===
409
- ${content}`);
435
+ const content = (await readFile(absolutePath, "utf8")).replace(/\r\n/g, "\n").trim();
436
+ guidelineParts.push({
437
+ content,
438
+ isFile: true,
439
+ displayPath: path.basename(absolutePath)
440
+ });
410
441
  } catch (error) {
411
442
  logWarning(`Could not read guideline file ${absolutePath}: ${error.message}`);
412
443
  }
413
444
  }
414
- const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
445
+ const guidelines = formatFileContents(guidelineParts);
415
446
  const segmentsByMessage = [];
416
447
  const fileContentsByPath = /* @__PURE__ */ new Map();
417
448
  for (const segment of testCase.input_segments) {
@@ -613,6 +644,20 @@ function cloneJsonValue(value) {
613
644
  }
614
645
  return cloneJsonObject(value);
615
646
  }
647
+ function formatFileContents(parts) {
648
+ const fileCount = parts.filter((p) => p.isFile).length;
649
+ if (fileCount > 0) {
650
+ return parts.map((part) => {
651
+ if (part.isFile && part.displayPath) {
652
+ return `<file path="${part.displayPath}">
653
+ ${part.content}
654
+ </file>`;
655
+ }
656
+ return part.content;
657
+ }).join("\n\n");
658
+ }
659
+ return parts.map((p) => p.content).join(" ");
660
+ }
616
661
  async function resolveAssistantContent(content, searchRoots, verbose) {
617
662
  if (typeof content === "string") {
618
663
  return content;
@@ -623,7 +668,7 @@ async function resolveAssistantContent(content, searchRoots, verbose) {
623
668
  const parts = [];
624
669
  for (const entry of content) {
625
670
  if (typeof entry === "string") {
626
- parts.push(entry);
671
+ parts.push({ content: entry, isFile: false });
627
672
  continue;
628
673
  }
629
674
  if (!isJsonObject(entry)) {
@@ -645,8 +690,8 @@ async function resolveAssistantContent(content, searchRoots, verbose) {
645
690
  continue;
646
691
  }
647
692
  try {
648
- const fileContent = (await readFile(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
649
- parts.push(fileContent);
693
+ const fileContent = (await readFile(resolvedPath, "utf8")).replace(/\r\n/g, "\n").trim();
694
+ parts.push({ content: fileContent, isFile: true, displayPath });
650
695
  if (verbose) {
651
696
  console.log(` [Expected Assistant File] Found: ${displayPath}`);
652
697
  console.log(` Resolved to: ${resolvedPath}`);
@@ -658,21 +703,21 @@ async function resolveAssistantContent(content, searchRoots, verbose) {
658
703
  }
659
704
  const textValue = asString(entry.text);
660
705
  if (typeof textValue === "string") {
661
- parts.push(textValue);
706
+ parts.push({ content: textValue, isFile: false });
662
707
  continue;
663
708
  }
664
709
  const valueValue = asString(entry.value);
665
710
  if (typeof valueValue === "string") {
666
- parts.push(valueValue);
711
+ parts.push({ content: valueValue, isFile: false });
667
712
  continue;
668
713
  }
669
- parts.push(JSON.stringify(entry));
714
+ parts.push({ content: JSON.stringify(entry), isFile: false });
670
715
  }
671
- return parts.join(" ");
716
+ return formatFileContents(parts);
672
717
  }
673
- async function parseEvaluators(rawEvalCase, searchRoots, evalId) {
718
+ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
674
719
  const execution = rawEvalCase.execution;
675
- const candidateEvaluators = isJsonObject(execution) ? execution.evaluators ?? rawEvalCase.evaluators : rawEvalCase.evaluators;
720
+ const candidateEvaluators = isJsonObject(execution) ? execution.evaluators ?? rawEvalCase.evaluators : rawEvalCase.evaluators ?? globalExecution?.evaluators;
676
721
  if (candidateEvaluators === void 0) {
677
722
  return void 0;
678
723
  }
@@ -710,6 +755,8 @@ async function parseEvaluators(rawEvalCase, searchRoots, evalId) {
710
755
  resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
711
756
  );
712
757
  }
758
+ } else {
759
+ resolvedCwd = searchRoots[0];
713
760
  }
714
761
  evaluators.push({
715
762
  name,
@@ -738,8 +785,7 @@ async function parseEvaluators(rawEvalCase, searchRoots, evalId) {
738
785
  name,
739
786
  type: "llm_judge",
740
787
  prompt,
741
- promptPath,
742
- model
788
+ promptPath
743
789
  });
744
790
  }
745
791
  return evaluators.length > 0 ? evaluators : void 0;
@@ -2532,10 +2578,7 @@ var LlmJudgeEvaluator = class {
2532
2578
  prompt = substituteVariables(systemPrompt, variables);
2533
2579
  systemPrompt = buildSystemPrompt(hasReferenceAnswer);
2534
2580
  }
2535
- const metadata = {
2536
- ...systemPrompt !== void 0 ? { systemPrompt } : {},
2537
- ...context.judgeModel !== void 0 ? { model: context.judgeModel } : {}
2538
- };
2581
+ const metadata = systemPrompt !== void 0 ? { systemPrompt } : {};
2539
2582
  const response = await judgeProvider.invoke({
2540
2583
  question: prompt,
2541
2584
  metadata,
@@ -2555,8 +2598,7 @@ var LlmJudgeEvaluator = class {
2555
2598
  provider: judgeProvider.id,
2556
2599
  prompt,
2557
2600
  target: context.target.name,
2558
- ...systemPrompt !== void 0 ? { systemPrompt } : {},
2559
- ...context.judgeModel !== void 0 ? { model: context.judgeModel } : {}
2601
+ ...systemPrompt !== void 0 && { systemPrompt }
2560
2602
  };
2561
2603
  return {
2562
2604
  score,
@@ -3550,8 +3592,7 @@ async function runLlmJudgeEvaluator(options) {
3550
3592
  now,
3551
3593
  judgeProvider,
3552
3594
  systemPrompt: customPrompt,
3553
- evaluator: config,
3554
- judgeModel: config.model
3595
+ evaluator: config
3555
3596
  });
3556
3597
  }
3557
3598
  async function resolveCustomPrompt(config) {
@@ -3736,6 +3777,7 @@ export {
3736
3777
  loadEvalCases,
3737
3778
  normalizeLineEndings,
3738
3779
  readTargetDefinitions,
3780
+ readTestSuiteMetadata,
3739
3781
  readTextFile,
3740
3782
  resolveAndCreateProvider,
3741
3783
  resolveFileReference,