@agentv/core 0.7.2 → 0.7.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -382,6 +382,7 @@ async function processMessages(options) {
382
382
  }
383
383
  async function loadEvalCases(evalFilePath, repoRoot, options) {
384
384
  const verbose = options?.verbose ?? false;
385
+ const evalIdFilter = options?.evalId;
385
386
  const absoluteTestPath = import_node_path2.default.resolve(evalFilePath);
386
387
  if (!await fileExists2(absoluteTestPath)) {
387
388
  throw new Error(`Test file not found: ${evalFilePath}`);
@@ -413,62 +414,39 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
413
414
  const results = [];
414
415
  for (const rawEvalcase of rawTestcases) {
415
416
  if (!isJsonObject(rawEvalcase)) {
416
- logWarning("Skipping invalid test case entry (expected object)");
417
+ logWarning("Skipping invalid eval case entry (expected object)");
417
418
  continue;
418
419
  }
419
420
  const evalcase = rawEvalcase;
420
421
  const id = asString(evalcase.id);
422
+ if (evalIdFilter && id !== evalIdFilter) {
423
+ continue;
424
+ }
421
425
  const conversationId = asString(evalcase.conversation_id);
422
426
  const outcome = asString(evalcase.outcome);
423
427
  const inputMessagesValue = evalcase.input_messages;
424
428
  const expectedMessagesValue = evalcase.expected_messages;
425
429
  if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
426
- logWarning(`Skipping incomplete test case: ${id ?? "unknown"}`);
430
+ logWarning(`Skipping incomplete eval case: ${id ?? "unknown"}`);
427
431
  continue;
428
432
  }
429
433
  if (!Array.isArray(expectedMessagesValue)) {
430
- logWarning(`Test case '${id}' missing expected_messages array`);
434
+ logWarning(`Eval case '${id}' missing expected_messages array`);
431
435
  continue;
432
436
  }
433
437
  const inputMessages = inputMessagesValue.filter((msg) => isTestMessage(msg));
434
438
  const expectedMessages = expectedMessagesValue.filter((msg) => isTestMessage(msg));
435
- const assistantMessages = expectedMessages.filter((message) => message.role === "assistant");
436
- const userMessages = inputMessages.filter((message) => message.role === "user");
437
- const systemMessages = inputMessages.filter((message) => message.role === "system");
438
- if (assistantMessages.length === 0) {
439
- logWarning(`No assistant message found for test case: ${id}`);
439
+ if (expectedMessages.length === 0) {
440
+ logWarning(`No expected message found for eval case: ${id}`);
440
441
  continue;
441
442
  }
442
- if (assistantMessages.length > 1) {
443
- logWarning(`Multiple assistant messages found for test case: ${id}, using first`);
444
- }
445
- if (systemMessages.length > 1) {
446
- logWarning(`Multiple system messages found for test case: ${id}, using first`);
447
- }
448
- let systemMessageContent;
449
- if (systemMessages.length > 0) {
450
- const content = systemMessages[0]?.content;
451
- if (typeof content === "string") {
452
- systemMessageContent = content;
453
- } else if (Array.isArray(content)) {
454
- const textParts = [];
455
- for (const segment of content) {
456
- if (isJsonObject(segment)) {
457
- const value = segment.value;
458
- if (typeof value === "string") {
459
- textParts.push(value);
460
- }
461
- }
462
- }
463
- if (textParts.length > 0) {
464
- systemMessageContent = textParts.join("\n\n");
465
- }
466
- }
443
+ if (expectedMessages.length > 1) {
444
+ logWarning(`Multiple expected messages found for eval case: ${id}, using first`);
467
445
  }
468
446
  const guidelinePaths = [];
469
447
  const inputTextParts = [];
470
448
  const inputSegments = await processMessages({
471
- messages: userMessages,
449
+ messages: inputMessages,
472
450
  searchRoots,
473
451
  repoRootPath,
474
452
  guidelinePatterns,
@@ -478,7 +456,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
478
456
  verbose
479
457
  });
480
458
  const outputSegments = await processMessages({
481
- messages: assistantMessages,
459
+ messages: expectedMessages,
482
460
  searchRoots,
483
461
  repoRootPath,
484
462
  guidelinePatterns,
@@ -486,10 +464,10 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
486
464
  verbose
487
465
  });
488
466
  const codeSnippets = extractCodeBlocks(inputSegments);
489
- const assistantContent = assistantMessages[0]?.content;
490
- const referenceAnswer = await resolveAssistantContent(assistantContent, searchRoots, verbose);
467
+ const expectedContent = expectedMessages[0]?.content;
468
+ const referenceAnswer = await resolveAssistantContent(expectedContent, searchRoots, verbose);
491
469
  const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
492
- const testCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
470
+ const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
493
471
  const evaluators = await parseEvaluators(evalcase, searchRoots, id ?? "unknown");
494
472
  const userFilePaths = [];
495
473
  for (const segment of inputSegments) {
@@ -508,19 +486,18 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
508
486
  question,
509
487
  input_segments: inputSegments,
510
488
  output_segments: outputSegments,
511
- system_message: systemMessageContent,
512
489
  reference_answer: referenceAnswer,
513
490
  guideline_paths: guidelinePaths.map((guidelinePath) => import_node_path2.default.resolve(guidelinePath)),
514
491
  guideline_patterns: guidelinePatterns,
515
492
  file_paths: allFilePaths,
516
493
  code_snippets: codeSnippets,
517
494
  expected_outcome: outcome,
518
- evaluator: testCaseEvaluatorKind,
495
+ evaluator: evalCaseEvaluatorKind,
519
496
  evaluators
520
497
  };
521
498
  if (verbose) {
522
499
  console.log(`
523
- [Test Case: ${id}]`);
500
+ [Eval Case: ${id}]`);
524
501
  if (testCase.guideline_paths.length > 0) {
525
502
  console.log(` Guidelines used: ${testCase.guideline_paths.length}`);
526
503
  for (const guidelinePath of testCase.guideline_paths) {
@@ -579,7 +556,7 @@ ${body}`);
579
556
  }
580
557
  const question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
581
558
  const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
582
- return { question, guidelines, systemMessage: testCase.system_message };
559
+ return { question, guidelines };
583
560
  }
584
561
  async function fileExists2(absolutePath) {
585
562
  try {
@@ -965,6 +942,8 @@ var GeminiProvider = class {
965
942
 
966
943
  // src/evaluation/providers/cli.ts
967
944
  var import_node_child_process = require("child_process");
945
+ var import_promises3 = __toESM(require("fs/promises"), 1);
946
+ var import_node_os = __toESM(require("os"), 1);
968
947
  var import_node_path3 = __toESM(require("path"), 1);
969
948
  var import_node_util = require("util");
970
949
  var execAsync = (0, import_node_util.promisify)(import_node_child_process.exec);
@@ -980,6 +959,7 @@ async function defaultCommandRunner(command, options) {
980
959
  };
981
960
  try {
982
961
  const { stdout, stderr } = await execAsync(command, execOptions);
962
+ console.error(`[CLI DEBUG] SUCCESS - stdout: ${stdout.length} bytes, stderr: ${stderr.length} bytes`);
983
963
  return {
984
964
  stdout,
985
965
  stderr,
@@ -990,6 +970,8 @@ async function defaultCommandRunner(command, options) {
990
970
  };
991
971
  } catch (error) {
992
972
  const execError = error;
973
+ console.error(`[CLI DEBUG] ERROR - code: ${execError.code}, message: ${execError.message}`);
974
+ console.error(`[CLI DEBUG] stdout: ${execError.stdout?.length ?? 0} bytes, stderr: ${execError.stderr?.length ?? 0} bytes`);
993
975
  return {
994
976
  stdout: execError.stdout ?? "",
995
977
  stderr: execError.stderr ?? "",
@@ -1019,7 +1001,8 @@ var CliProvider = class {
1019
1001
  throw new Error("CLI provider request was aborted before execution");
1020
1002
  }
1021
1003
  await this.ensureHealthy(request.signal);
1022
- const templateValues = buildTemplateValues(request, this.config);
1004
+ const outputFilePath = generateOutputFilePath(request.evalCaseId);
1005
+ const templateValues = buildTemplateValues(request, this.config, outputFilePath);
1023
1006
  const renderedCommand = renderTemplate(this.config.commandTemplate, templateValues);
1024
1007
  const env = this.config.env ? { ...process.env, ...this.config.env } : process.env;
1025
1008
  const result = await this.runCommand(renderedCommand, {
@@ -1042,16 +1025,30 @@ var CliProvider = class {
1042
1025
  const message = detail ? `${detail} (exit code ${codeText})` : `CLI exited with code ${codeText}`;
1043
1026
  throw new Error(message);
1044
1027
  }
1028
+ const responseText = await this.readAndCleanupOutputFile(outputFilePath);
1045
1029
  return {
1046
- text: result.stdout,
1030
+ text: responseText,
1047
1031
  raw: {
1048
1032
  command: renderedCommand,
1049
1033
  stderr: result.stderr,
1050
1034
  exitCode: result.exitCode ?? 0,
1051
- cwd: this.config.cwd
1035
+ cwd: this.config.cwd,
1036
+ outputFile: outputFilePath
1052
1037
  }
1053
1038
  };
1054
1039
  }
1040
+ async readAndCleanupOutputFile(filePath) {
1041
+ try {
1042
+ const content = await import_promises3.default.readFile(filePath, "utf-8");
1043
+ return content;
1044
+ } catch (error) {
1045
+ const errorMsg = error instanceof Error ? error.message : String(error);
1046
+ throw new Error(`Failed to read output file '${filePath}': ${errorMsg}`);
1047
+ } finally {
1048
+ await import_promises3.default.unlink(filePath).catch(() => {
1049
+ });
1050
+ }
1051
+ }
1055
1052
  async ensureHealthy(signal) {
1056
1053
  if (!this.config.healthcheck) {
1057
1054
  return;
@@ -1092,10 +1089,11 @@ var CliProvider = class {
1092
1089
  question: "",
1093
1090
  guidelines: "",
1094
1091
  inputFiles: [],
1095
- evalCaseId: "",
1092
+ evalCaseId: "healthcheck",
1096
1093
  attempt: 0
1097
1094
  },
1098
- this.config
1095
+ this.config,
1096
+ generateOutputFilePath("healthcheck")
1099
1097
  )
1100
1098
  );
1101
1099
  const env = this.config.env ? { ...process.env, ...this.config.env } : process.env;
@@ -1113,14 +1111,15 @@ var CliProvider = class {
1113
1111
  }
1114
1112
  }
1115
1113
  };
1116
- function buildTemplateValues(request, config) {
1114
+ function buildTemplateValues(request, config, outputFilePath) {
1117
1115
  const inputFiles = normalizeInputFiles(request.inputFiles);
1118
1116
  return {
1119
1117
  PROMPT: shellEscape(request.question ?? ""),
1120
1118
  GUIDELINES: shellEscape(request.guidelines ?? ""),
1121
1119
  EVAL_ID: shellEscape(request.evalCaseId ?? ""),
1122
1120
  ATTEMPT: shellEscape(String(request.attempt ?? 0)),
1123
- FILES: formatFileList(inputFiles, config.filesFormat)
1121
+ FILES: formatFileList(inputFiles, config.filesFormat),
1122
+ OUTPUT_FILE: shellEscape(outputFilePath)
1124
1123
  };
1125
1124
  }
1126
1125
  function normalizeInputFiles(inputFiles) {
@@ -1158,11 +1157,17 @@ function shellEscape(value) {
1158
1157
  return "''";
1159
1158
  }
1160
1159
  if (process.platform === "win32") {
1161
- const escaped = value.replace(/"/g, '\\"');
1162
- return `"${escaped}"`;
1160
+ const escaped = value.replace(/'/g, "''");
1161
+ return `'${escaped}'`;
1163
1162
  }
1164
1163
  return `'${value.replace(/'/g, `'"'"'`)}'`;
1165
1164
  }
1165
+ function generateOutputFilePath(evalCaseId) {
1166
+ const safeEvalId = evalCaseId || "unknown";
1167
+ const timestamp = Date.now();
1168
+ const random = Math.random().toString(36).substring(2, 9);
1169
+ return import_node_path3.default.join(import_node_os.default.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}.json`);
1170
+ }
1166
1171
  function formatTimeoutSuffix(timeoutMs) {
1167
1172
  if (!timeoutMs || timeoutMs <= 0) {
1168
1173
  return "";
@@ -1175,8 +1180,8 @@ function formatTimeoutSuffix(timeoutMs) {
1175
1180
  var import_node_child_process2 = require("child_process");
1176
1181
  var import_node_crypto = require("crypto");
1177
1182
  var import_node_fs3 = require("fs");
1178
- var import_promises3 = require("fs/promises");
1179
- var import_node_os = require("os");
1183
+ var import_promises4 = require("fs/promises");
1184
+ var import_node_os2 = require("os");
1180
1185
  var import_node_path5 = __toESM(require("path"), 1);
1181
1186
  var import_node_util2 = require("util");
1182
1187
 
@@ -1365,7 +1370,7 @@ var CodexProvider = class {
1365
1370
  try {
1366
1371
  const promptContent = buildPromptDocument(request, inputFiles);
1367
1372
  const promptFile = import_node_path5.default.join(workspaceRoot, PROMPT_FILENAME);
1368
- await (0, import_promises3.writeFile)(promptFile, promptContent, "utf8");
1373
+ await (0, import_promises4.writeFile)(promptFile, promptContent, "utf8");
1369
1374
  const args = this.buildCodexArgs();
1370
1375
  const cwd = this.resolveCwd(workspaceRoot);
1371
1376
  const result = await this.executeCodex(args, cwd, promptContent, request.signal, logger);
@@ -1448,11 +1453,11 @@ var CodexProvider = class {
1448
1453
  }
1449
1454
  }
1450
1455
  async createWorkspace() {
1451
- return await (0, import_promises3.mkdtemp)(import_node_path5.default.join((0, import_node_os.tmpdir)(), WORKSPACE_PREFIX));
1456
+ return await (0, import_promises4.mkdtemp)(import_node_path5.default.join((0, import_node_os2.tmpdir)(), WORKSPACE_PREFIX));
1452
1457
  }
1453
1458
  async cleanupWorkspace(workspaceRoot) {
1454
1459
  try {
1455
- await (0, import_promises3.rm)(workspaceRoot, { recursive: true, force: true });
1460
+ await (0, import_promises4.rm)(workspaceRoot, { recursive: true, force: true });
1456
1461
  } catch {
1457
1462
  }
1458
1463
  }
@@ -1472,7 +1477,7 @@ var CodexProvider = class {
1472
1477
  return void 0;
1473
1478
  }
1474
1479
  try {
1475
- await (0, import_promises3.mkdir)(logDir, { recursive: true });
1480
+ await (0, import_promises4.mkdir)(logDir, { recursive: true });
1476
1481
  } catch (error) {
1477
1482
  const message = error instanceof Error ? error.message : String(error);
1478
1483
  console.warn(`Skipping Codex stream logging (could not create ${logDir}): ${message}`);
@@ -1695,7 +1700,7 @@ async function locateExecutable(candidate) {
1695
1700
  if (includesPathSeparator) {
1696
1701
  const resolved = import_node_path5.default.isAbsolute(candidate) ? candidate : import_node_path5.default.resolve(candidate);
1697
1702
  const executablePath = await ensureWindowsExecutableVariant(resolved);
1698
- await (0, import_promises3.access)(executablePath, import_node_fs3.constants.F_OK);
1703
+ await (0, import_promises4.access)(executablePath, import_node_fs3.constants.F_OK);
1699
1704
  return executablePath;
1700
1705
  }
1701
1706
  const locator = process.platform === "win32" ? "where" : "which";
@@ -1705,7 +1710,7 @@ async function locateExecutable(candidate) {
1705
1710
  const preferred = selectExecutableCandidate(lines);
1706
1711
  if (preferred) {
1707
1712
  const executablePath = await ensureWindowsExecutableVariant(preferred);
1708
- await (0, import_promises3.access)(executablePath, import_node_fs3.constants.F_OK);
1713
+ await (0, import_promises4.access)(executablePath, import_node_fs3.constants.F_OK);
1709
1714
  return executablePath;
1710
1715
  }
1711
1716
  } catch {
@@ -1739,7 +1744,7 @@ async function ensureWindowsExecutableVariant(candidate) {
1739
1744
  for (const ext of extensions) {
1740
1745
  const withExtension = `${candidate}${ext}`;
1741
1746
  try {
1742
- await (0, import_promises3.access)(withExtension, import_node_fs3.constants.F_OK);
1747
+ await (0, import_promises4.access)(withExtension, import_node_fs3.constants.F_OK);
1743
1748
  return withExtension;
1744
1749
  } catch {
1745
1750
  }
@@ -2041,7 +2046,7 @@ var MockProvider = class {
2041
2046
 
2042
2047
  // src/evaluation/providers/targets.ts
2043
2048
  var import_zod = require("zod");
2044
- var CLI_PLACEHOLDERS = /* @__PURE__ */ new Set(["PROMPT", "GUIDELINES", "EVAL_ID", "ATTEMPT", "FILES"]);
2049
+ var CLI_PLACEHOLDERS = /* @__PURE__ */ new Set(["PROMPT", "GUIDELINES", "EVAL_ID", "ATTEMPT", "FILES", "OUTPUT_FILE"]);
2045
2050
  var BASE_TARGET_SCHEMA = import_zod.z.object({
2046
2051
  name: import_zod.z.string().min(1, "target name is required"),
2047
2052
  provider: import_zod.z.string().min(1, "provider is required"),
@@ -2768,7 +2773,7 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
2768
2773
 
2769
2774
  // src/evaluation/providers/targets-file.ts
2770
2775
  var import_node_fs4 = require("fs");
2771
- var import_promises4 = require("fs/promises");
2776
+ var import_promises5 = require("fs/promises");
2772
2777
  var import_node_path7 = __toESM(require("path"), 1);
2773
2778
  var import_yaml2 = require("yaml");
2774
2779
 
@@ -2838,7 +2843,7 @@ function assertTargetDefinition(value, index, filePath) {
2838
2843
  }
2839
2844
  async function fileExists3(filePath) {
2840
2845
  try {
2841
- await (0, import_promises4.access)(filePath, import_node_fs4.constants.F_OK);
2846
+ await (0, import_promises5.access)(filePath, import_node_fs4.constants.F_OK);
2842
2847
  return true;
2843
2848
  } catch {
2844
2849
  return false;
@@ -2849,7 +2854,7 @@ async function readTargetDefinitions(filePath) {
2849
2854
  if (!await fileExists3(absolutePath)) {
2850
2855
  throw new Error(`targets.yaml not found at ${absolutePath}`);
2851
2856
  }
2852
- const raw = await (0, import_promises4.readFile)(absolutePath, "utf8");
2857
+ const raw = await (0, import_promises5.readFile)(absolutePath, "utf8");
2853
2858
  const parsed = (0, import_yaml2.parse)(raw);
2854
2859
  if (!isRecord(parsed)) {
2855
2860
  throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with '$schema' and 'targets' fields`);
@@ -3095,7 +3100,6 @@ var CodeEvaluator = class {
3095
3100
  expected_outcome: context.evalCase.expected_outcome,
3096
3101
  reference_answer: context.evalCase.reference_answer,
3097
3102
  candidate_answer: context.candidate,
3098
- system_message: context.promptInputs.systemMessage ?? "",
3099
3103
  guideline_paths: context.evalCase.guideline_paths,
3100
3104
  input_files: context.evalCase.file_paths,
3101
3105
  input_segments: context.evalCase.input_segments
@@ -3195,7 +3199,7 @@ function substituteVariables(template, variables) {
3195
3199
 
3196
3200
  // src/evaluation/orchestrator.ts
3197
3201
  var import_node_crypto3 = require("crypto");
3198
- var import_promises5 = require("fs/promises");
3202
+ var import_promises6 = require("fs/promises");
3199
3203
  var import_node_path8 = __toESM(require("path"), 1);
3200
3204
 
3201
3205
  // ../../node_modules/.pnpm/yocto-queue@1.2.1/node_modules/yocto-queue/index.js
@@ -3337,7 +3341,7 @@ function validateConcurrency(concurrency) {
3337
3341
  // src/evaluation/orchestrator.ts
3338
3342
  async function runEvaluation(options) {
3339
3343
  const {
3340
- testFilePath,
3344
+ testFilePath: evalFilePath,
3341
3345
  repoRoot,
3342
3346
  target,
3343
3347
  targets,
@@ -3356,11 +3360,11 @@ async function runEvaluation(options) {
3356
3360
  onProgress
3357
3361
  } = options;
3358
3362
  const load = loadEvalCases;
3359
- const evalCases = await load(testFilePath, repoRoot, { verbose });
3363
+ const evalCases = await load(evalFilePath, repoRoot, { verbose, evalId });
3360
3364
  const filteredEvalCases = filterEvalCases(evalCases, evalId);
3361
3365
  if (filteredEvalCases.length === 0) {
3362
3366
  if (evalId) {
3363
- throw new Error(`Test case with id '${evalId}' not found in ${testFilePath}`);
3367
+ throw new Error(`Eval case with id '${evalId}' not found in ${evalFilePath}`);
3364
3368
  }
3365
3369
  return [];
3366
3370
  }
@@ -3739,8 +3743,7 @@ async function evaluateCandidate(options) {
3739
3743
  const rawRequest = {
3740
3744
  question: promptInputs.question,
3741
3745
  ...isAgentProvider(provider) ? {} : { guidelines: promptInputs.guidelines },
3742
- guideline_paths: evalCase.guideline_paths,
3743
- system_message: promptInputs.systemMessage ?? ""
3746
+ guideline_paths: evalCase.guideline_paths
3744
3747
  };
3745
3748
  return {
3746
3749
  eval_id: evalCase.id,
@@ -3956,14 +3959,14 @@ async function dumpPrompt(directory, evalCase, promptInputs) {
3956
3959
  const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
3957
3960
  const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
3958
3961
  const filePath = import_node_path8.default.resolve(directory, filename);
3959
- await (0, import_promises5.mkdir)(import_node_path8.default.dirname(filePath), { recursive: true });
3962
+ await (0, import_promises6.mkdir)(import_node_path8.default.dirname(filePath), { recursive: true });
3960
3963
  const payload = {
3961
3964
  eval_id: evalCase.id,
3962
3965
  question: promptInputs.question,
3963
3966
  guidelines: promptInputs.guidelines,
3964
3967
  guideline_paths: evalCase.guideline_paths
3965
3968
  };
3966
- await (0, import_promises5.writeFile)(filePath, JSON.stringify(payload, null, 2), "utf8");
3969
+ await (0, import_promises6.writeFile)(filePath, JSON.stringify(payload, null, 2), "utf8");
3967
3970
  }
3968
3971
  function sanitizeFilename(value) {
3969
3972
  if (!value) {
@@ -4004,7 +4007,6 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
4004
4007
  question: promptInputs.question,
4005
4008
  ...isAgentProvider(provider) ? {} : { guidelines: promptInputs.guidelines },
4006
4009
  guideline_paths: evalCase.guideline_paths,
4007
- system_message: promptInputs.systemMessage ?? "",
4008
4010
  error: message
4009
4011
  };
4010
4012
  return {