@agentv/core 0.7.2 → 0.7.4

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -6,8 +6,9 @@ import {
6
6
  findGitRoot,
7
7
  isAgentProvider,
8
8
  readTextFile,
9
- resolveFileReference
10
- } from "./chunk-UQLHF3T7.js";
9
+ resolveFileReference,
10
+ resolveTargetDefinition
11
+ } from "./chunk-L6RCDZ4Z.js";
11
12
 
12
13
  // src/evaluation/types.ts
13
14
  var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
@@ -218,6 +219,7 @@ async function processMessages(options) {
218
219
  }
219
220
  async function loadEvalCases(evalFilePath, repoRoot, options) {
220
221
  const verbose = options?.verbose ?? false;
222
+ const evalIdFilter = options?.evalId;
221
223
  const absoluteTestPath = path.resolve(evalFilePath);
222
224
  if (!await fileExists2(absoluteTestPath)) {
223
225
  throw new Error(`Test file not found: ${evalFilePath}`);
@@ -249,62 +251,39 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
249
251
  const results = [];
250
252
  for (const rawEvalcase of rawTestcases) {
251
253
  if (!isJsonObject(rawEvalcase)) {
252
- logWarning("Skipping invalid test case entry (expected object)");
254
+ logWarning("Skipping invalid eval case entry (expected object)");
253
255
  continue;
254
256
  }
255
257
  const evalcase = rawEvalcase;
256
258
  const id = asString(evalcase.id);
259
+ if (evalIdFilter && id !== evalIdFilter) {
260
+ continue;
261
+ }
257
262
  const conversationId = asString(evalcase.conversation_id);
258
263
  const outcome = asString(evalcase.outcome);
259
264
  const inputMessagesValue = evalcase.input_messages;
260
265
  const expectedMessagesValue = evalcase.expected_messages;
261
266
  if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
262
- logWarning(`Skipping incomplete test case: ${id ?? "unknown"}`);
267
+ logWarning(`Skipping incomplete eval case: ${id ?? "unknown"}`);
263
268
  continue;
264
269
  }
265
270
  if (!Array.isArray(expectedMessagesValue)) {
266
- logWarning(`Test case '${id}' missing expected_messages array`);
271
+ logWarning(`Eval case '${id}' missing expected_messages array`);
267
272
  continue;
268
273
  }
269
274
  const inputMessages = inputMessagesValue.filter((msg) => isTestMessage(msg));
270
275
  const expectedMessages = expectedMessagesValue.filter((msg) => isTestMessage(msg));
271
- const assistantMessages = expectedMessages.filter((message) => message.role === "assistant");
272
- const userMessages = inputMessages.filter((message) => message.role === "user");
273
- const systemMessages = inputMessages.filter((message) => message.role === "system");
274
- if (assistantMessages.length === 0) {
275
- logWarning(`No assistant message found for test case: ${id}`);
276
+ if (expectedMessages.length === 0) {
277
+ logWarning(`No expected message found for eval case: ${id}`);
276
278
  continue;
277
279
  }
278
- if (assistantMessages.length > 1) {
279
- logWarning(`Multiple assistant messages found for test case: ${id}, using first`);
280
- }
281
- if (systemMessages.length > 1) {
282
- logWarning(`Multiple system messages found for test case: ${id}, using first`);
283
- }
284
- let systemMessageContent;
285
- if (systemMessages.length > 0) {
286
- const content = systemMessages[0]?.content;
287
- if (typeof content === "string") {
288
- systemMessageContent = content;
289
- } else if (Array.isArray(content)) {
290
- const textParts = [];
291
- for (const segment of content) {
292
- if (isJsonObject(segment)) {
293
- const value = segment.value;
294
- if (typeof value === "string") {
295
- textParts.push(value);
296
- }
297
- }
298
- }
299
- if (textParts.length > 0) {
300
- systemMessageContent = textParts.join("\n\n");
301
- }
302
- }
280
+ if (expectedMessages.length > 1) {
281
+ logWarning(`Multiple expected messages found for eval case: ${id}, using first`);
303
282
  }
304
283
  const guidelinePaths = [];
305
284
  const inputTextParts = [];
306
285
  const inputSegments = await processMessages({
307
- messages: userMessages,
286
+ messages: inputMessages,
308
287
  searchRoots,
309
288
  repoRootPath,
310
289
  guidelinePatterns,
@@ -314,7 +293,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
314
293
  verbose
315
294
  });
316
295
  const outputSegments = await processMessages({
317
- messages: assistantMessages,
296
+ messages: expectedMessages,
318
297
  searchRoots,
319
298
  repoRootPath,
320
299
  guidelinePatterns,
@@ -322,10 +301,10 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
322
301
  verbose
323
302
  });
324
303
  const codeSnippets = extractCodeBlocks(inputSegments);
325
- const assistantContent = assistantMessages[0]?.content;
326
- const referenceAnswer = await resolveAssistantContent(assistantContent, searchRoots, verbose);
304
+ const expectedContent = expectedMessages[0]?.content;
305
+ const referenceAnswer = await resolveAssistantContent(expectedContent, searchRoots, verbose);
327
306
  const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
328
- const testCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
307
+ const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
329
308
  const evaluators = await parseEvaluators(evalcase, searchRoots, id ?? "unknown");
330
309
  const userFilePaths = [];
331
310
  for (const segment of inputSegments) {
@@ -344,19 +323,18 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
344
323
  question,
345
324
  input_segments: inputSegments,
346
325
  output_segments: outputSegments,
347
- system_message: systemMessageContent,
348
326
  reference_answer: referenceAnswer,
349
327
  guideline_paths: guidelinePaths.map((guidelinePath) => path.resolve(guidelinePath)),
350
328
  guideline_patterns: guidelinePatterns,
351
329
  file_paths: allFilePaths,
352
330
  code_snippets: codeSnippets,
353
331
  expected_outcome: outcome,
354
- evaluator: testCaseEvaluatorKind,
332
+ evaluator: evalCaseEvaluatorKind,
355
333
  evaluators
356
334
  };
357
335
  if (verbose) {
358
336
  console.log(`
359
- [Test Case: ${id}]`);
337
+ [Eval Case: ${id}]`);
360
338
  if (testCase.guideline_paths.length > 0) {
361
339
  console.log(` Guidelines used: ${testCase.guideline_paths.length}`);
362
340
  for (const guidelinePath of testCase.guideline_paths) {
@@ -415,7 +393,7 @@ ${body}`);
415
393
  }
416
394
  const question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
417
395
  const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
418
- return { question, guidelines, systemMessage: testCase.system_message };
396
+ return { question, guidelines };
419
397
  }
420
398
  async function fileExists2(absolutePath) {
421
399
  try {
@@ -801,6 +779,8 @@ var GeminiProvider = class {
801
779
 
802
780
  // src/evaluation/providers/cli.ts
803
781
  import { exec as execWithCallback } from "node:child_process";
782
+ import fs from "node:fs/promises";
783
+ import os from "node:os";
804
784
  import path2 from "node:path";
805
785
  import { promisify } from "node:util";
806
786
  var execAsync = promisify(execWithCallback);
@@ -816,6 +796,7 @@ async function defaultCommandRunner(command, options) {
816
796
  };
817
797
  try {
818
798
  const { stdout, stderr } = await execAsync(command, execOptions);
799
+ console.error(`[CLI DEBUG] SUCCESS - stdout: ${stdout.length} bytes, stderr: ${stderr.length} bytes`);
819
800
  return {
820
801
  stdout,
821
802
  stderr,
@@ -826,6 +807,8 @@ async function defaultCommandRunner(command, options) {
826
807
  };
827
808
  } catch (error) {
828
809
  const execError = error;
810
+ console.error(`[CLI DEBUG] ERROR - code: ${execError.code}, message: ${execError.message}`);
811
+ console.error(`[CLI DEBUG] stdout: ${execError.stdout?.length ?? 0} bytes, stderr: ${execError.stderr?.length ?? 0} bytes`);
829
812
  return {
830
813
  stdout: execError.stdout ?? "",
831
814
  stderr: execError.stderr ?? "",
@@ -855,7 +838,8 @@ var CliProvider = class {
855
838
  throw new Error("CLI provider request was aborted before execution");
856
839
  }
857
840
  await this.ensureHealthy(request.signal);
858
- const templateValues = buildTemplateValues(request, this.config);
841
+ const outputFilePath = generateOutputFilePath(request.evalCaseId);
842
+ const templateValues = buildTemplateValues(request, this.config, outputFilePath);
859
843
  const renderedCommand = renderTemplate(this.config.commandTemplate, templateValues);
860
844
  const env = this.config.env ? { ...process.env, ...this.config.env } : process.env;
861
845
  const result = await this.runCommand(renderedCommand, {
@@ -878,16 +862,30 @@ var CliProvider = class {
878
862
  const message = detail ? `${detail} (exit code ${codeText})` : `CLI exited with code ${codeText}`;
879
863
  throw new Error(message);
880
864
  }
865
+ const responseText = await this.readAndCleanupOutputFile(outputFilePath);
881
866
  return {
882
- text: result.stdout,
867
+ text: responseText,
883
868
  raw: {
884
869
  command: renderedCommand,
885
870
  stderr: result.stderr,
886
871
  exitCode: result.exitCode ?? 0,
887
- cwd: this.config.cwd
872
+ cwd: this.config.cwd,
873
+ outputFile: outputFilePath
888
874
  }
889
875
  };
890
876
  }
877
+ async readAndCleanupOutputFile(filePath) {
878
+ try {
879
+ const content = await fs.readFile(filePath, "utf-8");
880
+ return content;
881
+ } catch (error) {
882
+ const errorMsg = error instanceof Error ? error.message : String(error);
883
+ throw new Error(`Failed to read output file '${filePath}': ${errorMsg}`);
884
+ } finally {
885
+ await fs.unlink(filePath).catch(() => {
886
+ });
887
+ }
888
+ }
891
889
  async ensureHealthy(signal) {
892
890
  if (!this.config.healthcheck) {
893
891
  return;
@@ -928,10 +926,11 @@ var CliProvider = class {
928
926
  question: "",
929
927
  guidelines: "",
930
928
  inputFiles: [],
931
- evalCaseId: "",
929
+ evalCaseId: "healthcheck",
932
930
  attempt: 0
933
931
  },
934
- this.config
932
+ this.config,
933
+ generateOutputFilePath("healthcheck")
935
934
  )
936
935
  );
937
936
  const env = this.config.env ? { ...process.env, ...this.config.env } : process.env;
@@ -949,14 +948,15 @@ var CliProvider = class {
949
948
  }
950
949
  }
951
950
  };
952
- function buildTemplateValues(request, config) {
951
+ function buildTemplateValues(request, config, outputFilePath) {
953
952
  const inputFiles = normalizeInputFiles(request.inputFiles);
954
953
  return {
955
954
  PROMPT: shellEscape(request.question ?? ""),
956
955
  GUIDELINES: shellEscape(request.guidelines ?? ""),
957
956
  EVAL_ID: shellEscape(request.evalCaseId ?? ""),
958
957
  ATTEMPT: shellEscape(String(request.attempt ?? 0)),
959
- FILES: formatFileList(inputFiles, config.filesFormat)
958
+ FILES: formatFileList(inputFiles, config.filesFormat),
959
+ OUTPUT_FILE: shellEscape(outputFilePath)
960
960
  };
961
961
  }
962
962
  function normalizeInputFiles(inputFiles) {
@@ -994,11 +994,17 @@ function shellEscape(value) {
994
994
  return "''";
995
995
  }
996
996
  if (process.platform === "win32") {
997
- const escaped = value.replace(/"/g, '\\"');
998
- return `"${escaped}"`;
997
+ const escaped = value.replace(/'/g, "''");
998
+ return `'${escaped}'`;
999
999
  }
1000
1000
  return `'${value.replace(/'/g, `'"'"'`)}'`;
1001
1001
  }
1002
+ function generateOutputFilePath(evalCaseId) {
1003
+ const safeEvalId = evalCaseId || "unknown";
1004
+ const timestamp = Date.now();
1005
+ const random = Math.random().toString(36).substring(2, 9);
1006
+ return path2.join(os.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}.json`);
1007
+ }
1002
1008
  function formatTimeoutSuffix(timeoutMs) {
1003
1009
  if (!timeoutMs || timeoutMs <= 0) {
1004
1010
  return "";
@@ -1875,487 +1881,6 @@ var MockProvider = class {
1875
1881
  }
1876
1882
  };
1877
1883
 
1878
- // src/evaluation/providers/targets.ts
1879
- import { z } from "zod";
1880
- var CLI_PLACEHOLDERS = /* @__PURE__ */ new Set(["PROMPT", "GUIDELINES", "EVAL_ID", "ATTEMPT", "FILES"]);
1881
- var BASE_TARGET_SCHEMA = z.object({
1882
- name: z.string().min(1, "target name is required"),
1883
- provider: z.string().min(1, "provider is required"),
1884
- settings: z.record(z.unknown()).optional(),
1885
- judge_target: z.string().optional(),
1886
- workers: z.number().int().min(1).optional()
1887
- });
1888
- var DEFAULT_AZURE_API_VERSION = "2024-10-01-preview";
1889
- function normalizeAzureApiVersion(value) {
1890
- if (!value) {
1891
- return DEFAULT_AZURE_API_VERSION;
1892
- }
1893
- const trimmed = value.trim();
1894
- if (trimmed.length === 0) {
1895
- return DEFAULT_AZURE_API_VERSION;
1896
- }
1897
- const withoutPrefix = trimmed.replace(/^api[-_]?version\s*=\s*/i, "").trim();
1898
- return withoutPrefix.length > 0 ? withoutPrefix : DEFAULT_AZURE_API_VERSION;
1899
- }
1900
- function resolveTargetDefinition(definition, env = process.env) {
1901
- const parsed = BASE_TARGET_SCHEMA.parse(definition);
1902
- const provider = parsed.provider.toLowerCase();
1903
- const providerBatching = resolveOptionalBoolean(
1904
- parsed.settings?.provider_batching ?? parsed.settings?.providerBatching
1905
- );
1906
- switch (provider) {
1907
- case "azure":
1908
- case "azure-openai":
1909
- return {
1910
- kind: "azure",
1911
- name: parsed.name,
1912
- judgeTarget: parsed.judge_target,
1913
- workers: parsed.workers,
1914
- providerBatching,
1915
- config: resolveAzureConfig(parsed, env)
1916
- };
1917
- case "anthropic":
1918
- return {
1919
- kind: "anthropic",
1920
- name: parsed.name,
1921
- judgeTarget: parsed.judge_target,
1922
- workers: parsed.workers,
1923
- providerBatching,
1924
- config: resolveAnthropicConfig(parsed, env)
1925
- };
1926
- case "gemini":
1927
- case "google":
1928
- case "google-gemini":
1929
- return {
1930
- kind: "gemini",
1931
- name: parsed.name,
1932
- judgeTarget: parsed.judge_target,
1933
- workers: parsed.workers,
1934
- providerBatching,
1935
- config: resolveGeminiConfig(parsed, env)
1936
- };
1937
- case "codex":
1938
- case "codex-cli":
1939
- return {
1940
- kind: "codex",
1941
- name: parsed.name,
1942
- judgeTarget: parsed.judge_target,
1943
- workers: parsed.workers,
1944
- providerBatching,
1945
- config: resolveCodexConfig(parsed, env)
1946
- };
1947
- case "mock":
1948
- return {
1949
- kind: "mock",
1950
- name: parsed.name,
1951
- judgeTarget: parsed.judge_target,
1952
- workers: parsed.workers,
1953
- providerBatching,
1954
- config: resolveMockConfig(parsed)
1955
- };
1956
- case "vscode":
1957
- case "vscode-insiders":
1958
- return {
1959
- kind: provider,
1960
- name: parsed.name,
1961
- judgeTarget: parsed.judge_target,
1962
- workers: parsed.workers,
1963
- providerBatching,
1964
- config: resolveVSCodeConfig(parsed, env, provider === "vscode-insiders")
1965
- };
1966
- case "cli":
1967
- return {
1968
- kind: "cli",
1969
- name: parsed.name,
1970
- judgeTarget: parsed.judge_target,
1971
- workers: parsed.workers,
1972
- providerBatching,
1973
- config: resolveCliConfig(parsed, env)
1974
- };
1975
- default:
1976
- throw new Error(`Unsupported provider '${parsed.provider}' in target '${parsed.name}'`);
1977
- }
1978
- }
1979
- function resolveAzureConfig(target, env) {
1980
- const settings = target.settings ?? {};
1981
- const endpointSource = settings.endpoint ?? settings.resource ?? settings.resourceName;
1982
- const apiKeySource = settings.api_key ?? settings.apiKey;
1983
- const deploymentSource = settings.deployment ?? settings.deploymentName ?? settings.model;
1984
- const versionSource = settings.version ?? settings.api_version;
1985
- const temperatureSource = settings.temperature;
1986
- const maxTokensSource = settings.max_output_tokens ?? settings.maxTokens;
1987
- const resourceName = resolveString(endpointSource, env, `${target.name} endpoint`);
1988
- const apiKey = resolveString(apiKeySource, env, `${target.name} api key`);
1989
- const deploymentName = resolveString(deploymentSource, env, `${target.name} deployment`);
1990
- const version = normalizeAzureApiVersion(
1991
- resolveOptionalString(versionSource, env, `${target.name} api version`)
1992
- );
1993
- const temperature = resolveOptionalNumber(temperatureSource, `${target.name} temperature`);
1994
- const maxOutputTokens = resolveOptionalNumber(
1995
- maxTokensSource,
1996
- `${target.name} max output tokens`
1997
- );
1998
- return {
1999
- resourceName,
2000
- deploymentName,
2001
- apiKey,
2002
- version,
2003
- temperature,
2004
- maxOutputTokens
2005
- };
2006
- }
2007
- function resolveAnthropicConfig(target, env) {
2008
- const settings = target.settings ?? {};
2009
- const apiKeySource = settings.api_key ?? settings.apiKey;
2010
- const modelSource = settings.model ?? settings.deployment ?? settings.variant;
2011
- const temperatureSource = settings.temperature;
2012
- const maxTokensSource = settings.max_output_tokens ?? settings.maxTokens;
2013
- const thinkingBudgetSource = settings.thinking_budget ?? settings.thinkingBudget;
2014
- const apiKey = resolveString(apiKeySource, env, `${target.name} Anthropic api key`);
2015
- const model = resolveString(modelSource, env, `${target.name} Anthropic model`);
2016
- return {
2017
- apiKey,
2018
- model,
2019
- temperature: resolveOptionalNumber(temperatureSource, `${target.name} temperature`),
2020
- maxOutputTokens: resolveOptionalNumber(maxTokensSource, `${target.name} max output tokens`),
2021
- thinkingBudget: resolveOptionalNumber(thinkingBudgetSource, `${target.name} thinking budget`)
2022
- };
2023
- }
2024
- function resolveGeminiConfig(target, env) {
2025
- const settings = target.settings ?? {};
2026
- const apiKeySource = settings.api_key ?? settings.apiKey;
2027
- const modelSource = settings.model ?? settings.deployment ?? settings.variant;
2028
- const temperatureSource = settings.temperature;
2029
- const maxTokensSource = settings.max_output_tokens ?? settings.maxTokens;
2030
- const apiKey = resolveString(apiKeySource, env, `${target.name} Google API key`);
2031
- const model = resolveOptionalString(modelSource, env, `${target.name} Gemini model`, {
2032
- allowLiteral: true,
2033
- optionalEnv: true
2034
- }) ?? "gemini-2.5-flash";
2035
- return {
2036
- apiKey,
2037
- model,
2038
- temperature: resolveOptionalNumber(temperatureSource, `${target.name} temperature`),
2039
- maxOutputTokens: resolveOptionalNumber(maxTokensSource, `${target.name} max output tokens`)
2040
- };
2041
- }
2042
- function resolveCodexConfig(target, env) {
2043
- const settings = target.settings ?? {};
2044
- const executableSource = settings.executable ?? settings.command ?? settings.binary;
2045
- const argsSource = settings.args ?? settings.arguments;
2046
- const cwdSource = settings.cwd;
2047
- const timeoutSource = settings.timeout_seconds ?? settings.timeoutSeconds;
2048
- const logDirSource = settings.log_dir ?? settings.logDir ?? settings.log_directory ?? settings.logDirectory;
2049
- const logFormatSource = settings.log_format ?? settings.logFormat ?? settings.log_output_format ?? settings.logOutputFormat ?? env.AGENTV_CODEX_LOG_FORMAT;
2050
- const executable = resolveOptionalString(executableSource, env, `${target.name} codex executable`, {
2051
- allowLiteral: true,
2052
- optionalEnv: true
2053
- }) ?? "codex";
2054
- const args = resolveOptionalStringArray(argsSource, env, `${target.name} codex args`);
2055
- const cwd = resolveOptionalString(cwdSource, env, `${target.name} codex cwd`, {
2056
- allowLiteral: true,
2057
- optionalEnv: true
2058
- });
2059
- const timeoutMs = resolveTimeoutMs(timeoutSource, `${target.name} codex timeout`);
2060
- const logDir = resolveOptionalString(logDirSource, env, `${target.name} codex log directory`, {
2061
- allowLiteral: true,
2062
- optionalEnv: true
2063
- });
2064
- const logFormat = normalizeCodexLogFormat(logFormatSource);
2065
- return {
2066
- executable,
2067
- args,
2068
- cwd,
2069
- timeoutMs,
2070
- logDir,
2071
- logFormat
2072
- };
2073
- }
2074
- function normalizeCodexLogFormat(value) {
2075
- if (value === void 0 || value === null) {
2076
- return void 0;
2077
- }
2078
- if (typeof value !== "string") {
2079
- throw new Error("codex log format must be 'summary' or 'json'");
2080
- }
2081
- const normalized = value.trim().toLowerCase();
2082
- if (normalized === "json" || normalized === "summary") {
2083
- return normalized;
2084
- }
2085
- throw new Error("codex log format must be 'summary' or 'json'");
2086
- }
2087
- function resolveMockConfig(target) {
2088
- const settings = target.settings ?? {};
2089
- const response = typeof settings.response === "string" ? settings.response : void 0;
2090
- return { response };
2091
- }
2092
- function resolveVSCodeConfig(target, env, insiders) {
2093
- const settings = target.settings ?? {};
2094
- const workspaceTemplateEnvVar = resolveOptionalLiteralString(settings.workspace_template ?? settings.workspaceTemplate);
2095
- const workspaceTemplate = workspaceTemplateEnvVar ? resolveOptionalString(workspaceTemplateEnvVar, env, `${target.name} workspace template path`, {
2096
- allowLiteral: false,
2097
- optionalEnv: true
2098
- }) : void 0;
2099
- const commandSource = settings.vscode_cmd ?? settings.command;
2100
- const waitSource = settings.wait;
2101
- const dryRunSource = settings.dry_run ?? settings.dryRun;
2102
- const subagentRootSource = settings.subagent_root ?? settings.subagentRoot;
2103
- const defaultCommand = insiders ? "code-insiders" : "code";
2104
- const command = resolveOptionalLiteralString(commandSource) ?? defaultCommand;
2105
- return {
2106
- command,
2107
- waitForResponse: resolveOptionalBoolean(waitSource) ?? true,
2108
- dryRun: resolveOptionalBoolean(dryRunSource) ?? false,
2109
- subagentRoot: resolveOptionalString(subagentRootSource, env, `${target.name} subagent root`, {
2110
- allowLiteral: true,
2111
- optionalEnv: true
2112
- }),
2113
- workspaceTemplate
2114
- };
2115
- }
2116
- function resolveCliConfig(target, env) {
2117
- const settings = target.settings ?? {};
2118
- const commandTemplateSource = settings.command_template ?? settings.commandTemplate;
2119
- const filesFormat = resolveOptionalLiteralString(
2120
- settings.files_format ?? settings.filesFormat ?? settings.attachments_format ?? settings.attachmentsFormat
2121
- );
2122
- const cwd = resolveOptionalString(settings.cwd, env, `${target.name} working directory`, {
2123
- allowLiteral: true,
2124
- optionalEnv: true
2125
- });
2126
- const envOverrides = resolveEnvOverrides(settings.env, env, target.name);
2127
- const timeoutMs = resolveTimeoutMs(settings.timeout_seconds ?? settings.timeoutSeconds, `${target.name} timeout`);
2128
- const healthcheck = resolveCliHealthcheck(settings.healthcheck, env, target.name);
2129
- const commandTemplate = resolveString(
2130
- commandTemplateSource,
2131
- env,
2132
- `${target.name} CLI command template`,
2133
- true
2134
- );
2135
- assertSupportedCliPlaceholders(commandTemplate, `${target.name} CLI command template`);
2136
- return {
2137
- commandTemplate,
2138
- filesFormat,
2139
- cwd,
2140
- env: envOverrides,
2141
- timeoutMs,
2142
- healthcheck
2143
- };
2144
- }
2145
- function resolveEnvOverrides(source, env, targetName) {
2146
- if (source === void 0 || source === null) {
2147
- return void 0;
2148
- }
2149
- if (typeof source !== "object" || Array.isArray(source)) {
2150
- throw new Error(`${targetName} env overrides must be an object map of strings`);
2151
- }
2152
- const entries = Object.entries(source);
2153
- const resolved = {};
2154
- for (const [key, value] of entries) {
2155
- if (typeof value !== "string") {
2156
- throw new Error(`${targetName} env override '${key}' must be a string`);
2157
- }
2158
- const resolvedValue = resolveString(value, env, `${targetName} env override '${key}'`);
2159
- resolved[key] = resolvedValue;
2160
- }
2161
- return Object.keys(resolved).length > 0 ? resolved : void 0;
2162
- }
2163
- function resolveTimeoutMs(source, description) {
2164
- const seconds = resolveOptionalNumber(source, `${description} (seconds)`);
2165
- if (seconds === void 0) {
2166
- return void 0;
2167
- }
2168
- if (seconds <= 0) {
2169
- throw new Error(`${description} must be greater than zero seconds`);
2170
- }
2171
- return Math.floor(seconds * 1e3);
2172
- }
2173
- function resolveCliHealthcheck(source, env, targetName) {
2174
- if (source === void 0 || source === null) {
2175
- return void 0;
2176
- }
2177
- if (typeof source !== "object" || Array.isArray(source)) {
2178
- throw new Error(`${targetName} healthcheck must be an object`);
2179
- }
2180
- const candidate = source;
2181
- const type = candidate.type;
2182
- const timeoutMs = resolveTimeoutMs(
2183
- candidate.timeout_seconds ?? candidate.timeoutSeconds,
2184
- `${targetName} healthcheck timeout`
2185
- );
2186
- if (type === "http") {
2187
- const url = resolveString(candidate.url, env, `${targetName} healthcheck URL`);
2188
- return {
2189
- type: "http",
2190
- url,
2191
- timeoutMs
2192
- };
2193
- }
2194
- if (type === "command") {
2195
- const commandTemplate = resolveString(
2196
- candidate.command_template ?? candidate.commandTemplate,
2197
- env,
2198
- `${targetName} healthcheck command template`,
2199
- true
2200
- );
2201
- assertSupportedCliPlaceholders(commandTemplate, `${targetName} healthcheck command template`);
2202
- const cwd = resolveOptionalString(candidate.cwd, env, `${targetName} healthcheck cwd`, {
2203
- allowLiteral: true,
2204
- optionalEnv: true
2205
- });
2206
- return {
2207
- type: "command",
2208
- commandTemplate,
2209
- timeoutMs,
2210
- cwd
2211
- };
2212
- }
2213
- throw new Error(`${targetName} healthcheck type must be 'http' or 'command'`);
2214
- }
2215
- function assertSupportedCliPlaceholders(template, description) {
2216
- const placeholders = extractCliPlaceholders(template);
2217
- for (const placeholder of placeholders) {
2218
- if (!CLI_PLACEHOLDERS.has(placeholder)) {
2219
- throw new Error(
2220
- `${description} includes unsupported placeholder '{${placeholder}}'. Supported placeholders: ${Array.from(CLI_PLACEHOLDERS).join(", ")}`
2221
- );
2222
- }
2223
- }
2224
- }
2225
- function extractCliPlaceholders(template) {
2226
- const matches = template.matchAll(/\{([A-Z_]+)\}/g);
2227
- const results = [];
2228
- for (const match of matches) {
2229
- if (match[1]) {
2230
- results.push(match[1]);
2231
- }
2232
- }
2233
- return results;
2234
- }
2235
- function resolveString(source, env, description, allowLiteral = false) {
2236
- const value = resolveOptionalString(source, env, description, {
2237
- allowLiteral,
2238
- optionalEnv: false
2239
- });
2240
- if (value === void 0) {
2241
- throw new Error(`${description} is required`);
2242
- }
2243
- return value;
2244
- }
2245
- function resolveOptionalString(source, env, description, options) {
2246
- if (source === void 0 || source === null) {
2247
- return void 0;
2248
- }
2249
- if (typeof source !== "string") {
2250
- throw new Error(`${description} must be a string`);
2251
- }
2252
- const trimmed = source.trim();
2253
- if (trimmed.length === 0) {
2254
- return void 0;
2255
- }
2256
- const envVarMatch = trimmed.match(/^\$\{\{\s*([A-Z0-9_]+)\s*\}\}$/i);
2257
- if (envVarMatch) {
2258
- const varName = envVarMatch[1];
2259
- const envValue = env[varName];
2260
- if (envValue !== void 0) {
2261
- if (envValue.trim().length === 0) {
2262
- throw new Error(`Environment variable '${varName}' for ${description} is empty`);
2263
- }
2264
- return envValue;
2265
- }
2266
- const optionalEnv = options?.optionalEnv ?? false;
2267
- if (optionalEnv) {
2268
- return void 0;
2269
- }
2270
- throw new Error(`Environment variable '${varName}' required for ${description} is not set`);
2271
- }
2272
- const allowLiteral = options?.allowLiteral ?? false;
2273
- if (!allowLiteral) {
2274
- throw new Error(`${description} must use \${{ VARIABLE_NAME }} syntax for environment variables or be marked as allowing literals`);
2275
- }
2276
- return trimmed;
2277
- }
2278
- function resolveOptionalLiteralString(source) {
2279
- if (source === void 0 || source === null) {
2280
- return void 0;
2281
- }
2282
- if (typeof source !== "string") {
2283
- throw new Error("expected string value");
2284
- }
2285
- const trimmed = source.trim();
2286
- return trimmed.length > 0 ? trimmed : void 0;
2287
- }
2288
- function resolveOptionalNumber(source, description) {
2289
- if (source === void 0 || source === null || source === "") {
2290
- return void 0;
2291
- }
2292
- if (typeof source === "number") {
2293
- return Number.isFinite(source) ? source : void 0;
2294
- }
2295
- if (typeof source === "string") {
2296
- const numeric = Number(source);
2297
- if (Number.isFinite(numeric)) {
2298
- return numeric;
2299
- }
2300
- }
2301
- throw new Error(`${description} must be a number`);
2302
- }
2303
- function resolveOptionalBoolean(source) {
2304
- if (source === void 0 || source === null || source === "") {
2305
- return void 0;
2306
- }
2307
- if (typeof source === "boolean") {
2308
- return source;
2309
- }
2310
- if (typeof source === "string") {
2311
- const lowered = source.trim().toLowerCase();
2312
- if (lowered === "true" || lowered === "1") {
2313
- return true;
2314
- }
2315
- if (lowered === "false" || lowered === "0") {
2316
- return false;
2317
- }
2318
- }
2319
- throw new Error("expected boolean value");
2320
- }
2321
- function resolveOptionalStringArray(source, env, description) {
2322
- if (source === void 0 || source === null) {
2323
- return void 0;
2324
- }
2325
- if (!Array.isArray(source)) {
2326
- throw new Error(`${description} must be an array of strings`);
2327
- }
2328
- if (source.length === 0) {
2329
- return void 0;
2330
- }
2331
- const resolved = [];
2332
- for (let i = 0; i < source.length; i++) {
2333
- const item = source[i];
2334
- if (typeof item !== "string") {
2335
- throw new Error(`${description}[${i}] must be a string`);
2336
- }
2337
- const trimmed = item.trim();
2338
- if (trimmed.length === 0) {
2339
- throw new Error(`${description}[${i}] cannot be empty`);
2340
- }
2341
- const envVarMatch = trimmed.match(/^\$\{\{\s*([A-Z0-9_]+)\s*\}\}$/i);
2342
- if (envVarMatch) {
2343
- const varName = envVarMatch[1];
2344
- const envValue = env[varName];
2345
- if (envValue !== void 0) {
2346
- if (envValue.trim().length === 0) {
2347
- throw new Error(`Environment variable '${varName}' for ${description}[${i}] is empty`);
2348
- }
2349
- resolved.push(envValue);
2350
- continue;
2351
- }
2352
- throw new Error(`Environment variable '${varName}' for ${description}[${i}] is not set`);
2353
- }
2354
- resolved.push(trimmed);
2355
- }
2356
- return resolved.length > 0 ? resolved : void 0;
2357
- }
2358
-
2359
1884
  // src/evaluation/providers/vscode.ts
2360
1885
  import path5 from "node:path";
2361
1886
  import { dispatchAgentSession, dispatchBatchAgent, getSubagentRoot, provisionSubagents } from "subagent";
@@ -2918,7 +2443,6 @@ var CodeEvaluator = class {
2918
2443
  expected_outcome: context.evalCase.expected_outcome,
2919
2444
  reference_answer: context.evalCase.reference_answer,
2920
2445
  candidate_answer: context.candidate,
2921
- system_message: context.promptInputs.systemMessage ?? "",
2922
2446
  guideline_paths: context.evalCase.guideline_paths,
2923
2447
  input_files: context.evalCase.file_paths,
2924
2448
  input_segments: context.evalCase.input_segments
@@ -3160,7 +2684,7 @@ function validateConcurrency(concurrency) {
3160
2684
  // src/evaluation/orchestrator.ts
3161
2685
  async function runEvaluation(options) {
3162
2686
  const {
3163
- testFilePath,
2687
+ testFilePath: evalFilePath,
3164
2688
  repoRoot,
3165
2689
  target,
3166
2690
  targets,
@@ -3179,11 +2703,11 @@ async function runEvaluation(options) {
3179
2703
  onProgress
3180
2704
  } = options;
3181
2705
  const load = loadEvalCases;
3182
- const evalCases = await load(testFilePath, repoRoot, { verbose });
2706
+ const evalCases = await load(evalFilePath, repoRoot, { verbose, evalId });
3183
2707
  const filteredEvalCases = filterEvalCases(evalCases, evalId);
3184
2708
  if (filteredEvalCases.length === 0) {
3185
2709
  if (evalId) {
3186
- throw new Error(`Test case with id '${evalId}' not found in ${testFilePath}`);
2710
+ throw new Error(`Eval case with id '${evalId}' not found in ${evalFilePath}`);
3187
2711
  }
3188
2712
  return [];
3189
2713
  }
@@ -3562,8 +3086,7 @@ async function evaluateCandidate(options) {
3562
3086
  const rawRequest = {
3563
3087
  question: promptInputs.question,
3564
3088
  ...isAgentProvider(provider) ? {} : { guidelines: promptInputs.guidelines },
3565
- guideline_paths: evalCase.guideline_paths,
3566
- system_message: promptInputs.systemMessage ?? ""
3089
+ guideline_paths: evalCase.guideline_paths
3567
3090
  };
3568
3091
  return {
3569
3092
  eval_id: evalCase.id,
@@ -3827,7 +3350,6 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
3827
3350
  question: promptInputs.question,
3828
3351
  ...isAgentProvider(provider) ? {} : { guidelines: promptInputs.guidelines },
3829
3352
  guideline_paths: evalCase.guideline_paths,
3830
- system_message: promptInputs.systemMessage ?? "",
3831
3353
  error: message
3832
3354
  };
3833
3355
  return {